// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Berenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FFMMALGORITHMTHREADPROC_HPP
#define FFMMALGORITHMTHREADPROC_HPP

#include <omp.h>

//
#include "../Utils/FAssert.hpp"
#include "../Utils/FLog.hpp"

#include "../Utils/FTic.hpp"
#include "Utils/FAlgorithmTimers.hpp"

#include "../Utils/FGlobal.hpp"

#include "../Containers/FBoolArray.hpp"
#include "../Containers/FOctree.hpp"
#include "../Containers/FLightOctree.hpp"
#include "../Utils/FEnv.hpp"

#include "../Containers/FBufferWriter.hpp"
#include "../Containers/FBufferReader.hpp"
#include "../Containers/FMpiBufferWriter.hpp"
#include "../Containers/FMpiBufferReader.hpp"

#include "../Utils/FMpi.hpp"
#include <sys/time.h>

#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"

#include <memory>

/**
 * @author Berenger Bramas (berenger.bramas@inria.fr)
 *
 * Please read the license
 *
 * This class is a threaded FMM algorithm distributed using MPI. It iterates on
 * a tree and calls the kernels with the proper arguments. It follows the
 * inspector-executor model: it first iterates over the tree to build an array
 * of cells, then works on that array in parallel.
 *
 * This class does not free the pointers given as arguments.
 *
 * Threaded & based on the inspector-executor model:
 *
 *     schedule(runtime)
 *     export OMP_NUM_THREADS=2
 *     export OMPI_CXX=`which g++-4.4`
 *     mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
 *        --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20
 *        --track-fds=yes ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
 */
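// A minimal usage sketch (the typedefs follow the ScalFmm test drivers and are
// illustrative assumptions, not defined in this header):
//
//   FMpi app(argc, argv);
//   OctreeClass tree(treeHeight, subHeight, boxWidth, boxCenter);
//   // ... insert the particles owned by this process into the tree ...
//   KernelClass kernels(treeHeight, boxWidth, boxCenter);
//   FFmmAlgorithmThreadProc<OctreeClass, CellClass, ContainerClass,
//                           KernelClass, LeafClass> algo(app.global(), &tree, &kernels);
//   algo.execute();   // runs P2M, M2M, M2L, L2L and P2P/L2P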
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThreadProc : public FAbstractAlgorithm, public FAlgorithmTimers {
private:
    OctreeClass* const tree;     ///< The octree to work on
    KernelClass** kernels;       ///< The kernels

    const FMpi::FComm& comm;     ///< MPI comm

    /// Used to store pointers to cells/leafs to work with
    typename OctreeClass::Iterator* iterArray;  
    /// Used to store pointers to cells/leafs to send/rcv
    typename OctreeClass::Iterator* iterArrayComm;

    int numberOfLeafs;           ///< Number of leaves on this process (sizes the iterator arrays)
    const int MaxThreads;        ///< Max number of thread allowed by openmp
    const int nbProcess;         ///< Process count
    const int idProcess;         ///< Current process id
    const int OctreeHeight;      ///< Tree height

    const int userChunkSize;     ///< Chunk size used for the OpenMP dynamic schedules
    const int leafLevelSeparationCriteria; ///< Separation criteria applied at the leaf level

    /** An interval is the morton index interval
     * that a proc uses (i.e. it holds data in this interval) */
    struct Interval{
        MortonIndex leftIndex;
        MortonIndex rightIndex;
    };

    /// Leaf-level interval of every process (gathered once in executeCore)
    Interval*const intervals;
    /// Working interval of every process at every level
    Interval*const workingIntervalsPerLevel;

    /// Get an interval from a process id and tree level
    Interval& getWorkingInterval( int level,  int proc){
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }

    /// Get an interval from a process id and tree level
    const Interval& getWorkingInterval( int level,  int proc) const {
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }

    /// Does \a idxProc have work at the given \a idxLevel?
    /** i.e. does it hold cells it is responsible for? */
    bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const {
        return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex;
    }

    /** True if the left cell of \a idxProc at \a idxLevel+1 shares its parent with our right cell */
    bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3);
    }

    /** True if the right cell of \a idxProc at \a idxLevel+1 shares its parent with our left cell */
    bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3);
    }

public:
    /// Get current process interval at given \a level
    Interval& getWorkingInterval( int level){
        return getWorkingInterval(level, idProcess);
    }

    /// Does the current process have work at this level?
    /// (True when our right bound extends past that of the process on our left.)
    bool hasWorkAtLevel( int level){
        return idProcess == 0 || (getWorkingInterval(level, idProcess - 1).rightIndex) < (getWorkingInterval(level, idProcess).rightIndex);
    }

    /**@brief Constructor
     * @param inComm the MPI communicator to work with
     * @param inTree the octree to work on
     * @param inKernels the kernels to call
     * @param inUserChunkSize the chunk size used for the OpenMP dynamic schedules
     * @param inLeafLevelSeperationCriteria the separation criteria applied at the leaf level
     *
     * An assert is raised if one of the arguments is null.
     */
    FFmmAlgorithmThreadProc(const FMpi::FComm& inComm, OctreeClass* const inTree,
                            KernelClass* const inKernels,
                            const int inUserChunkSize = 10, const int inLeafLevelSeperationCriteria = 1) :
        tree(inTree),
        kernels(nullptr),
        comm(inComm),
        iterArray(nullptr),
        iterArrayComm(nullptr),
        numberOfLeafs(0),
        MaxThreads(FEnv::GetValue("SCALFMM_ALGO_NUM_THREADS",omp_get_max_threads())),
        nbProcess(inComm.processCount()),
        idProcess(inComm.processId()),
        OctreeHeight(tree->getHeight()),
        userChunkSize(inUserChunkSize),
        leafLevelSeparationCriteria(inLeafLevelSeperationCriteria),
        intervals(new Interval[inComm.processCount()]),
        workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) {
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");

        this->kernels = new KernelClass*[MaxThreads];
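        // One kernel clone per thread; the copies are built inside a critical section
        // because the kernel copy constructor is not assumed to be thread-safe.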
        #pragma omp parallel num_threads(MaxThreads)
        {
            #pragma omp critical (InitFFmmAlgorithmThreadProc)
            {
                this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
            }
        }

        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

        FLOG(FLog::Controller << "FFmmAlgorithmThreadProc\n");
        FLOG(FLog::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");
        FLOG(FLog::Controller << "Chunk Size = " << userChunkSize << "\n");
    }

    /// Default destructor
    virtual ~FFmmAlgorithmThreadProc(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;

        delete [] intervals;
        delete [] workingIntervalsPerLevel;
    }

protected:
    /**
     * To execute the fmm algorithm
     * Call this function to run the complete algorithm
     */
    void executeCore(const unsigned operationsToProceed) override {
#ifdef SCALFMM_TRACE_ALGO
        eztrace_resume();
#endif
        // Count the leaves
        this->numberOfLeafs = 0;
        {
            Interval myFullInterval;
            {//Building the interval with the first and last leaves (and count the number of leaves)
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex();
                do{
                    ++this->numberOfLeafs;
                } while(octreeIterator.moveRight());
                myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex();
            }
            // Allocate arrays to store pointers to the cells of a level
            iterArray     = new typename OctreeClass::Iterator[numberOfLeafs];
            iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs];
            FAssertLF(iterArray,     "iterArray     bad alloc");
            FAssertLF(iterArrayComm, "iterArrayComm bad alloc");

            // Gather the leftIndex/rightIndex interval of every process
            // (Interval is a POD struct, hence the raw MPI_BYTE exchange)
            FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, comm.getComm()),  __LINE__ );

            // Build my intervals for all levels
            std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]);
            // At leaf level we know it is the full interval
            myIntervals[OctreeHeight - 1] = myFullInterval;

            // We can estimate the interval for each level by using the parent/child relation
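            // (A Morton index encodes 3 bits per level, so shifting right by 3 maps a
            // cell to its parent: e.g. child index 0b101011 >> 3 == 0b101.)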
            for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
                myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3;
                myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3;
            }

            // Process 0 uses the estimates as real intervals, but other processes
            // should remove cells that belong to others
            if(idProcess != 0){
                // For each level, test whether the process on the left (idProcess-1) owns cells I thought I owned
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                octreeIterator.moveUp();

                // At h-1 the working limit is the parent of the right cell of the proc on the left
                MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3;

                // We check if we have no more work to do
                int nullIntervalFromLevel = 0;

                for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){
                    while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){
                        if( !octreeIterator.moveRight() ){
                            // We cannot move right: we do not own any more cells
                            nullIntervalFromLevel = idxLevel;
                            break;
                        }
                    }
                    // If we are responsible for some cells at this level keep the first index
                    if(nullIntervalFromLevel == 0){
                        myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex();
                        octreeIterator.moveUp();
                        workingLimitAtLevel >>= 3;
                    }
                }
                // In case we are not responsible for any cells we put the leftIndex = rightIndex+1
                for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){
                    myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1;
                }
            }

            // Gather every process' per-level working intervals
            FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE,
                                            workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, comm.getComm()),  __LINE__ );
        }
#ifdef SCALFMM_TRACE_ALGO
        eztrace_enter_event("P2M", EZTRACE_YELLOW);
#endif
        Timers[P2MTimer].tic();
        if(operationsToProceed & FFmmP2M) bottomPass();
        Timers[P2MTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
        eztrace_leave_event();
        eztrace_enter_event("M2M", EZTRACE_PINK);
#endif

        Timers[M2MTimer].tic();
        if(operationsToProceed & FFmmM2M) upwardPass();
        Timers[M2MTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
        eztrace_leave_event();
        eztrace_enter_event("M2L", EZTRACE_GREEN);
#endif

        Timers[M2LTimer].tic();
        if(operationsToProceed & FFmmM2L) transferPass();
        Timers[M2LTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
        eztrace_leave_event();
        eztrace_enter_event("L2L", EZTRACE_PINK);
#endif

        Timers[L2LTimer].tic();
        if(operationsToProceed & FFmmL2L) downardPass();
        Timers[L2LTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
        eztrace_leave_event();
        eztrace_enter_event("L2P+P2P", EZTRACE_BLUE);
#endif

        Timers[NearTimer].tic();
        if( (operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P) ) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));
        Timers[NearTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
        eztrace_leave_event();
        eztrace_stop();
#endif
        // Free the iterator arrays
        delete [] iterArray;
        delete [] iterArrayComm;
        iterArray     = nullptr;
        iterArrayComm = nullptr;
    }

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /**
     * P2M Bottom Pass
     * No communication is involved in the P2M;
     * it is similar to the multi-threaded version.
     */
    void bottomPass(){
        FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        typename OctreeClass::Iterator octreeIterator(tree);

        // Copy the ptr to leaves in array
        octreeIterator.gotoBottomLeft();
        int leafs = 0;
        do{
            iterArray[leafs++] = octreeIterator;
        } while(octreeIterator.moveRight());

        FLOG(computationCounter.tic());
#pragma omp parallel num_threads(MaxThreads)
        {
            // Each thread get its own kernel
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            // Parallel iteration on the leaves
#pragma omp for nowait schedule(dynamic, userChunkSize)
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
            }
        }
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller.flush());
    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** M2M */
    void upwardPass(){
        FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic singleCounter);
        FLOG(FTic parallelCounter);

        // Start from the leaf level (height-1)
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();

        for(int idxLevel = OctreeHeight - 2 ; idxLevel > FAbstractAlgorithm::lowerWorkingLevel-1 ; --idxLevel){
            octreeIterator.moveUp();
        }

        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // The proc to send the shared cells to;
        // starting from the proc on the left, this variable will decrease towards 0
        int currentProcIdToSendTo = (idProcess - 1);

        // There is a maximum of 1 send and 7 receptions
        MPI_Request requests[8];
        MPI_Status status[8];

        MPI_Request requestsSize[8];
        MPI_Status statusSize[8];

        FSize bufferSize;
        FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7
        std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]);
        FSize recvBufferSize[7];
        CellClass recvBufferCells[7];

        // The first proc that send to me a cell
        // This variable will go to nbProcess
        int firstProcThatSend = idProcess + 1;
        FLOG(computationCounter.tic());

        // for each level
        for(int idxLevel = FMath::Min(OctreeHeight - 2, FAbstractAlgorithm::lowerWorkingLevel - 1) ; idxLevel >= FAbstractAlgorithm::upperWorkingLevel ; --idxLevel ){
            // Are my cells covered by my neighbors' working intervals (i.e. no more work for me)?
            const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess));
            if(noMoreWorkForMe){
                FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false);
                break;
            }

            // Copy and count ALL the cells (even the ones outside the working interval)
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveUp();
            octreeIterator = avoidGotoLeftIterator;

            int iterMpiRequests       = 0; // The iterator for send/recv requests
            int iterMpiRequestsSize   = 0; // The iterator for send/recv requests

            int nbCellsToSkip     = 0; // The number of cells that are out of my working interval
            // Skip all the cells that are out of my working interval
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
                ++nbCellsToSkip;
            }

            // We need to know if we will recv something in order to know if threads skip the last cell
            int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1
            bool hasToReceive = false;
            if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){
                // Find the first proc that may send to me
                while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){
                    firstProcThatSend += 1;
                }
                // Do we have to receive?
                if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){
                    hasToReceive = true;
                    // Threads do not compute the last cell, we will do it once data are received
                    nbCellsForThreads -= 1;
                }
            }

            FLOG(parallelCounter.tic());
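            // Communication scheme at this level: if the parent of our leftmost cell is
            // shared with a process on our left, we serialize the children we own and send
            // them left; symmetrically, we may receive children of our rightmost cell from
            // up to 7 processes on our right and perform that border M2M ourselves.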
            #pragma omp parallel num_threads(MaxThreads)
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                //This single section post and receive the comms, and then do the M2M associated with it.
                #pragma omp single nowait
                {
                    FLOG(singleCounter.tic());
                    // The master proc never sends
                    if(idProcess != 0){
                        // Skip process that have no work at that level
                        while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo)  ){
                            --currentProcIdToSendTo;
                        }
                        // Does the next proc that has work share the parent of my left cell?
                        if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){
                            FAssertLF(nbCellsToSkip != 0);
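                            // Message layout (implementation note): one flag byte whose
                            // bit k records whether child k follows, then the serialized
                            // children in increasing child-index order.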

                            char packageFlags = 0;
                            sendBuffer.write(packageFlags);

                            // Only the rightmost cell out of my working interval (at pos
                            // nbCellsToSkip-1) must be considered; the others
                            // (x < nbCellsToSkip-1) have already been sent
                            const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild();
                            for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                                // Check if child exists and it was part of my working interval
                                if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
                                    // Add the cell to the buffer
                                    child[idxChild]->serializeUp(sendBuffer);
                                    packageFlags = char(packageFlags | (0x1 << idxChild));
                                }
                            }
                            // Add the flag as first value
                            sendBuffer.writeAt(0,packageFlags);
                            // Post the message
                            bufferSize = sendBuffer.getSize();
                            MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo,
                                      FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                            FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
                            MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, currentProcIdToSendTo,
                                      FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                        }
                    }

                    // Post receives; the data is needed in several parts of the section
                    int nbProcThatSendToMe = 0;

                    if(hasToReceive){
                        // Starting from firstProcThatSend, find every proc whose left border
                        // cell shares the parent of my right border cell
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, FMpi::GetType(recvBufferSize[nbProcThatSendToMe]),
                                        idProcSource, FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    //Wait For the comms, and do the work
                    // Are we sending or waiting anything?
                    if(iterMpiRequestsSize){
                        FAssertLF(iterMpiRequestsSize <= 8);
                        MPI_Waitall( iterMpiRequestsSize, requestsSize, statusSize);
                    }

                    if(hasToReceive){
                        nbProcThatSendToMe = 0;
                        // Same walk as above, this time posting the receives for the payloads
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]);
535 536
                                FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max());
                                MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_PACKED,
                                        idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    //Wait For the comms, and do the work
                    // Are we sending or waiting anything?
                    if(iterMpiRequests){
                        FAssertLF(iterMpiRequests <= 8);
                        MPI_Waitall( iterMpiRequests, requests, status);
                    }

                    // We received something, so we must perform the last M2M ourselves
                    if( hasToReceive ){
                        FAssertLF(iterMpiRequests != 0);
                        CellClass* currentChild[8];
                        memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*));

                        // Retrieve the data and merge my children with the children from others
                        for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){
                            int packageFlags = int(recvBuffer[idxProc].getValue<char>());

                            int position = 0;
                            int positionToInsert = 0;
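                            // Scan the flag byte bit by bit: each set bit k means the sender
                            // contributed child k; children were serialized in increasing
                            // order, so they fill recvBufferCells sequentially.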
                            while( packageFlags && position < 8){
                                while(!(packageFlags & 0x1)){
                                    packageFlags >>= 1;
                                    ++position;
                                }
                                FAssertLF(positionToInsert < 7);
                                FAssertLF(position < 8);
                                FAssertLF(!currentChild[position], "Already has a cell here");
                                recvBufferCells[positionToInsert].deserializeUp(recvBuffer[idxProc]);
                                currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert];

                                packageFlags >>= 1;
                                position += 1;
                                positionToInsert += 1;
                            }

                            recvBuffer[idxProc].seek(0);
                        }
                        // Finally compute
                        myThreadkernels->M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, idxLevel);
                        firstProcThatSend += nbProcThatSendToMe - 1;
                    }
                    // Reset buffer
                    sendBuffer.reset();
                    FLOG(singleCounter.tac());
                }//End Of Single section

                // All threads process the remaining M2M work
                #pragma omp for nowait schedule(dynamic, userChunkSize)
                for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){
                    myThreadkernels->M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
            }//End of parallel section
            FLOG(parallelCounter.tac());
        }

        FLOG(counterTime.tac());
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.elapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller.flush());
    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward (M2L)
    /////////////////////////////////////////////////////////////////////////////


    void transferPass(){
        FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic sendCounter);
        FLOG(FTic receiveCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic gatherCounter);

        //////////////////////////////////////////////////////////////////
        // First, figure out what to send to whom
        //////////////////////////////////////////////////////////////////

        // Pointers to the cells to send
        std::unique_ptr<FVector<typename OctreeClass::Iterator>[]> toSend(new FVector<typename OctreeClass::Iterator>[nbProcess * OctreeHeight]);
        // Number of bytes to send, indexed by [idxLevel * nbProcess + idxProc]
        long long int*const indexToSend = new long long int[nbProcess * OctreeHeight];
        memset(indexToSend, 0, sizeof(long long int) * nbProcess * OctreeHeight);
        // To know which cells need data from another process
        FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight];
        memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);

        // Every process tells the others what it will send to whom
        long long int*const globalReceiveMap = new long long int[nbProcess * nbProcess * OctreeHeight];
        memset(globalReceiveMap, 0, sizeof(long long int) * nbProcess * nbProcess * OctreeHeight);

        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight);

        #pragma omp parallel num_threads(MaxThreads)
        {
            #pragma omp master
            {
                {
                    FLOG(prepareCounter.tic());

                    std::unique_ptr<typename OctreeClass::Iterator[]> iterArrayLocal(new typename OctreeClass::Iterator[numberOfLeafs]);

                    // To know if a leaf has been already sent to a proc
                    bool*const alreadySent = new bool[nbProcess];
                    memset(alreadySent, 0, sizeof(bool) * nbProcess);

                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.moveDown();

                    for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
                        octreeIterator.moveDown();
                    }

                    typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                    // for each level
                    for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){

                        const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);

                        if(!procHasWorkAtLevel(idxLevel, idProcess)){
                            avoidGotoLeftIterator.moveDown();
                            octreeIterator = avoidGotoLeftIterator;
                            continue;
                        }

                        int numberOfCells = 0;

                        while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                            octreeIterator.moveRight();
                        }

                        // for each cells
                        do{
                            iterArrayLocal[numberOfCells] = octreeIterator;
                            ++numberOfCells;
                        } while(octreeIterator.moveRight());
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);

                        // Which cells potentially need data from others and, at the same
                        // time, are potentially needed by others
697
                        MortonIndex neighborsIndexes[/*189+26+1*/216];
                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                            // Find the M2L neighbors of a cell
                            const int counter = iterArrayLocal[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndexes, separationCriteria);

                            memset(alreadySent, false, sizeof(bool) * nbProcess);
                            bool needOther = false;
                            // Test each neighbor to know which ones do not belong to us
                            for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
                                if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex
                                        || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){
                                    int procToReceive = idProcess;
                                    while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){
                                        --procToReceive;
                                    }
                                    while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){
                                        ++procToReceive;
                                    }
                                    // Maybe already sent to that proc?
                                    if( !alreadySent[procToReceive]
                                            && getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh]
                                            && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){

                                        alreadySent[procToReceive] = true;

                                        needOther = true;

                                        toSend[idxLevel * nbProcess + procToReceive].push(iterArrayLocal[idxCell]);
                                        if(indexToSend[idxLevel * nbProcess + procToReceive] == 0){
                                            indexToSend[idxLevel * nbProcess + procToReceive] = sizeof(int);
                                        }
                                        indexToSend[idxLevel * nbProcess + procToReceive] += iterArrayLocal[idxCell].getCurrentCell()->getSavedSizeUp();
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(MortonIndex);
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(FSize);
                                    }
                                }
                            }
                            if(needOther){
                                leafsNeedOther[idxLevel]->set(idxCell,true);
                            }
                        }
                    }
                    FLOG(prepareCounter.tac());

                    delete[] alreadySent;
                }

                //////////////////////////////////////////////////////////////////
                // Gather this information
                //////////////////////////////////////////////////////////////////

                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, comm.getComm()),  __LINE__ );
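                // After the gather, globalReceiveMap[idxSender*nbProcess*OctreeHeight
                //   + idxLevel*nbProcess + idxReceiver] holds the number of bytes that
                // idxSender will ship to idxReceiver at idxLevel.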
                FLOG(gatherCounter.tac());

                //////////////////////////////////////////////////////////////////
                // Send and receive for real
                //////////////////////////////////////////////////////////////////

                FLOG(sendCounter.tic());
                // Then they can send and receive (because they know what they will receive)
                // To send in an asynchronous way
                std::vector<MPI_Request> requests;
                requests.reserve(2 * nbProcess * OctreeHeight);

                for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                        const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                        if(toSendAtProcAtLevel != 0){
                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),int(toSendAtProcAtLevel));

                            sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));

                            for(int idxLeaf = 0 ; idxLeaf < toSend[idxLevel * nbProcess + idxProc].getSize(); ++idxLeaf){
                                const FSize currentTell = sendBuffer[idxLevel * nbProcess + idxProc]->getSize();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(currentTell);
                                const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex);
                                toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]);
                            }

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel);

//                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() < std::numeric_limits<int>::max());
//                            requests.emplace_back();
//                            FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(),
//                                             int(sendBuffer[idxLevel * nbProcess + idxProc]->getSize()),MPI_PACKED, idxProc,
//                                    FMpi::TagLast + idxLevel*100, comm.getComm(), &requests.back()) , __LINE__ );
                            FMpi::ISendSplit(sendBuffer[idxLevel * nbProcess + idxProc]->data(),
                                    sendBuffer[idxLevel * nbProcess + idxProc]->getSize(), idxProc,
                                    FMpi::TagLast + idxLevel*100, comm, &requests);
                        }

                        const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                        if(toReceiveFromProcAtLevel){
                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),int(toReceiveFromProcAtLevel));

//                            FAssertLF(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity() < std::numeric_limits<int>::max());
//                            requests.emplace_back();
//                            FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
//                                             int(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity()), MPI_PACKED,idxProc,
//                                    FMpi::TagLast + idxLevel*100, comm.getComm(), &requests.back()) , __LINE__ );
                            FMpi::IRecvSplit(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
                                    recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), idxProc,
                                    FMpi::TagLast + idxLevel*100, comm, &requests);
                        }
                    }
                }

                //////////////////////////////////////////////////////////////////
                // Wait received data and compute
                //////////////////////////////////////////////////////////////////

                // Wait to receive everything (and to send everything)
                FMpi::MpiAssert(MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUS_IGNORE), __LINE__);

                FLOG(sendCounter.tac());
            }//End of Master region

            //////////////////////////////////////////////////////////////////
            // Do M2L
            //////////////////////////////////////////////////////////////////

            #pragma omp single nowait
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.moveDown();

                for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
                    octreeIterator.moveDown();
                }

                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // Now we can compute all the data
                // for each level
                for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
                    const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);

                    if(!procHasWorkAtLevel(idxLevel, idProcess)){
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;
                        continue;
                    }

                    int numberOfCells = 0;
                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                        octreeIterator.moveRight();
                    }
                    // for each cells
                    do{
                        iterArray[numberOfCells] = octreeIterator;
                        ++numberOfCells;
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    FLOG(computationCounter.tic());
                    {
855
                        const int chunckSize = userChunkSize;
                        for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){
                            #pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize)
                            {
                                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                                const CellClass* neighbors[342];
                                int neighborPositions[342];

                                const int nbCellToCompute = FMath::Min(chunckSize, numberOfCells-idxCell);
                                for(int idxCellToCompute = idxCell ; idxCellToCompute < idxCell+nbCellToCompute ; ++idxCellToCompute){
                                    const int counter = tree->getInteractionNeighbors(neighbors,  neighborPositions, iterArray[idxCellToCompute].getCurrentGlobalCoordinate(), idxLevel, separationCriteria);
                                    if(counter) myThreadkernels->M2L( iterArray[idxCellToCompute].getCurrentCell() , neighbors, neighborPositions, counter, idxLevel);
                                }
                            }
                        }
                    }//End of task spawning

                    #pragma omp taskwait

                    for(int idxThread = 0 ; idxThread < omp_get_num_threads() ; ++idxThread){
                        #pragma omp task  default(none) firstprivate(idxThread,idxLevel)
                        {
                            kernels[idxThread]->finishedLevelM2L(idxLevel);
                        }
                    }
                    #pragma omp taskwait

                    FLOG(computationCounter.tac());
                }
            }
        }


        {
            FLOG(receiveCounter.tic());
            typename OctreeClass::Iterator octreeIterator(tree);
            octreeIterator.moveDown();

            for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
                octreeIterator.moveDown();
            }

            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
            // Compute a second time, now with the received data
            // for each level
            for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
                const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);

                if(!procHasWorkAtLevel(idxLevel, idProcess)){
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;
                    continue;
                }

                // put the received data into a temporary tree
                FLightOctree<CellClass> tempTree;
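                // (The light octree only indexes the remote cells deserialized below, so
                // the Morton-index lookups later treat local and remote cells alike.)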
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(recvBuffer[idxLevel * nbProcess + idxProc]){
                        const int toReceiveFromProcAtLevel = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<int>();

                        for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
                            const FSize currentTell = recvBuffer[idxLevel * nbProcess + idxProc]->tell();
                            const FSize verifCurrentTell = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<FSize>();
                            FAssertLF(currentTell == verifCurrentTell, currentTell, " ", verifCurrentTell);

                            const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<MortonIndex>();

                            CellClass* const newCell = new CellClass;
                            newCell->setMortonIndex(cellIndex);
                            newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]);

                            tempTree.insertCell(cellIndex, idxLevel, newCell);
                        }

                        FAssertLF(globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess] ==
                                recvBuffer[idxLevel * nbProcess + idxProc]->tell());
                    }
                }

                // take cells from our octree only if they are
                // linked to received data
                int numberOfCells = 0;
                int realCellId = 0;

                while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                    octreeIterator.moveRight();
                }
                // for each cells
                do{
                    // copy cells that need data from others
                    if(leafsNeedOther[idxLevel]->get(realCellId++)){
                        iterArray[numberOfCells++] = octreeIterator;
                    }
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                delete leafsNeedOther[idxLevel];
                leafsNeedOther[idxLevel] = nullptr;

                // Compute these cells
                FLOG(computationCounter.tic());
957
                #pragma omp parallel num_threads(MaxThreads)
958 959
                {
                    KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
960 961
                    MortonIndex neighborsIndex[/*189+26+1*/216];
                    int neighborsPosition[/*189+26+1*/216];
962 963
                    const CellClass* neighbors[342];
                    int neighborPositions[342];
964

965
                    #pragma omp for  schedule(dynamic, userChunkSize) nowait
966 967
                    for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                        // compute indexes
968
                        const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition, separationCriteria);
969 970 971 972 973 974 975 976 977 978

                        int counter = 0;
                        // did we receive this index from someone?
                        for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){
                            if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).leftIndex)
                                    || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndex[idxNeig]){

                                CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel);

                                if(otherCell){
979 980
                                    neighbors[counter] = otherCell;
                                    neighborPositions[counter] = neighborsPosition[idxNeig];
981 982 983 984 985 986
                                    ++counter;
                                }
                            }
                        }
                        // need to compute
                        if(counter){
987
                            myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, neighborPositions, counter, idxLevel);
988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017
                        }
                    }

                    myThreadkernels->finishedLevelM2L(idxLevel);
                }
                FLOG(computationCounter.tac());
            }
            FLOG(receiveCounter.tac());
        }

        for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){
            delete sendBuffer[idxComm];
            delete recvBuffer[idxComm];
        }
        for(int idxComm = 0 ; idxComm < OctreeHeight; ++idxComm){
            delete leafsNeedOther[idxComm];
        }
        delete[] sendBuffer;
        delete[] recvBuffer;
        delete[] indexToSend;
        delete[] leafsNeedOther;
        delete[] globalReceiveMap;


        FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller.flush());

    }

    //////////////////////////////////////////////////////////////////
    // ---------------- L2L ---------------
    //////////////////////////////////////////////////////////////////

    void downardPass(){ // second half of the downward pass: L2L
        FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic waitCounter);

        // Start from leaf level - 1
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.moveDown();

        for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
            octreeIterator.moveDown();
        }

        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // At most 1 receive and 7 sends (but the 7 sends carry the same data)
        MPI_Request*const requests = new MPI_Request[8];
        MPI_Request*const requestsSize = new MPI_Request[8];
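        // Two-phase exchange: the payload size travels first (TagFmmL2LSize)
        // so the receiver can resize its buffer before posting the receive of
        // the packed cell itself (TagFmmL2L).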

        const int heightMinusOne = FAbstractAlgorithm::lowerWorkingLevel - 1;

        FMpiBufferWriter sendBuffer(comm.getComm());
        FMpiBufferReader recvBuffer(comm.getComm());

        int rightestProcToSendTo   = nbProcess - 1;

        // for each level except the leaf level
        for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < heightMinusOne ; ++idxLevel ){
            // If nothing to do in the next level skip the current one
            if(idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess) ){
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;
                continue;
            }

            // Copy all the cells in an array, even the ones that are out of my working interval
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveDown();
            octreeIterator = avoidGotoLeftIterator;

            // Count the number of cells that are out of my working interval
            int nbCellsToSkip = 0;
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
                nbCellsToSkip += 1;
            }

            // Check if someone will send a cell to me
            bool hasToReceive = false;
            int idxProcToReceive = idProcess - 1;
            if(idProcess != 0 && nbCellsToSkip){
                // Starting from my left neighbor stop at the first proc that has work to do (not null interval)
                while(idxProcToReceive && !procHasWorkAtLevel(idxLevel, idxProcToReceive) ){
                    idxProcToReceive -= 1;
                }
                // Check if we found such a proc and that it shares a cell with us on the border
                if(procHasWorkAtLevel(idxLevel, idxProcToReceive) && procCoversMyLeftBorderCell(idxLevel, idxProcToReceive)){
                    hasToReceive = true;
                }
            }
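            // Illustrative sketch (hypothetical distribution): at this level,
            //   proc 0 owns [c0..c4] and proc 1 owns [c4..c7], c4 being shared.
            // Proc 0 holds the up-to-date c4, so proc 1 must receive c4 from
            // proc 0 before applying the L2L to the children of c4 it owns.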

            #pragma omp parallel num_threads(MaxThreads)
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                #pragma omp single nowait
                {
                    FLOG(prepareCounter.tic());
                    int iterRequests = 0;
                    int iterRequestsSize = 0;
                    FSize recvBufferSize = 0;
                    FSize sendBufferSize = 0; // set when the first message is serialized
                    // Post the receive
                    if(hasToReceive){
                        FMpi::MpiAssert( MPI_Irecv( &recvBufferSize, 1, FMpi::GetType(recvBufferSize), idxProcToReceive,
                                                    FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__ );
                    }

                    // We have to be sure that we are not sending if we have no work in the current level
                    if(idProcess != nbProcess - 1 && idProcess < rightestProcToSendTo && procHasWorkAtLevel(idxLevel, idProcess)){
                        int idxProcSend = idProcess + 1;
                        int nbMessageSent = 0;
                        // From the proc on the right to rightestProcToSendTo, check if we have to send something
                        while(idxProcSend <= rightestProcToSendTo && ( !procHasWorkAtLevel(idxLevel+1, idxProcSend) || procCoversMyRightBorderCell(idxLevel, idxProcSend)) ){
                            // We know that if the proc has work at the next level it shares a cell with us due to the while condition
                            if(procHasWorkAtLevel(idxLevel+1, idxProcSend)){
                                FAssertLF(procCoversMyRightBorderCell(idxLevel, idxProcSend));
                                // If first message then serialize the cell to send
                                if( nbMessageSent == 0 ){
                                    // We send our last cell
                                    iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->serializeDown(sendBuffer);
                                    sendBufferSize = sendBuffer.getSize();
                                }
                                // Post the send message
                                FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, FMpi::GetType(sendBufferSize), idxProcSend,
                                                           FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__);
                                FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
                                FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, idxProcSend,
                                                           FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__);
                                // Inc and check the counter
                                nbMessageSent += 1;
                                FAssertLF(nbMessageSent <= 7);
                            }
                            idxProcSend += 1;
                        }
                        // Next time we will not need to go further than idxProcSend
                        rightestProcToSendTo = idxProcSend;
                    }

                    // Finalize the communication
                    if(iterRequestsSize){
                        FLOG(waitCounter.tic());
                        FAssertLF(iterRequestsSize <= 8); // at most 1 size-recv + 7 size-sends
                        FMpi::MpiAssert(MPI_Waitall( iterRequestsSize, requestsSize, MPI_STATUSES_IGNORE), __LINE__);
                        FLOG(waitCounter.tac());
                    }

                    if(hasToReceive){
                        recvBuffer.cleanAndResize(recvBufferSize);
                        FAssertLF(recvBuffer.getCapacity() < std::numeric_limits<int>::max());
                        FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_PACKED, idxProcToReceive,
                                                    FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__ );
                    }

                    if(iterRequests){
                        FLOG(waitCounter.tic());
                        FAssertLF(iterRequests <= 8); // at most 1 payload-recv + 7 payload-sends
                        FMpi::MpiAssert(MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE), __LINE__);
                        FLOG(waitCounter.tac());
                    }

                    // If we received something, apply the L2L with it
                    if(hasToReceive){
                        FAssertLF(iterRequests != 0);
                        // In this case we know that we have to perform the L2L with the last cell
                        // excluded from our working interval, at index nbCellsToSkip-1
                        iterArray[nbCellsToSkip-1].getCurrentCell()->deserializeDown(recvBuffer);
                        myThreadkernels->L2L( iterArray[nbCellsToSkip-1].getCurrentCell() , iterArray[nbCellsToSkip-1].getCurrentChild(), idxLevel);
                    }
                    FLOG(prepareCounter.tac());
                }

                #pragma omp single nowait
                {
                    FLOG(computationCounter.tic());
                }
                // Threads work on all the cells of our working interval at that level
                #pragma omp for nowait  schedule(dynamic, userChunkSize)
                for(int idxCell = nbCellsToSkip ; idxCell < totalNbCellsAtLevel ; ++idxCell){
                    myThreadkernels->L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
            }
            FLOG(computationCounter.tac());

            sendBuffer.reset();   // reuse the same buffers for the next level
            recvBuffer.seek(0);
        }

        delete[] requests;
        delete[] requestsSize;

        FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller.flush());
    }


    /////////////////////////////////////////////////////////////////////////////
    // Direct
    /////////////////////////////////////////////////////////////////////////////
    struct LeafData{
        FTreeCoordinate coord;
        CellClass* cell;
        ContainerClass* targets;
        ContainerClass* sources;
    };


    /** P2P */
    void directPass(const bool p2pEnabled, const bool l2pEnabled){
        FLOG( FLog::Controller.write("\tStart Direct Pass\n").write(FLog::Flush); );
        FLOG( FTic counterTime);
        FLOG( FTic prepareCounter);
        FLOG( FTic gatherCounter);
        FLOG( FTic waitCounter);
        FLOG(FTic computationCounter);
        FLOG(FTic computation2Counter);

        ///////////////////////////////////////////////////
        // Prepare data to send receive
        ///////////////////////////////////////////////////
        FLOG(prepareCounter.tic());


        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess);

        /* This is a nbProcess x nbProcess matrix of integers.
         * Let U and V be process ids:
         * globalReceiveMap[U*nbProcess + V] == size of the information needed by V and owned by U
         */
        FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess];
        memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess);
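        // Example with nbProcess == 2: globalReceiveMap[0*2 + 1] is the number
        // of bytes owned by proc 0 that proc 1 needs, i.e. the size of the
        // message proc 0 will send to proc 1 during this P2P exchange.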

        FBoolArray leafsNeedOther(this->numberOfLeafs);
        int countNeedOther = 0;

        // To store the result
        OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() );

        // init
        const int SizeShape = P2PExclusionClass::SizeShape;

        int shapeLeaf[SizeShape];
        memset(shapeLeaf,0,SizeShape*sizeof(int));

        LeafData* const leafsDataArray = new LeafData[this->numberOfLeafs];

        FVector<LeafData> leafsNeedOtherData(countNeedOther);

        FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess];
        FSize partsToSend[nbProcess];
        memset(partsToSend, 0, sizeof(FSize) * nbProcess);

#pragma omp parallel num_threads(MaxThreads)
        {
#pragma omp master // MUST WAIT to fill leafsNeedOther
            if(p2pEnabled){
                // Copy leafs
                {
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
                    int idxLeaf = 0;
                    do{
                        this->iterArray[idxLeaf++] = octreeIterator;
                    } while(octreeIterator.moveRight());
                }

                int alreadySent[nbProcess];

                // Will store the Morton indexes of the (at most 26) direct neighbors of the current leaf
                MortonIndex indexesNeighbors[26];

                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
                    memset(alreadySent, 0, sizeof(int) * nbProcess);
                    bool needOther = false;
                    //Get the neighbors of current cell in indexesNeighbors, and their number in neighCount
                    const int neighCount = (iterArray[idxLeaf].getCurrentGlobalCoordinate()).getNeighborsIndexes(OctreeHeight,indexesNeighbors);
                    //Loop over the neighbor leafs
                    for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){
                        //Test if leaf belongs to someone else (false if it's mine)
                        if(indexesNeighbors[idxNeigh] < (intervals[idProcess].leftIndex) || (intervals[idProcess].rightIndex) < indexesNeighbors[idxNeigh]){
                            needOther = true;

                            // find the proc that will need current leaf
                            int procToReceive = idProcess;
                            while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].leftIndex){
                                --procToReceive; //scroll process "before" current process
                            }

                            while( procToReceive != nbProcess - 1 && (intervals[procToReceive].rightIndex) < indexesNeighbors[idxNeigh]){
                                ++procToReceive;//scroll process "after" current process
                            }
                            // Test: not already sent && make sure someone holds this interval
                            if( !alreadySent[procToReceive] && intervals[procToReceive].leftIndex <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].rightIndex){

                                alreadySent[procToReceive] = 1;
                                toSend[procToReceive].push( iterArray[idxLeaf] );
                                partsToSend[procToReceive] += iterArray[idxLeaf].getCurrentListSrc()->getSavedSize();
                                partsToSend[procToReceive] += int(sizeof(MortonIndex));
                            }
                        }
                    }

                    if(needOther){ // means that something needs to be sent (or received)
                        leafsNeedOther.set(idxLeaf,true);
                        ++countNeedOther;
                    }
                }

                // Reserve room for the leaf count (an FSize) written at the head of each
                // message below; this seems to be why the extra bytes are mandatory here
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(partsToSend[idxProc]){
                        partsToSend[idxProc] += int(sizeof(FSize));
                    }
                }
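                // Expected message layout (matching the serialization performed later
                // when filling sendBuffer):
                //   [FSize leafCount] { [MortonIndex] [saved sources] } * leafCount
                // partsToSend[idxProc] pre-computes this size to the byte, which the
                // FAssertLF on sendBuffer's final size double-checks.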
            }

#pragma omp barrier

#pragma omp master // nowait
            if(p2pEnabled){
                // Share globalReceiveMap with all processes
                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, FMpi::GetType(*partsToSend),
                                                globalReceiveMap, nbProcess, FMpi::GetType(*partsToSend), comm.getComm()),  __LINE__ );
                FLOG(gatherCounter.tac());

                // To send in an asynchronous way
                std::vector<MPI_Request> requests;
                requests.reserve(2 * nbProcess);
                //Prepare receive
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(globalReceiveMap[idxProc * nbProcess + idProcess]){ // if idxProc has something for me
                        // allocate a buffer of the right size
                        recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]);
                        // FAssertLF(recvBuffer[idxProc]->getCapacity() < std::numeric_limits<int>::max());
                        // requests.emplace_back();
                        // FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), int(recvBuffer[idxProc]->getCapacity()), MPI_PACKED,
                        //                           idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests.back()) , __LINE__ );
                        FMpi::IRecvSplit(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(),
                                         idxProc, FMpi::TagFmmP2P, comm, &requests);
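                        // FMpi::IRecvSplit (and ISendSplit below) presumably chunk
                        // buffers larger than INT_MAX bytes into several requests,
                        // since plain MPI_Irecv/MPI_Isend take int counts (see the
                        // commented-out single-request variants kept above).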
                    }
                }

                const int nbMessagesToRecv = int(requests.size());
                // Prepare send
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(toSend[idxProc].getSize() != 0){
                        sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]);
                        // << is equivalent to write().
                        (*sendBuffer[idxProc]) << toSend[idxProc].getSize();
                        for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){
                            (*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex();
                            toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]);
                        }

                        FAssertLF(sendBuffer[idxProc]->getSize() == globalReceiveMap[idProcess*nbProcess+idxProc]);
                        // FAssertLF(sendBuffer[idxProc]->getSize() < std::numeric_limits<int>::max());
                        // requests.emplace_back();
                        // FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), int(sendBuffer[idxProc]->getSize()) , MPI_PACKED ,
                        //                            idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests.back()) , __LINE__ );
                        FMpi::ISendSplit(sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize(),
                                         idxProc, FMpi::TagFmmP2P, comm, &requests);

                    }
                }

                delete[] toSend;


                //////////////////////////////////////////////////////////
                // Wait for the sends and receives to complete
                //////////////////////////////////////////////////////////