#ifndef FFMMALGORITHMTHREADPROC_HPP
#define FFMMALGORITHMTHREADPROC_HPP
// /!\ Please, you must read the license at the bottom of this page

#include "../Utils/FAssertable.hpp"
#include "../Utils/FDebug.hpp"
#include "../Utils/FTrace.hpp"
#include "../Utils/FTic.hpp"
#include "../Utils/FGlobal.hpp"

#include "../Containers/FBoolArray.hpp"
#include "../Containers/FOctree.hpp"
#include "../Containers/FLightOctree.hpp"

#include "../Utils/FMpi.hpp"

#include <omp.h>

/**
* @author Berenger Bramas (berenger.bramas@inria.fr)
* @class FFmmAlgorithmThreadProc
* @brief
* Please read the license
*
 * This class is a threaded FMM algorithm with MPI.
 * It iterates on the tree and calls the kernels with the right arguments.
 * It uses the inspector-executor model:
 * it iterates on the tree and builds an array, then works in parallel on that array.
 *
 * Of course this class does not deallocate the pointers given as arguments.
 *
 * Threaded & based on the inspector-executor model.
* schedule(runtime) export OMP_NUM_THREADS=2
* export OMPI_CXX=`which g++-4.4`
* mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
* --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
* ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
*/
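/* A minimal usage sketch (illustrative only: the octree, kernel and particle setup below
   is assumed to be provided by the application and is not defined in this file; only the
   template parameters and execute() come from this class):

       FMpi app(argc, argv);                         // MPI helper from FMpi.hpp (assumed)
       OctreeClass tree(height, subHeight, boxWidth, boxCenter);
       // ... insert the local particles into the tree ...
       KernelClass kernels;
       FFmmAlgorithmThreadProc<OctreeClass, ParticleClass, CellClass,
                               ContainerClass, KernelClass, LeafClass>
               algo(app.global(), &tree, &kernels);  // app.global(): assumed FMpi::FComm accessor
       algo.execute();
*/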
template<class OctreeClass, class ParticleClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
class FFmmAlgorithmThreadProc : protected FAssertable {

    OctreeClass* const tree;                 //< The octree to work on
    KernelClass** kernels;                   //< The kernels

    typename OctreeClass::Iterator* iterArray;
    int numberOfLeafs;                          //< The number of leaves (also the size of iterArray)

    const int MaxThreads;               //< The maximum number of threads allowed by OpenMP

    const int nbProcess;                //< Number of processes
    const int idProcess;                //< Id of current process

    const int OctreeHeight;

    struct Interval{
        MortonIndex min;
        MortonIndex max;
    };
    Interval*const intervals;
    Interval*const workingIntervalsPerLevel;


    static void mpiassert(const int test, const unsigned line, const char* const message = 0){
        if(test != MPI_SUCCESS){
            printf("[ERROR] Test failled at line %d, result is %d", line, test);
            if(message) printf(", message: %s",message);
            printf("\n");
            fflush(stdout);
            MPI_Abort(MPI_COMM_WORLD, int(line) );
        }
    }
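
    // Working intervals are indexed as workingIntervalsPerLevel[OctreeHeight * proc + level]:
    // each entry stores the [min, max] Morton range owned by that process at that level.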

    Interval& getWorkingInterval(const int level, const int proc){
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }


public:

    Interval& getWorkingInterval(const int level){
        return getWorkingInterval(level, idProcess);
    }
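
    // A process has work at a level only if its interval goes beyond the interval of the
    // previous process (otherwise the previous process owns the shared cells at this level).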

    bool hasWorkAtLevel(const int level){
        return idProcess == 0 || getWorkingInterval(level, idProcess - 1).max < getWorkingInterval(level, idProcess).max;
    }

    /** The constructor needs the octree and the kernels used for computation
      * @param inTree the octree to work on
      * @param inKernels the kernels to call
      * An assertion is raised if one of the arguments is null
      */
    FFmmAlgorithmThreadProc(const FMpi::FComm& comm, OctreeClass* const inTree, KernelClass* const inKernels)
            : tree(inTree) , kernels(0), numberOfLeafs(0),
            MaxThreads(omp_get_max_threads()), nbProcess(comm.processCount()), idProcess(comm.processId()),
            OctreeHeight(tree->getHeight()),intervals(new Interval[comm.processCount()]),
            workingIntervalsPerLevel(new Interval[comm.processCount() * tree->getHeight()]){

        fassert(tree, "tree cannot be null", __LINE__, __FILE__);

        this->kernels = new KernelClass*[MaxThreads];
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            this->kernels[idxThread] = new KernelClass(*inKernels);
        }

        FDEBUG(FDebug::Controller << "FFmmAlgorithmThreadProc\n");
        FDEBUG(FDebug::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");
    }

    /** Default destructor */
    virtual ~FFmmAlgorithmThreadProc(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;

        delete [] intervals;
        delete [] workingIntervalsPerLevel;
    }

    /**
      * To execute the FMM algorithm
      * Call this function to run the complete algorithm
      */
    void execute(){
        FTRACE( FTrace::FFunction functionTrace( __FUNCTION__, "Fmm" , __FILE__ , __LINE__ ) );

        // Count the leaves
        this->numberOfLeafs = 0;
        {
            FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );

            Interval myLastInterval;
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                myLastInterval.min = octreeIterator.getCurrentGlobalIndex();
                do{
                    ++this->numberOfLeafs;
                } while(octreeIterator.moveRight());
                myLastInterval.max = octreeIterator.getCurrentGlobalIndex();
            }
            iterArray = new typename OctreeClass::Iterator[numberOfLeafs];
            fassert(iterArray, "iterArray bad alloc", __LINE__, __FILE__);

            // We get the min/max indexes from every process
            mpiassert( MPI_Allgather( &myLastInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, MPI_COMM_WORLD),  __LINE__ );

            Interval myIntervals[OctreeHeight];
            myIntervals[OctreeHeight - 1] = myLastInterval;
            for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
                myIntervals[idxLevel].min = myIntervals[idxLevel+1].min >> 3;
                myIntervals[idxLevel].max = myIntervals[idxLevel+1].max >> 3;
            }
            if(idProcess != 0){
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                octreeIterator.moveUp();

                MortonIndex currentLimit = intervals[idProcess-1].max >> 3;

                for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 ; --idxLevel){
                    while(octreeIterator.getCurrentGlobalIndex() <= currentLimit){
                        if( !octreeIterator.moveRight() ) break;
                    }
                    myIntervals[idxLevel].min = octreeIterator.getCurrentGlobalIndex();
                    octreeIterator.moveUp();
                    currentLimit >>= 3;
                }
            }

            // We get the min/max indexes from every process
            mpiassert( MPI_Allgather( myIntervals, sizeof(Interval) * OctreeHeight, MPI_BYTE,
                                      workingIntervalsPerLevel, sizeof(Interval) * OctreeHeight, MPI_BYTE, MPI_COMM_WORLD),  __LINE__ );
        }

        // Run the passes
        bottomPass();

        upwardPass();

        downardPass();

        directPass();

        // delete array
        delete [] iterArray;
        iterArray = 0;

    }

    /////////////////////////////////////////////////////////////////////////////
    // Utils functions
    /////////////////////////////////////////////////////////////////////////////
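
    // The helpers below split inSize items into nbProcess contiguous blocks:
    // process p owns the index range [ceil(step * p), ceil(step * (p+1))) with step = inSize/nbProcess.
    // For example, inSize = 10 and nbProcess = 4 gives the blocks [0,3) [3,5) [5,8) [8,10).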

    int getLeft(const int inSize) const {
        const float step = (float(inSize) / nbProcess);
        return int(FMath::Ceil(step * idProcess));
    }

    int getRight(const int inSize) const {
        const float step = (float(inSize) / nbProcess);
        const int res = int(FMath::Ceil(step * (idProcess+1)));
        if(res > inSize) return inSize;
        else return res;
    }

    int getOtherRight(const int inSize,const int other) const {
        const float step = (float(inSize) / nbProcess);
        const int res = int(FMath::Ceil(step * (other+1)));
        if(res > inSize) return inSize;
        else return res;
    }

    int getProc(const int position, const int inSize) const {
        const float step = (float(inSize) / nbProcess);
        return int(position/step);
    }

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /** P2M Bottom Pass */
    void bottomPass(){
        FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
        FDEBUG( FDebug::Controller.write("\tStart Bottom Pass\n").write(FDebug::Flush) );
        FDEBUG(FTic counterTime);

        typename OctreeClass::Iterator octreeIterator(tree);

        // Iterate on the leaves
        octreeIterator.gotoBottomLeft();
        int leafs = 0;
        do{
            iterArray[leafs++] = octreeIterator;
        } while(octreeIterator.moveRight());

        FDEBUG(FTic computationCounter);
        #pragma omp parallel
        {
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            #pragma omp for nowait
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
            }
        }
        FDEBUG(computationCounter.tac());


        FDEBUG( FDebug::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );

    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** M2M */
    void upwardPass(){
        FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
        FDEBUG( FDebug::Controller.write("\tStart Upward Pass\n").write(FDebug::Flush); );
        FDEBUG(FTic counterTime);
        FDEBUG(FTic computationCounter);
        FDEBUG(FTic prepareCounter);
        FDEBUG(FTic waitCounter);

        // Start from the leaf level - 1
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // This variable is the proc responsible
        // for the shared cells
        int sendToProc = idProcess;

        // There are at most 8-1 sends and 8-1 receptions
        MPI_Request requests[14];
        MPI_Status status[14];

        // Maximum data per message is:
        const int recvBufferOffset = 8 * CellClass::SerializedSizeUp + 1;
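        // Layout of one M2M message: byte 0 is a bitmask telling which of the 8 children
        // are present, followed by the serialized children (SerializedSizeUp bytes each).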
        char sendBuffer[recvBufferOffset];
        char recvBuffer[nbProcess * recvBufferOffset];
        CellClass recvBufferCells[8];

        int firstProcThatSend = idProcess + 1;

        // for each level
        for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
            // No more work for me
            if(idProcess != 0
                    && getWorkingInterval((idxLevel+1), idProcess).max <= getWorkingInterval((idxLevel+1), idProcess - 1).max){
                break;
            }

            // copy cells to work with
            int numberOfCells = 0;
            // for each cell
            do{
                iterArray[numberOfCells++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveUp();
            octreeIterator = avoidGotoLeftIterator;

            // We may need to send something
            int iterRequests = 0;
            int cellsToSend = -1;

            while(iterArray[cellsToSend+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).min){
                ++cellsToSend;
            }

            FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );

            FDEBUG(prepareCounter.tic());
            if(idProcess != 0
                    && (getWorkingInterval((idxLevel+1), idProcess).min >>3) <= (getWorkingInterval((idxLevel+1), idProcess - 1).max >>3)){

                char state = 0;
                int idxBuff = 1;

                const CellClass* const* const child = iterArray[cellsToSend].getCurrentChild();
                for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                    if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).min <= child[idxChild]->getMortonIndex() ){
                        child[idxChild]->serializeUp(&sendBuffer[idxBuff]);
                        idxBuff += CellClass::SerializedSizeUp;
                        state |= char(0x1 << idxChild);
                    }
                }
                sendBuffer[0] = state;

                while( sendToProc && iterArray[cellsToSend].getCurrentGlobalIndex() == getWorkingInterval(idxLevel , sendToProc - 1).max){
                    --sendToProc;
                }

                MPI_Isend(sendBuffer, idxBuff, MPI_BYTE, sendToProc, FMpi::TagFmmM2M, MPI_COMM_WORLD, &requests[iterRequests++]);
            }

            // We may need to receive something
            bool hasToReceive = false;
            int endProcThatSend = firstProcThatSend;

            if(idProcess != nbProcess - 1){
                while(firstProcThatSend < nbProcess
                      && getWorkingInterval((idxLevel+1), firstProcThatSend).max < getWorkingInterval((idxLevel+1), idProcess).max){
                    ++firstProcThatSend;
                }

                if(firstProcThatSend < nbProcess &&
                      (getWorkingInterval((idxLevel+1), firstProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3) ){

                    endProcThatSend = firstProcThatSend;

                    while( endProcThatSend < nbProcess &&
                             (getWorkingInterval((idxLevel+1) ,endProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3)){
                        ++endProcThatSend;
                    }


                    if(firstProcThatSend != endProcThatSend){
                        hasToReceive = true;

                        for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc ){

                            MPI_Irecv(&recvBuffer[idxProc * recvBufferOffset], recvBufferOffset, MPI_BYTE, idxProc, FMpi::TagFmmM2M, MPI_COMM_WORLD, &requests[iterRequests++]);
                        }
                    }
                }
            }
            FDEBUG(prepareCounter.tac());
            FTRACE( regionTrace.end() );

            // Compute
            FDEBUG(computationCounter.tic());
            #pragma omp parallel
            {
                const int endIndex = (hasToReceive?numberOfCells-1:numberOfCells);
                KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
                #pragma omp for
                for( int idxCell = cellsToSend + 1 ; idxCell < endIndex ; ++idxCell){
                    myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
            }
            FDEBUG(computationCounter.tac());

            // Are we sending or waiting anything?
            if(iterRequests){
                FDEBUG(waitCounter.tic());
                MPI_Waitall( iterRequests, requests, status);
                FDEBUG(waitCounter.tac());

                // we were receiving data
                if( hasToReceive ){
                    CellClass* currentChild[8];
                    memcpy(currentChild, iterArray[numberOfCells - 1].getCurrentChild(), 8 * sizeof(CellClass*));

                    // retrieve data and merge my children with the children from others
                    for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc){
                        int state = int(recvBuffer[idxProc * recvBufferOffset]);

                        int position = 0;
                        int bufferIndex = 1;
                        while( state && position < 8){
                            while(!(state & 0x1)){
                                state >>= 1;
                                ++position;
                            }

                            fassert(!currentChild[position], "Already has a cell here", __LINE__, __FILE__);

                            recvBufferCells[position].deserializeUp(&recvBuffer[idxProc * recvBufferOffset + bufferIndex]);
                            bufferIndex += CellClass::SerializedSizeUp;

                            currentChild[position] = (CellClass*) &recvBufferCells[position];

                            state >>= 1;
                            ++position;
                        }
                    }

                    // Finally compute
                    FDEBUG(computationCounter.tic());
                    (*kernels[0]).M2M( iterArray[numberOfCells - 1].getCurrentCell() , currentChild, idxLevel);
                    FDEBUG(computationCounter.tac());


                    firstProcThatSend = endProcThatSend - 1;
                }
            }
        }


        FDEBUG( FDebug::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );

    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////

    /** M2L L2L */
    void downardPass(){
        FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );

        { // first M2L
            FTRACE( FTrace::FRegion regionM2LTrace("M2L", __FUNCTION__ , __FILE__ , __LINE__) );
            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
            FDEBUG(FTic counterTime);
            FDEBUG(FTic computationCounter);
            FDEBUG(FTic sendCounter);
            FDEBUG(FTic receiveCounter);
            FDEBUG(FTic prepareCounter);
            FDEBUG(FTic gatherCounter);

            //////////////////////////////////////////////////////////////////
            // First know what to send to whom
            //////////////////////////////////////////////////////////////////

            // pointer to send
            typename OctreeClass::Iterator* toSend[nbProcess * OctreeHeight];
            memset(toSend, 0, sizeof(typename OctreeClass::Iterator*) * nbProcess * OctreeHeight );
            int sizeToSend[nbProcess * OctreeHeight];
            memset(sizeToSend, 0, sizeof(int) * nbProcess * OctreeHeight);
            // index
            int indexToSend[nbProcess * OctreeHeight];
            memset(indexToSend, 0, sizeof(int) * nbProcess * OctreeHeight);
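            // All three arrays are indexed by [idxLevel * nbProcess + idxProc]: toSend holds
            // iterators on the cells to serialize for idxProc at idxLevel, sizeToSend the
            // allocated capacity, and indexToSend the number of entries actually used.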

            // To know if a leaf has already been sent to a proc
            bool alreadySent[nbProcess];

            FBoolArray* leafsNeedOther[OctreeHeight];
            memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);

            {
                FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );
                FDEBUG(prepareCounter.tic());

                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.moveDown();
                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // for each level
                for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    if(idProcess != 0
                            && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        continue;
                    }

                    int numberOfCells = 0;

                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).min){
                        octreeIterator.moveRight();
                    }

                    // for each cell
                    do{
                        iterArray[numberOfCells] = octreeIterator;
                        ++numberOfCells;
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);


                    // Which cells potentially need other data and, at the same time,
                    // are potentially needed by others
                    MortonIndex neighborsIndexes[189];
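                    // (189 = 6^3 - 3^3 is the maximum number of M2L interactions of one cell)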
                    for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                        // Find the M2L neighbors of a cell
                        const int counter = getDistantNeighbors(iterArray[idxCell].getCurrentGlobalCoordinate(),idxLevel,neighborsIndexes);

                        memset(alreadySent, false, sizeof(bool) * nbProcess);
                        bool needOther = false;
                        // Test each neighbor to know which ones do not belong to us
                        for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
                            if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).min
                                    || getWorkingInterval(idxLevel , idProcess).max < neighborsIndexes[idxNeigh]){
                                int procToReceive = idProcess;
                                while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).min ){
                                    --procToReceive;
                                }
                                while( procToReceive != nbProcess -1 && getWorkingInterval(idxLevel , procToReceive).max < neighborsIndexes[idxNeigh]){
                                    ++procToReceive;
                                }
                                // Maybe already sent to that proc?
                                if( !alreadySent[procToReceive]
                                    && getWorkingInterval(idxLevel , procToReceive).min <= neighborsIndexes[idxNeigh]
                                    && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).max){

                                    alreadySent[procToReceive] = true;

                                    needOther = true;

                                    if(indexToSend[idxLevel * nbProcess + procToReceive] ==  sizeToSend[idxLevel * nbProcess + procToReceive]){
                                        const int previousSize = sizeToSend[idxLevel * nbProcess + procToReceive];
                                        sizeToSend[idxLevel * nbProcess + procToReceive] = FMath::Max(int(10*sizeof(typename OctreeClass::Iterator)), int(sizeToSend[idxLevel * nbProcess + procToReceive] * 1.5));
                                        typename OctreeClass::Iterator* temp = toSend[idxLevel * nbProcess + procToReceive];
                                        toSend[idxLevel * nbProcess + procToReceive] = reinterpret_cast<typename OctreeClass::Iterator*>(new char[sizeof(typename OctreeClass::Iterator) * sizeToSend[idxLevel * nbProcess + procToReceive]]);
                                        memcpy(toSend[idxLevel * nbProcess + procToReceive], temp, previousSize * sizeof(typename OctreeClass::Iterator));
                                        delete[] reinterpret_cast<char*>(temp);
                                    }

                                    toSend[idxLevel * nbProcess + procToReceive][indexToSend[idxLevel * nbProcess + procToReceive]++] = iterArray[idxCell];
                                }
                            }
                        }
                        if(needOther){
                            leafsNeedOther[idxLevel]->set(idxCell,true);
                        }

                    }

                }
                FDEBUG(prepareCounter.tac());

            }

            //////////////////////////////////////////////////////////////////
            // Gather this information
            //////////////////////////////////////////////////////////////////

            FDEBUG(gatherCounter.tic());
            // All processes tell each other
            // what they will send to whom
            int globalReceiveMap[nbProcess * nbProcess * OctreeHeight];
            memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess * OctreeHeight);
            mpiassert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_INT, MPI_COMM_WORLD),  __LINE__ );
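            // globalReceiveMap[idxProc * nbProcess * OctreeHeight + idxLevel * nbProcess + idProcess]
            // is the number of cells that idxProc will send to this process at idxLevel.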
            FDEBUG(gatherCounter.tac());


            //////////////////////////////////////////////////////////////////
            // Send and receive for real
            //////////////////////////////////////////////////////////////////

            FDEBUG(sendCounter.tic());
            // Then they can send and receive (because they know what they will receive)
            // To send asynchronously
            MPI_Request requests[2 * nbProcess * OctreeHeight];
            MPI_Status status[2 * nbProcess * OctreeHeight];
            int iterRequest = 0;

            struct CellToSend{
                MortonIndex index;
                char data[CellClass::SerializedSizeUp];
            };

            CellToSend* sendBuffer[nbProcess * OctreeHeight];
            memset(sendBuffer, 0, sizeof(CellClass*) * nbProcess * OctreeHeight);

            CellToSend* recvBuffer[nbProcess * OctreeHeight];
            memset(recvBuffer, 0, sizeof(CellClass*) * nbProcess * OctreeHeight);


            for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    const int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                    if(toSendAtProcAtLevel != 0){
                        sendBuffer[idxLevel * nbProcess + idxProc] = new CellToSend[toSendAtProcAtLevel];

                        for(int idxLeaf = 0 ; idxLeaf < toSendAtProcAtLevel; ++idxLeaf){
                            sendBuffer[idxLevel * nbProcess + idxProc][idxLeaf].index = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
                            toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(sendBuffer[idxLevel * nbProcess + idxProc][idxLeaf].data);
                        }

                        mpiassert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc], toSendAtProcAtLevel * sizeof(CellToSend) , MPI_BYTE ,
                                             idxProc, FMpi::TagLast + idxLevel, MPI_COMM_WORLD, &requests[iterRequest++]) , __LINE__ );
                    }

                    const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                    if(toReceiveFromProcAtLevel){
                        recvBuffer[idxLevel * nbProcess + idxProc] = new CellToSend[toReceiveFromProcAtLevel];

                        mpiassert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc], toReceiveFromProcAtLevel * sizeof(CellToSend), MPI_BYTE,
                                            idxProc, FMpi::TagLast + idxLevel, MPI_COMM_WORLD, &requests[iterRequest++]) , __LINE__ );
                    }
                }
            }
            FDEBUG(sendCounter.tac());

            //////////////////////////////////////////////////////////////////
            // Do M2L
            //////////////////////////////////////////////////////////////////

            {
                FTRACE( FTrace::FRegion regionTrace("Compute", __FUNCTION__ , __FILE__ , __LINE__) );
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.moveDown();
                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // Now we can compute all the data
                // for each level
                for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    if(idProcess != 0
                            && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){

                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        continue;
                    }

                    int numberOfCells = 0;
                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).min){
                        octreeIterator.moveRight();
                    }
                    // for each cell
                    do{
                        iterArray[numberOfCells] = octreeIterator;
                        ++numberOfCells;
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    FDEBUG(computationCounter.tic());
                    #pragma omp parallel
                    {
                        KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                        const CellClass* neighbors[189];
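                        // 189 is the maximum size of the interaction list (6^3 - 3^3)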

                        #pragma omp for  schedule(dynamic) nowait
                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                            const int counter = tree->getDistantNeighbors(neighbors,  iterArray[idxCell].getCurrentGlobalCoordinate(),idxLevel);
                            if(counter) myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
                        }
                    }
                    FDEBUG(computationCounter.tac());
                }
            }

            //////////////////////////////////////////////////////////////////
            // Wait received data and compute
            //////////////////////////////////////////////////////////////////

            // Wait to receive everything (and to send everything)
            MPI_Waitall(iterRequest, requests, status);

            {
                FTRACE( FTrace::FRegion regionTrace("Compute Received data", __FUNCTION__ , __FILE__ , __LINE__) );
                FDEBUG(receiveCounter.tic());
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.moveDown();
                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // compute the second time
                // for each level
                for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    if(idProcess != 0
                            && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){

                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        continue;
                    }

                    // put the received data into a temporary tree
                    FLightOctree tempTree;
                    for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                        const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                        const CellToSend* const cells = recvBuffer[idxLevel * nbProcess + idxProc];
                        for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
                            tempTree.insertCell(cells[idxCell].index, cells[idxCell].data, idxLevel);
                        }
                    }

                    // take cells from our octree only if they are
                    // linked to received data
                    int numberOfCells = 0;
                    int realCellId = 0;

                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).min){
                        octreeIterator.moveRight();
                    }
                    // for each cell
                    do{
                        // copy cells that need data from others
                        if(leafsNeedOther[idxLevel]->get(realCellId++)){
                            iterArray[numberOfCells++] = octreeIterator;
                        }
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    delete leafsNeedOther[idxLevel];
                    leafsNeedOther[idxLevel] = 0;

                    // Compute these cells
                    FDEBUG(computationCounter.tic());
                    #pragma omp parallel
                    {
                        KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                        MortonIndex neighborsIndex[189];
                        const CellClass* neighbors[189];
                        CellClass neighborsData[189];
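                        // Scratch cells: serialized data received from other processes is
                        // deserialized here before being passed to the M2L kernel.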

                        #pragma omp for  schedule(dynamic) nowait
                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                            // compute indexes
                            const int counterNeighbors = getDistantNeighbors(iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel, neighborsIndex);

                            int counter = 0;
                            // did we receive this index from someone?
                            for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){
                                const void* const cellFromOtherProc = tempTree.getCell(neighborsIndex[idxNeig], idxLevel);
                                if(cellFromOtherProc){
                                    neighborsData[counter].deserializeUp(cellFromOtherProc);
                                    neighbors[counter] = &neighborsData[counter];
                                    ++counter;
                                }
                            }
                            // need to compute
                            if(counter){
                                myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
                            }
                        }
                    }
                    FDEBUG(computationCounter.tac());
                }
                FDEBUG(receiveCounter.tac());
            }

            for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){
                delete[] sendBuffer[idxComm];
                delete[] recvBuffer[idxComm];
                delete[] reinterpret_cast<char*>( toSend[idxComm] );
            }

            FDEBUG( FDebug::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
            FDEBUG( FDebug::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
            FDEBUG( FDebug::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
            FDEBUG( FDebug::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" );
            FDEBUG( FDebug::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        }

        //////////////////////////////////////////////////////////////////
        // ---------------- L2L ---------------
        //////////////////////////////////////////////////////////////////

        { // second L2L
            FTRACE( FTrace::FRegion regionTrace("L2L", __FUNCTION__ , __FILE__ , __LINE__) );
            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
            FDEBUG(FTic counterTime);
            FDEBUG(FTic computationCounter);
            FDEBUG(FTic prepareCounter);
            FDEBUG(FTic waitCounter);

            // Start from the leaf level - 1
            typename OctreeClass::Iterator octreeIterator(tree);
            octreeIterator.moveDown();
            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

            MPI_Request requests[nbProcess];
            MPI_Status status[nbProcess];

            const int heightMinusOne = OctreeHeight - 1;

            char sendBuffer[CellClass::SerializedSizeDown];
            char recvBuffer[CellClass::SerializedSizeDown];
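            // At most one cell is exchanged per level: when the children of a cell are split
            // across processes, its owner sends the cell's local expansion to the processes
            // that hold the remaining children.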

            // for each level except the leaf level
            for(int idxLevel = 2 ; idxLevel < heightMinusOne ; ++idxLevel ){
                if(idProcess != 0
                        && getWorkingInterval((idxLevel+1) , idProcess).max <= getWorkingInterval((idxLevel+1) , idProcess - 1).max){

                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    continue;
                }

                // copy cells to work with
                int numberOfCells = 0;
                // for each cell
                do{
                    iterArray[numberOfCells++] = octreeIterator;
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                int firstCellWork = -1;
                while(iterArray[firstCellWork+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).min){
                    ++firstCellWork;
                }

                bool needToRecv = false;
                int iterRequests = 0;

                FDEBUG(prepareCounter.tic());

                // do we need to receive one or zero cells
                if(idProcess != 0
                        && (getWorkingInterval((idxLevel + 1) , idProcess).min >> 3 ) <= (getWorkingInterval((idxLevel+1) , idProcess - 1).max >> 3 ) ){
                    needToRecv = true;

                    MPI_Irecv( recvBuffer, CellClass::SerializedSizeDown, MPI_BYTE, MPI_ANY_SOURCE, FMpi::TagFmmL2L, MPI_COMM_WORLD, &requests[iterRequests++]);
                }


                if(idProcess != nbProcess - 1){
                    int firstProcThatRecv = idProcess + 1;
                    while( firstProcThatRecv < nbProcess &&
                            getWorkingInterval((idxLevel + 1) , firstProcThatRecv).max <= getWorkingInterval((idxLevel+1) , idProcess).max){
                        ++firstProcThatRecv;
                    }

                    int endProcThatRecv = firstProcThatRecv;
                    while( endProcThatRecv < nbProcess &&
                            (getWorkingInterval((idxLevel + 1) , endProcThatRecv).min >> 3) <= (getWorkingInterval((idxLevel+1) , idProcess).max >> 3) ){
                        ++endProcThatRecv;
                    }

                    if(firstProcThatRecv != endProcThatRecv){
                        iterArray[numberOfCells - 1].getCurrentCell()->serializeDown(sendBuffer);
                        for(int idxProc = firstProcThatRecv ; idxProc < endProcThatRecv ; ++idxProc ){

                            MPI_Isend(sendBuffer, CellClass::SerializedSizeDown, MPI_BYTE, idxProc, FMpi::TagFmmL2L, MPI_COMM_WORLD, &requests[iterRequests++]);
                        }
                    }
                }
                FDEBUG(prepareCounter.tac());

                FDEBUG(computationCounter.tic());
                #pragma omp parallel
                {
                    KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
                    #pragma omp for
                    for(int idxCell = firstCellWork + 1 ; idxCell < numberOfCells ; ++idxCell){
                        myThreadkernels.L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                    }
                }
                FDEBUG(computationCounter.tac());

                // are we sending or receiving?
                if(iterRequests){
                    // process
                    FDEBUG(waitCounter.tic());
                    MPI_Waitall( iterRequests, requests, status);
                    FDEBUG(waitCounter.tac());

                    if(needToRecv){
                        // Need to compute
                        FDEBUG(computationCounter.tic());
                        iterArray[firstCellWork].getCurrentCell()->deserializeDown(recvBuffer);
                        kernels[0]->L2L( iterArray[firstCellWork].getCurrentCell() , iterArray[firstCellWork].getCurrentChild(), idxLevel);
                        FDEBUG(computationCounter.tac());
                    }
                }
            }

            FDEBUG( FDebug::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
            FDEBUG( FDebug::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
            FDEBUG( FDebug::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
        }


    }

    /////////////////////////////////////////////////////////////////////////////
    // Direct
    /////////////////////////////////////////////////////////////////////////////

    /** P2P */
    void directPass(){
        FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
        FDEBUG( FDebug::Controller.write("\tStart Direct Pass\n").write(FDebug::Flush); );
        FDEBUG( FTic counterTime);
        FDEBUG( FTic prepareCounter);
        FDEBUG( FTic gatherCounter);
        FDEBUG( FTic waitCounter);