// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Berenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FFMMALGORITHMTHREADPROC_HPP
#define FFMMALGORITHMTHREADPROC_HPP
#include <omp.h>

//
#include "../Utils/FAssert.hpp"
#include "../Utils/FLog.hpp"
#include "../Utils/FTic.hpp"
#include "../Utils/FGlobal.hpp"

#include "../Containers/FBoolArray.hpp"
#include "../Containers/FOctree.hpp"
#include "../Containers/FLightOctree.hpp"
#include "../Containers/FBufferWriter.hpp"
#include "../Containers/FBufferReader.hpp"
#include "../Containers/FMpiBufferWriter.hpp"
#include "../Containers/FMpiBufferReader.hpp"

#include "../Utils/FMpi.hpp"
#include <sys/time.h>

#include "FCoreCommon.hpp"
/**
 * @author Berenger Bramas (berenger.bramas@inria.fr)
 * @class FFmmAlgorithmThreadProc
 * @brief
 * Please read the license
 *
 * This class is a threaded FMM algorithm distributed with MPI.
 * It iterates on a tree and calls the kernels with the appropriate arguments.
 * It uses the inspector-executor model:
 * it iterates on the tree and builds an array, then works in parallel on this array.
 *
 * Of course this class does not deallocate the pointers given as arguments.
 *
 * Threaded & based on the inspector-executor model
 *     schedule(runtime) export OMP_NUM_THREADS=2
 *     export OMPI_CXX=`which g++-4.4`
 *     mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
 *     --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
 *     ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
 */
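//
// A minimal usage sketch (the concrete octree/cell/container/kernel/leaf types here
// are placeholders; any kernel-compatible set works):
//
//   FMpi app(argc, argv);
//   OctreeClass tree(treeHeight, subTreeHeight, boxWidth, boxCenter);
//   // ... insert the local particles into the tree ...
//   KernelClass kernels(/* kernel-specific parameters */);
//   FFmmAlgorithmThreadProc<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>
//       algo(app.global(), &tree, &kernels);
//   algo.execute();   // runs P2M, M2M, M2L, L2L and P2P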
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
class FFmmAlgorithmThreadProc : public FAbstractAlgorithm {
    OctreeClass* const tree;                 //< The octree to work on
    KernelClass** kernels;                   //< The kernels
    const FMpi::FComm& comm;                 //< MPI comm
    typename OctreeClass::Iterator*     iterArray;  //Will be used to store pointers to cells/leafs to work with
    typename OctreeClass::Iterator* iterArrayComm;  //Will be used to store pointers to cells/leafs to send/rcv
    int numberOfLeafs;                          //< The number of leaves
    const int MaxThreads;               //< the max number of thread allowed by openmp
    const int nbProcess;                //< Number of process
    const int idProcess;                //< Id of current process
    const int OctreeHeight;            //< Height of the tree
    /** An interval is the Morton index interval
     * that a proc uses (it holds data in this interval)
     */
    struct Interval{
        MortonIndex leftIndex;
        MortonIndex rightIndex;
    };
    /** My interval */
    Interval*const intervals;
    /** All process intervals */
    Interval*const workingIntervalsPerLevel;
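    // workingIntervalsPerLevel is stored proc-major: the interval of proc p at level l
    // lives at index OctreeHeight * p + l (see getWorkingInterval below); e.g. with
    // OctreeHeight == 5, proc 2 at level 3 maps to slot 5*2 + 3 = 13.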
    /** Get an interval from proc id and level */
    Interval& getWorkingInterval( int level,  int proc){
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }

    const Interval& getWorkingInterval( int level,  int proc) const {
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }

    /** To know if a proc has work at a given level (i.e. it holds cells and is responsible for them) */
    bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const {
        return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex;
    }

    /** Return true if the left cell of idxProc at idxLevel+1 has the same parent as our right cell */
    bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3);
    }

    /** Return true if the right cell of idxProc at idxLevel+1 has the same parent as our left cell */
    bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3);
    }
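    // Note: shifting a Morton index right by 3 bits gives the parent's index one level
    // up (children 40..47 all shift to parent 5), so the two "covers" tests above are
    // simply comparing the parents of the border cells.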

public:
    /** Get current proc interval at level */
    Interval& getWorkingInterval( int level){
        return getWorkingInterval(level, idProcess);
    }

    /** Does the current proc have some work at this level? */
    bool hasWorkAtLevel( int level){
        return idProcess == 0 || (getWorkingInterval(level, idProcess - 1).rightIndex) < (getWorkingInterval(level, idProcess).rightIndex);
    }

    /** The constructor needs the octree and the kernels used for computation
     * @param inTree the octree to work on
     * @param inKernels the kernels to call
     * An assert is raised if one of the arguments is null
     */
    FFmmAlgorithmThreadProc(const FMpi::FComm& inComm, OctreeClass* const inTree, KernelClass* const inKernels)
        : tree(inTree) , kernels(nullptr), comm(inComm), iterArray(nullptr),iterArrayComm(nullptr),numberOfLeafs(0),
          MaxThreads(omp_get_max_threads()), nbProcess(inComm.processCount()), idProcess(inComm.processId()),
          OctreeHeight(tree->getHeight()),intervals(new Interval[inComm.processCount()]),
          workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()])
    {
        FAssertLF(tree, "tree cannot be null");

        this->kernels = new KernelClass*[MaxThreads];
        #pragma omp parallel for schedule(static)
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            this->kernels[idxThread] = new KernelClass(*inKernels);
        }
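        // One kernel clone per thread: a kernel instance may hold mutable scratch
        // state, so sharing a single object across OpenMP threads could race.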
        FLOG(FLog::Controller << "FFmmAlgorithmThreadProc\n");
        FLOG(FLog::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");
    }
    /** Default destructor */
    virtual ~FFmmAlgorithmThreadProc(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;
        delete [] intervals;
        delete [] workingIntervalsPerLevel;
    }

    /**
     * To execute the fmm algorithm
     * Call this function to run the complete algorithm
     */
    void execute(const unsigned operationsToProceed = FFmmNearAndFarFields){
        // Count the leaves
        this->numberOfLeafs = 0;
        {
            Interval myFullInterval;
            {//Building the interval with the first and last leaves (and count the number of leaves)
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex();
                do{
                    ++this->numberOfLeafs;
                } while(octreeIterator.moveRight());
                myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex();
            }
            // Allocate arrays to store the pointers to the cells at a level
            iterArray     = new typename OctreeClass::Iterator[numberOfLeafs];
            iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs];
            FAssertLF(iterArray,     "iterArray     bad alloc");
            FAssertLF(iterArrayComm, "iterArrayComm bad alloc");

            // We gather the leftIndex/rightIndex from each proc
            FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, comm.getComm()),  __LINE__ );

            // Build my intervals for all levels
            std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]);
            // At leaf level we know it is the full interval
            myIntervals[OctreeHeight - 1] = myFullInterval;

            // We can estimate the interval for each level by using the parent/child relation
            for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
                myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3;
                myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3;
            }
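            // E.g. a leaf interval [72, 135] at level h-1 becomes [9, 16] at level h-2,
            // since each >>3 maps a Morton index to its parent's index.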

            // Process 0 uses the estimates as real intervals, but other processes
            // should remove cells that belong to others
            if(idProcess != 0){
                //We test for each level if the process on the left (idProcess-1) owns cells I thought I owned
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                octreeIterator.moveUp();

                // At h-1 the working limit is the parent of the right cell of the proc on the left
                MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3;

                // We check if we have no more work to do
                int nullIntervalFromLevel = 0;

                for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){
                    while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){
                        if( !octreeIterator.moveRight() ){
                            // We cannot move right: we no longer own any cell
                            nullIntervalFromLevel = idxLevel;
                            break;
                        }
                    }
                    // If we are responsible for some cells at this level keep the first index
                    if(nullIntervalFromLevel == 0){
                        myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex();
                        octreeIterator.moveUp();
                        workingLimitAtLevel >>= 3;
                    }
                }
                // In case we are not responsible for any cells we set leftIndex = rightIndex+1 (an empty interval)
                for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){
                    myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1;
                }
            }

            // We gather the per-level leftIndex/rightIndex from each proc
            FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE,
                                            workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, comm.getComm()),  __LINE__ );
        }

        // run;
        if(operationsToProceed & FFmmP2M) bottomPass();

        if(operationsToProceed & FFmmM2M) upwardPass();

        if(operationsToProceed & FFmmM2L) transferPass();

        if(operationsToProceed & FFmmL2L) downardPass();

        if((operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P)) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));


        // delete array
        delete []     iterArray;
        delete [] iterArrayComm;
        iterArray     = nullptr;
        iterArrayComm = nullptr;
    }

private:

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////
    /**
     * P2M Bottom Pass
     * No communication is involved in the P2M.
     * It is similar to the multi-threaded version.
     */
    void bottomPass(){
        FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        typename OctreeClass::Iterator octreeIterator(tree);

        // Copy the ptr to leaves in array
        octreeIterator.gotoBottomLeft();
        int leafs = 0;
        do{
            iterArray[leafs++] = octreeIterator;
        } while(octreeIterator.moveRight());

        FLOG(computationCounter.tic());
#pragma omp parallel
        {
            // Each thread gets its own kernel
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            // Parallel iteration on the leaves
#pragma omp for nowait
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
            }
        }
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** M2M */
    void upwardPass(){
        FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic singleCounter);
        FLOG(FTic parallelCounter);

        // Start from the leaf level (height-1)
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // The proc to send the shared cells to
        // Starting at the proc on the left, this variable will go down to 0
        int currentProcIdToSendTo = (idProcess - 1);

        // There is a maximum of 1 send and 7 receptions
        MPI_Request requests[8];
        MPI_Status status[8];

        MPI_Request requestsSize[8];
        MPI_Status statusSize[8];

        int bufferSize;
        FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7
        std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]);
        int recvBufferSize[7];
        CellClass recvBufferCells[7];

        // The first proc that send to me a cell
        // This variable will go to nbProcess
        int firstProcThatSend = idProcess + 1;
        FLOG(computationCounter.tic());

        // We work from height-1 to 1
        for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
            // Are my cells covered by my neighbors' working intervals, leaving me no more work?
            const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess));
            if(noMoreWorkForMe){
                FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false);
                break;
            }

            // Copy and count ALL the cells (even the ones outside the working interval)
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveUp();
            octreeIterator = avoidGotoLeftIterator;

            int iterMpiRequests       = 0; // The iterator for send/recv requests
            int iterMpiRequestsSize   = 0; // The iterator for the size send/recv requests

            int nbCellsToSkip     = 0; // The number of cells to skip (out of my working interval)
            // Skip all the cells that are out of my working interval
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
                ++nbCellsToSkip;
            }

            // We need to know if we will recv something in order to know if threads skip the last cell
            int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1
            bool hasToReceive = false;
            if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){
                // Find the first proc that may send to me
                while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){
                    firstProcThatSend += 1;
                }
                // Do we have to receive?
                if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){
                    hasToReceive = true;
                    // Threads do not compute the last cell, we will do it once data are received
                    nbCellsForThreads -= 1;
                }
            }
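            // When the children of my right border cell are split across procs, the remote
            // children arrive by MPI; the single section below merges them and performs the
            // final M2M, which is why the parallel loop stops at nbCellsForThreads.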

            FLOG(parallelCounter.tic());
            #pragma omp parallel
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                //This single section posts and receives the comms, then does the M2M associated with them.
                #pragma omp single nowait
                {
                    FLOG(singleCounter.tic());
                    // Master proc never send
                    if(idProcess != 0){
                        // Skip process that have no work at that level
                        while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo)  ){
                            --currentProcIdToSendTo;
                        }
                        // Does the next proc that has work share the parent of my left cell?
                        if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){
                            FAssertLF(nbCellsToSkip != 0);

                            char packageFlags = 0;
                            sendBuffer.write(packageFlags);

                            // Only the right-most cell outside my working interval should be taken into
                            // consideration (at pos nbCellsToSkip-1); the others (x < nbCellsToSkip-1) have already been sent
                            const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild();
                            for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                                // Check if the child exists and is part of my working interval
                                if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
                                    // Add the cell to the buffer
                                    child[idxChild]->serializeUp(sendBuffer);
                                    packageFlags = char(packageFlags | (0x1 << idxChild));
                                }
                            }
                            // Add the flag as first value
                            sendBuffer.writeAt(0,packageFlags);
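                            // packageFlags is a bitmask over the 8 children: bit i set means
                            // child i follows in the buffer (e.g. 0b101 = children 0 and 2).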
                            // Post the message
                            bufferSize = sendBuffer.getSize();
                            MPI_Isend(&bufferSize, 1, MPI_INT, currentProcIdToSendTo,
                                      FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                            MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, currentProcIdToSendTo,
                                      FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                        }
                    }

                    //Post the receives; this data is needed in several parts of the section
                    int nbProcThatSendToMe = 0;

                    if(hasToReceive){
                        // Starting from firstProcThatSend, find every proc whose left border
                        // cell shares the parent of my right border cell
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, MPI_INT,
                                        idProcSource, FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    //Wait For the comms, and do the work
                    // Are we sending or waiting anything?
                    if(iterMpiRequestsSize){
                        FAssertLF(iterMpiRequestsSize <= 8);
                        MPI_Waitall( iterMpiRequestsSize, requestsSize, statusSize);
                    }

                    if(hasToReceive){
                        nbProcThatSendToMe = 0;
                        // Walk the same procs again, now that sizes are known, to post the data receives
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]);
                                MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), recvBufferSize[nbProcThatSendToMe], MPI_PACKED,
                                        idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    //Wait For the comms, and do the work
                    // Are we sending or waiting anything?
                    if(iterMpiRequests){
                        FAssertLF(iterMpiRequests <= 8);
                        MPI_Waitall( iterMpiRequests, requests, status);
                    }

                    // We have received something, so we need to perform the last M2M
                    if( hasToReceive ){
                        FAssertLF(iterMpiRequests != 0);
                        CellClass* currentChild[8];
                        memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*));

                        // Retrieve the data and merge my children with the children from others
                        for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){
                            int packageFlags = int(recvBuffer[idxProc].getValue<char>());
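                            // Decode the sender's bitmask: each set bit marks a serialized
                            // child that follows in the buffer.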

                            int position = 0;
                            while( packageFlags && position < 8){
                                while(!(packageFlags & 0x1)){
                                    packageFlags >>= 1;
                                    ++position;
                                }
                                FAssertLF(!currentChild[position], "Already has a cell here");
                                recvBufferCells[position].deserializeUp(recvBuffer[idxProc]);
                                currentChild[position] = (CellClass*) &recvBufferCells[position];

                                packageFlags >>= 1;
                                ++position;
                            }

                            recvBuffer[idxProc].seek(0);
                        }
                        // Finally compute
                        myThreadkernels->M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, idxLevel);
                        firstProcThatSend += nbProcThatSendToMe - 1;
                    }
                    // Reset buffer
                    sendBuffer.reset();
                    FLOG(singleCounter.tac());
                }//End Of Single section

                // All threads perform the M2M
                #pragma omp for nowait
                for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){
                    myThreadkernels->M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
            }//End of parallel section
            FLOG(parallelCounter.tac());
        }

        FLOG(counterTime.tac());
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.elapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" );
    }
    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////


    void transferPass(){
        FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic sendCounter);
        FLOG(FTic receiveCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic gatherCounter);

        //////////////////////////////////////////////////////////////////
        // First, know what to send to whom
        //////////////////////////////////////////////////////////////////

        // pointer to send
        FVector<typename OctreeClass::Iterator> toSend[nbProcess * OctreeHeight];
        // number of bytes to send to each proc at each level
        long long int*const indexToSend = new long long int[nbProcess * OctreeHeight];
        memset(indexToSend, 0, sizeof(long long int) * nbProcess * OctreeHeight);
        // To know which cells need data from other procs
        FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight];
        memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);

        // All processes tell each other
        // what they will send to whom
        long long int*const globalReceiveMap = new long long  int[nbProcess * nbProcess * OctreeHeight];
        memset(globalReceiveMap, 0, sizeof(long long  int) * nbProcess * nbProcess * OctreeHeight);

        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight);
        #pragma omp parallel
        {
            #pragma omp master
            {
                {
                    FLOG(prepareCounter.tic());

                    std::unique_ptr<typename OctreeClass::Iterator[]> iterArrayLocal(new typename OctreeClass::Iterator[numberOfLeafs]);

                    // To know if a leaf has already been sent to a proc
                    bool*const alreadySent = new bool[nbProcess];
                    memset(alreadySent, 0, sizeof(bool) * nbProcess);

                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.moveDown();
                    typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                    // for each levels
                    for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                        if(!procHasWorkAtLevel(idxLevel, idProcess)){
                            avoidGotoLeftIterator.moveDown();
                            octreeIterator = avoidGotoLeftIterator;
                            continue;
                        }

                        int numberOfCells = 0;

                        while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                            octreeIterator.moveRight();
                        }

                        // for each cells
                        do{
                            iterArrayLocal[numberOfCells] = octreeIterator;
                            ++numberOfCells;
                        } while(octreeIterator.moveRight());
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);

                        // Which cells potentially need data from others and, at the same time,
                        // are potentially needed by others
                        MortonIndex neighborsIndexes[189];
                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                            // Find the M2L neighbors of a cell
                            const int counter = iterArrayLocal[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndexes);

                            memset(alreadySent, false, sizeof(bool) * nbProcess);
                            bool needOther = false;
                            // Test each neighbor to know which ones do not belong to us
                            for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
                                if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex
                                        || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){
                                    int procToReceive = idProcess;
                                    while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){
                                        --procToReceive;
                                    }
                                    while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){
                                        ++procToReceive;
                                    }
                                    // Maybe already sent to that proc?
                                    if( !alreadySent[procToReceive]
                                            && getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh]
                                            && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){

                                        alreadySent[procToReceive] = true;

                                        needOther = true;

                                        toSend[idxLevel * nbProcess + procToReceive].push(iterArrayLocal[idxCell]);
                                        if(indexToSend[idxLevel * nbProcess + procToReceive] == 0){
                                            indexToSend[idxLevel * nbProcess + procToReceive] = sizeof(int);
                                        }
                                        indexToSend[idxLevel * nbProcess + procToReceive] += iterArrayLocal[idxCell].getCurrentCell()->getSavedSizeUp();
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(MortonIndex);
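                                        // Byte accounting: a message is one int (cell count) plus,
                                        // per cell, a MortonIndex and its serialized multipole; the
                                        // total is asserted against the buffer size before the Isend.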
                                    }
                                }
                            }
                            if(needOther){
                                leafsNeedOther[idxLevel]->set(idxCell,true);
                            }
                        }
                    }
                    FLOG(prepareCounter.tac());

                    delete[] alreadySent;
                }

                //////////////////////////////////////////////////////////////////
                // Gather this information
                //////////////////////////////////////////////////////////////////

                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, comm.getComm()),  __LINE__ );
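                // globalReceiveMap is sender-major: entry
                // [sender*nbProcess*OctreeHeight + level*nbProcess + receiver] is the byte
                // count that sender ships to receiver at that level; what proc p sends me
                // at level l is at [p*nbProcess*OctreeHeight + l*nbProcess + idProcess].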
                FLOG(gatherCounter.tac());

                //////////////////////////////////////////////////////////////////
                // Send and receive for real
                //////////////////////////////////////////////////////////////////

                FLOG(sendCounter.tic());
                // Then they can send and receive (because they know what they will receive)
                // To send in an asynchronous way
                MPI_Request*const requests = new MPI_Request[2 * nbProcess * OctreeHeight];
                MPI_Status*const status = new MPI_Status[2 * nbProcess * OctreeHeight];
                int iterRequest = 0;

                for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                        const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                        if(toSendAtProcAtLevel != 0){
                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),toSendAtProcAtLevel);

                            sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));

                            for(int idxLeaf = 0 ; idxLeaf < toSend[idxLevel * nbProcess + idxProc].getSize(); ++idxLeaf){
                                const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex);
                                toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]);
                            }

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel);

                            FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(),
                                             sendBuffer[idxLevel * nbProcess + idxProc]->getSize(),MPI_PACKED, idxProc,
                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                        }

                        const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                        if(toReceiveFromProcAtLevel){
                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),toReceiveFromProcAtLevel);

                            FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
                                             recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), MPI_PACKED,idxProc,
                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                        }
                    }
                }

                //////////////////////////////////////////////////////////////////
                // Wait received data and compute
                //////////////////////////////////////////////////////////////////

                // Wait to receive everything (and to send everything)
                FMpi::MpiAssert(MPI_Waitall(iterRequest, requests, status), __LINE__);

                delete[] requests;
                delete[] status;

                FLOG(sendCounter.tac());
            }//End of Master region

            //////////////////////////////////////////////////////////////////
            // Do M2L
            //////////////////////////////////////////////////////////////////
            #pragma omp single nowait
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.moveDown();
                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // Now we can compute all the data
                // for each levels
                for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    if(!procHasWorkAtLevel(idxLevel, idProcess)){
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;
                        continue;
                    }

                    int numberOfCells = 0;
                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                        octreeIterator.moveRight();
                    }
                    // for each cells
                    do{
                        iterArray[numberOfCells] = octreeIterator;
                        ++numberOfCells;
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    FLOG(computationCounter.tic());
                    {
                        const int chunckSize = FMath::Max(1, numberOfCells/(omp_get_num_threads()*omp_get_num_threads()));
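                        // e.g. with 8 threads and 10000 cells, chunckSize = 10000/64 = 156,
                        // giving ~64 tasks so the runtime can balance uneven M2L costs.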
                        for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){
                            #pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize)
                            {
                                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                                const CellClass* neighbors[343];

                                const int nbCellToCompute = FMath::Min(chunckSize, numberOfCells-idxCell);
                                for(int idxCellToCompute = idxCell ; idxCellToCompute < idxCell+nbCellToCompute ; ++idxCellToCompute){
                                    const int counter = tree->getInteractionNeighbors(neighbors,  iterArray[idxCellToCompute].getCurrentGlobalCoordinate(), idxLevel);
                                    if(counter) myThreadkernels->M2L( iterArray[idxCellToCompute].getCurrentCell() , neighbors, counter, idxLevel);
                                }
                            }
                        }
                    }//End of task spawning
                    #pragma omp taskwait
                    for(int idxThread = 0 ; idxThread < omp_get_num_threads() ; ++idxThread){
                        #pragma omp task  default(none) firstprivate(idxThread) shared(idxLevel)
                        {
                            kernels[idxThread]->finishedLevelM2L(idxLevel);
                        }
                    }
                    #pragma omp taskwait

                    FLOG(computationCounter.tac());
                }
            }
        }


        {
            FLOG(receiveCounter.tic());
            typename OctreeClass::Iterator octreeIterator(tree);
            octreeIterator.moveDown();
            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
            // compute a second time, now with the received data
            // for each level
            for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                if(!procHasWorkAtLevel(idxLevel, idProcess)){
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;
                    continue;
                }

                // put the received data into a temporary tree
                FLightOctree<CellClass> tempTree;
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(recvBuffer[idxLevel * nbProcess + idxProc]){
                        const int toReceiveFromProcAtLevel = recvBuffer[idxLevel * nbProcess + idxProc]->FMpiBufferReader::getValue<int>();
                        for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
                            const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->FMpiBufferReader::getValue<MortonIndex>();
                            CellClass* const newCell = new CellClass;
                            newCell->setMortonIndex(cellIndex);
                            newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]);
                            tempTree.insertCell(cellIndex, idxLevel, newCell);
                        }

                        FAssertLF(globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess] ==
                                recvBuffer[idxLevel * nbProcess + idxProc]->tell());
                    }
                }

                // take cells from our octree only if they are
                // linked to received data
                int numberOfCells = 0;
                int realCellId = 0;

                while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                    octreeIterator.moveRight();
                }
                // for each cells
                do{
                    // copy cells that need data from others
                    if(leafsNeedOther[idxLevel]->get(realCellId++)){
                        iterArray[numberOfCells++] = octreeIterator;
                    }
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                delete leafsNeedOther[idxLevel];
                leafsNeedOther[idxLevel] = nullptr;

                // Compute these cells
                FLOG(computationCounter.tic());
                #pragma omp parallel
                {
                    KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                    MortonIndex neighborsIndex[189];
                    int neighborsPosition[189];
                    const CellClass* neighbors[343];
                    #pragma omp for schedule(static) nowait
                    for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                        // compute indexes
                        memset(neighbors, 0, 343 * sizeof(CellClass*));
                        const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition);

                        int counter = 0;
                        // did we receive this index from someone?
                        for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){
                            if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).leftIndex)
                                    || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndex[idxNeig]){

                                CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel);

                                if(otherCell){
                                    //otherCell->setMortonIndex(neighborsIndex[idxNeig]);
                                    neighbors[ neighborsPosition[idxNeig] ] = otherCell;
                                    ++counter;
                                }
                            }
                        }
                        // need to compute
                        if(counter){
                            myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
                        }
                    }

                    myThreadkernels->finishedLevelM2L(idxLevel);
                }
                FLOG(computationCounter.tac());
            }
            FLOG(receiveCounter.tac());
        }

        for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){
            delete sendBuffer[idxComm];
            delete recvBuffer[idxComm];
        }
        for(int idxComm = 0 ; idxComm < OctreeHeight; ++idxComm){
            delete leafsNeedOther[idxComm];
        }
        delete[] sendBuffer;
        delete[] recvBuffer;
        delete[] indexToSend;
        delete[] leafsNeedOther;
        delete[] globalReceiveMap;


        FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
    }
    //////////////////////////////////////////////////////////////////
    // ---------------- L2L ---------------
    //////////////////////////////////////////////////////////////////

    void downardPass(){ // second L2L
        FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic waitCounter);

        // Start from the leaf level - 1
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.moveDown();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // Max 1 receive and 7 sends (but 7 times the same data)
        MPI_Request*const requests = new MPI_Request[8];
        MPI_Status*const status = new MPI_Status[8];
        MPI_Request*const requestsSize = new MPI_Request[8];
        MPI_Status*const statusSize = new MPI_Status[8];

        const int heightMinusOne = OctreeHeight - 1;

        FMpiBufferWriter sendBuffer(comm.getComm());
        FMpiBufferReader recvBuffer(comm.getComm());

        int righestProcToSendTo   = nbProcess - 1;

        // for each level except the leaf level
        for(int idxLevel = 2 ; idxLevel < heightMinusOne ; ++idxLevel ){
            // If nothing to do in the next level skip the current one
            if(idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess) ){
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;
                continue;
            }

            // Copy all the cells in an array even the one that are out of my working interval
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveDown();
            octreeIterator = avoidGotoLeftIterator;

            // Count the number of cells that are out of my working interval
            int nbCellsToSkip = 0;
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
                nbCellsToSkip += 1;
            }

            // Check if someone will send a cell to me
            bool hasToReceive = false;
            int idxProcToReceive = idProcess - 1;
            if(idProcess != 0 && nbCellsToSkip){
                // Starting from my left neighbor stop at the first proc that has work to do (not null interval)
                while(idxProcToReceive && !procHasWorkAtLevel(idxLevel, idxProcToReceive) ){
                    idxProcToReceive -= 1;
                }
                // Check if we found such a proc and whether it shares a border cell with us
                if(procHasWorkAtLevel(idxLevel, idxProcToReceive) && procCoversMyLeftBorderCell(idxLevel, idxProcToReceive)){
                    hasToReceive = true;
                }
            }
            #pragma omp parallel
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                #pragma omp single nowait
                {
                    FLOG(prepareCounter.tic());
                    int iterRequests = 0;
                    int iterRequestsSize = 0;
                    int recvBufferSize = 0;
                    int sendBufferSize;
                    // Post the receive
                    if(hasToReceive){
                        FMpi::MpiAssert( MPI_Irecv( &recvBufferSize, 1, MPI_INT, idxProcToReceive,
                                                    FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__ );
                    }

                    // We have to be sure that we are not sending if we have no work in the current level
                    if(idProcess != nbProcess - 1 && idProcess < righestProcToSendTo && procHasWorkAtLevel(idxLevel, idProcess)){
                        int idxProcSend = idProcess + 1;
                        int nbMessageSent = 0;
                        // From the proc on the right to righestProcToSendTo, check if we have to send something
                        while(idxProcSend <= righestProcToSendTo && ( !procHasWorkAtLevel(idxLevel+1, idxProcSend) || procCoversMyRightBorderCell(idxLevel, idxProcSend)) ){
                            // We know that if the proc has work at the next level it shares a cell with us, due to the while condition
                            if(procHasWorkAtLevel(idxLevel+1, idxProcSend)){
                                FAssertLF(procCoversMyRightBorderCell(idxLevel, idxProcSend));
                                // If first message then serialize the cell to send
                                if( nbMessageSent == 0 ){
                                    // We send our last cell
                                    iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->serializeDown(sendBuffer);
                                    sendBufferSize = sendBuffer.getSize();
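                                    // The cell is serialized only once; the same buffer is then
                                    // reused for every receiving proc.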
                                }
                                // Post the send message
                                FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, MPI_INT, idxProcSend,
                                                           FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__);
                                FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, idxProcSend,
                                                           FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__);
                                // Inc and check the counter
                                nbMessageSent += 1;
                                FAssertLF(nbMessageSent <= 7);
                            }
                            idxProcSend += 1;
                        }
                        // Next time we will not need to go further than idxProcSend
                        righestProcToSendTo = idxProcSend;
                    }

                    // Finalize the communication
                    if(iterRequestsSize){
                        FLOG(waitCounter.tic());
                        FAssertLF(iterRequestsSize <= 8);
                        FMpi::MpiAssert(MPI_Waitall( iterRequestsSize, requestsSize, statusSize), __LINE__);
                        FLOG(waitCounter.tac());
                    }
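                    // Note: the size message travels on its own tag (TagFmmL2LSize) and is
                    // completed above before the packed payload is received below; the payload
                    // receive is posted with the buffer's full capacity, which is assumed to
                    // be at least recvBufferSize.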

                    if(hasToReceive){
                        FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), recvBuffer.getCapacity(), MPI_PACKED, idxProcToReceive,
                                                    FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__ );
                    }

                    if(iterRequests){
                        FLOG(waitCounter.tic());
                        FAssertLF(iterRequests <= 8);
                        FMpi::MpiAssert(MPI_Waitall( iterRequests, requests, status), __LINE__);
                        FLOG(waitCounter.tac());
                    }

                    // If we received something, proceed with the L2L
                    if(hasToReceive){
                        FAssertLF(iterRequests != 0);
                        // In this case we know that we have to perform the L2L with the last cell
                        // excluded from our working interval, at index nbCellsToSkip-1
                        iterArray[nbCellsToSkip-1].getCurrentCell()->deserializeDown(recvBuffer);
                        myThreadkernels->L2L( iterArray[nbCellsToSkip-1].getCurrentCell() , iterArray[nbCellsToSkip-1].getCurrentChild(), idxLevel);
                    }
                    FLOG(prepareCounter.tac());
                }

                #pragma omp single nowait
                {
                    FLOG(computationCounter.tic());
                }
                // Threads are working on all the cells of our working interval at that level
                #pragma omp for nowait
                for(int idxCell = nbCellsToSkip ; idxCell < totalNbCellsAtLevel ; ++idxCell){
                    myThreadkernels->L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
            }
            FLOG(computationCounter.tac());

            sendBuffer.reset();
            recvBuffer.seek(0);
        }

        delete[] requests;
        delete[] status;

        FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
    }


    /////////////////////////////////////////////////////////////////////////////
    // Direct
    /////////////////////////////////////////////////////////////////////////////
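    /** Flat snapshot of one leaf, following the inspector-executor model:
     * the leaves are copied into an array of LeafData so that the threads can
     * work on contiguous, shape-sorted entries instead of traversing the octree. */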
    struct LeafData{
        FTreeCoordinate coord;      //< Coordinate of the leaf
        CellClass* cell;            //< The cell, used for the L2P
        ContainerClass* targets;    //< Target particles of the leaf
        ContainerClass* sources;    //< Source particles of the leaf
    };


    /** P2P */
    void directPass(const bool p2pEnabled, const bool l2pEnabled){
        FLOG( FLog::Controller.write("\tStart Direct Pass\n").write(FLog::Flush); );
        FLOG( FTic counterTime);
        FLOG( FTic prepareCounter);
        FLOG( FTic gatherCounter);
        FLOG( FTic waitCounter);
        FLOG( FTic computationCounter);
        FLOG( FTic computation2Counter);

        ///////////////////////////////////////////////////
        // Prepare data to send receive
        ///////////////////////////////////////////////////
        FLOG(prepareCounter.tic());

        // To send in an asynchronous way
        MPI_Request requests[2 * nbProcess];
        MPI_Status status[2 * nbProcess];
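        // At most one receive and one send can be in flight per peer, hence 2 * nbProcess requests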
        int iterRequest = 0;
        int nbMessagesToRecv = 0;

        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess);

        /* This is a nbProcess x nbProcess matrix of integers.
         * Let U and V be process ids:
         * globalReceiveMap[U*nbProcess + V] == size of the information needed by V and owned by U
         */
        int*const globalReceiveMap = new int[nbProcess * nbProcess];
        memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess);
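        // After the Allgather below, row U of this matrix is the partsToSend array of process U,
        // so globalReceiveMap[U*nbProcess + V] is the number of bytes U will send to V.
        // For example, with 3 processes, process 1 sizes its receives with
        // globalReceiveMap[0*3 + 1] and globalReceiveMap[2*3 + 1].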

        FBoolArray leafsNeedOther(this->numberOfLeafs);
        int countNeedOther = 0;

        // Octree used to store the particles received from the other processes
        OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() );

        // init
        const int LeafIndex = OctreeHeight - 1;
        const int SizeShape = 3*3*3;
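        // Leaves are grouped into 3*3*3 = 27 "shapes" by their coordinates modulo 3.
        // Two distinct leaves of the same shape are never neighbors, so all the leaves
        // of one shape can be processed concurrently without write conflicts in the P2P.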

        int shapeLeaf[SizeShape];
        memset(shapeLeaf,0,SizeShape*sizeof(int));

        LeafData* const leafsDataArray = new LeafData[this->numberOfLeafs];

        FVector<LeafData> leafsNeedOtherData(countNeedOther);
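        // countNeedOther is still 0 at this point: the vector simply starts empty and
        // is filled once the border leaves have been identified below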

        FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess];
        int partsToSend[nbProcess];
        memset(partsToSend, 0, sizeof(int) * nbProcess);
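        // toSend[p] collects iterators over the leaves whose sources must be shipped to
        // process p; partsToSend[p] accumulates the corresponding number of bytes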

#pragma omp parallel
        {
#pragma omp master // MUST WAIT to fill leafsNeedOther
            if(p2pEnabled){
                // Copy leafs
                {
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
                    int idxLeaf = 0;
                    do{
                        this->iterArray[idxLeaf++] = octreeIterator;
                    } while(octreeIterator.moveRight());
                }

                int alreadySent[nbProcess];

                //Will store the indexes of the neighbors of the current leaf
                MortonIndex indexesNeighbors[26];

                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
                    memset(alreadySent, 0, sizeof(int) * nbProcess);
                    bool needOther = false;
                    //Get the neighbors of the current leaf in indexesNeighbors, and their number in neighCount
                    const int neighCount = (iterArray[idxLeaf].getCurrentGlobalCoordinate()).getNeighborsIndexes(OctreeHeight,indexesNeighbors);
                    //Loop over the neighbor leafs
                    for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){
                        //Test if leaf belongs to someone else (false if it's mine)
                        if(indexesNeighbors[idxNeigh] < (intervals[idProcess].leftIndex) || (intervals[idProcess].rightIndex) < indexesNeighbors[idxNeigh]){
                            needOther = true;

                            // find the proc that will need current leaf
                            int procToReceive = idProcess;
                            while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].leftIndex){
                                --procToReceive; //scroll process "before" current process
                            }

                            while( procToReceive != nbProcess - 1 && (intervals[procToReceive].rightIndex) < indexesNeighbors[idxNeigh]){
                                ++procToReceive;//scroll process "after" current process
                            }
                            //  Test: not already sent && be sure someone holds this interval
                            if( !alreadySent[procToReceive] && intervals[procToReceive].leftIndex <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].rightIndex){

                                alreadySent[procToReceive] = 1;
                                toSend[procToReceive].push( iterArray[idxLeaf] );
                                partsToSend[procToReceive] += iterArray[idxLeaf].getCurrentListSrc()->getSavedSize();
                                partsToSend[procToReceive] += int(sizeof(MortonIndex));
                            }
                        }
                    }

                    if(needOther){ //means that something needs to be sent (or received)
                        leafsNeedOther.set(idxLeaf,true);
                        ++countNeedOther;
                    }
                }

                // The extra int accounts for the number-of-leaves counter written at the
                // head of each send buffer (see the '<<' below)
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(partsToSend[idxProc]){
                        partsToSend[idxProc] += int(sizeof(int));
                    }
                }
            }

#pragma omp barrier

#pragma omp master // nowait
            if(p2pEnabled){
                //Share globalReceiveMap among all processes
                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, comm.getComm()),  __LINE__ );
                FLOG(gatherCounter.tac());

                //Prepare receive
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has something for me
                        //allocate buffer of right size
                        recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]);
                        FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(), MPI_PACKED,
                                                   idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                    }
                }

                nbMessagesToRecv = iterRequest; // all requests posted so far are receives
                // Prepare send
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(toSend[idxProc].getSize() != 0){
                        sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]);
                        // << is equivalent to write().
                        (*sendBuffer[idxProc]) << toSend[idxProc].getSize();
                        for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){
                            (*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex();
                            toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]);
                        }

                        FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize() , MPI_PACKED ,
                                                    idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );

                    }
                }

                delete[] toSend;


                //////////////////////////////////////////////////////////
                // Wait for the sends and receives to complete
                //////////////////////////////////////////////////////////

                // Wait data
                FLOG(waitCounter.tic());
                FMpi::MpiAssert( MPI_Waitall(iterRequest, requests, status), __LINE__ );
                FLOG(waitCounter.tac());

                for(int idxRcv = 0 ; idxRcv < nbMessagesToRecv ; ++idxRcv){
                    const int idxProc = status[idxRcv].MPI_SOURCE;
                    int nbLeaves;
                    (*recvBuffer[idxProc]) >> nbLeaves;
                    for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
                        MortonIndex leafIndex;
                        (*recvBuffer[idxProc]) >> leafIndex;
                        otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore((*recvBuffer[idxProc]));
                    }
                    delete recvBuffer[idxProc];
                    recvBuffer[idxProc] = nullptr;
                }
            }

            ///////////////////////////////////////////////////
            // Prepare data for thread P2P
            ///////////////////////////////////////////////////

#pragma omp single // MUST WAIT!
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();

                // to store which shape each leaf has
                typename OctreeClass::Iterator* const myLeafs = new typename OctreeClass::Iterator[this->numberOfLeafs];
                int*const shapeType = new int[this->numberOfLeafs];

                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
                    myLeafs[idxLeaf] = octreeIterator;

                    const FTreeCoordinate& coord = octreeIterator.getCurrentCell()->getCoordinate();
                    const int shape = (coord.getX()%3)*9 + (coord.getY()%3)*3 + (coord.getZ()%3);
                    shapeType[idxLeaf] = shape;

                    ++shapeLeaf[shape];

                    octreeIterator.moveRight();
                }

                int startPosAtShape[SizeShape];
                startPosAtShape[0] = 0;
                for(int idxShape = 1 ; idxShape < SizeShape ; ++idxShape){
                    startPosAtShape[idxShape] = startPosAtShape[idxShape-1] + shapeLeaf[idxShape-1];
                }
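                // startPosAtShape is an exclusive prefix sum over shapeLeaf: the loop below
                // scatters the leaves into leafsDataArray grouped by shape (a counting sort)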

                int idxInArray = 0;
                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf, ++idxInArray){
                    const int shapePosition = shapeType[idxInArray];

                    leafsDataArray[startPosAtShape[shapePosition]].coord = myLeafs[idxInArray].getCurrentGlobalCoordinate();
                    leafsDataArray[startPosAtShape[shapePosition]].cell = myLeafs[idxInArray].getCurrentCell();
                    leafsDataArray[startPosAtShape[shapePosition]].targets = myLeafs[idxInArray].getCurrentListTargets();
                    leafsDataArray[startPosAtShape[shapePosition]].sources = myLeafs[idxInArray].getCurrentListSrc();
                    if( leafsNeedOther.get(idxLeaf) ) leafsNeedOtherData.push(leafsDataArray[startPosAtShape[shapePosition]]);

                    ++startPosAtShape[shapePosition];
                }

                delete[] shapeType;
                delete[] myLeafs;

                FLOG(prepareCounter.tac());
            }


            //////////////////////////////////////////////////////////
            // Computation P2P that DO NOT need others data
            //////////////////////////////////////////////////////////

            {
#pragma omp single nowait
                {
                    FLOG(computationCounter.tic());
                    int previous = 0;

                    for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
                        const int endAtThisShape = shapeLeaf[idxShape] + previous;
                        const int chunckSize = FMath::Max(1, (endAtThisShape-previous)/(omp_get_num_threads()*omp_get_num_threads()));
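                        // Each shape is split into roughly numThreads^2 tasks so that every
                        // thread can grab several chunks, which smooths the load balance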

                        for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; idxLeafs += chunckSize){
                            const int nbLeavesInTask = FMath::Min(endAtThisShape-idxLeafs, chunckSize);
#pragma omp task default(none) firstprivate(nbLeavesInTask,idxLeafs) //+shared(leafsDataArray)
                            {
                                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                                // There is a maximum of 26 neighbors
                                ContainerClass* neighbors[27];

                                for(int idxTaskLeaf = idxLeafs ; idxTaskLeaf < (idxLeafs + nbLeavesInTask) ; ++idxTaskLeaf){
                                    LeafData& currentIter = leafsDataArray[idxTaskLeaf];
                                    if(l2pEnabled){
                                        myThreadkernels->L2P(currentIter.cell, currentIter.targets);
                                    }
                                    if(p2pEnabled){
                                        // need the current particles and the neighbors' particles
                                        const int counter = tree->getLeafsNeighbors(neighbors, currentIter.coord, LeafIndex);
                                        myThreadkernels->P2P( currentIter.coord,currentIter.targets,
                                                          currentIter.sources, neighbors, counter);
                                    }
                                }
                            }
                        }
                        previous = endAtThisShape;

                        // All tasks of the current shape must finish before the next shape
                        // starts, since leaves of different shapes may be neighbors
#pragma omp taskwait
                    }
                    FLOG(computationCounter.tac());
                }
            }

            // Wait for the communications to finish (and for the previous computation as well)
#pragma omp barrier


            //////////////////////////////////////////////////////////
            // Computation P2P that need others data
            //////////////////////////////////////////////////////////
#pragma omp master
            { FLOG( computation2Counter.tic() ); }

            if(p2pEnabled){
                KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
                // There is a maximum of 26 neighbors
                ContainerClass* neighbors[27];
                MortonIndex indexesNeighbors[27];
                int indexArray[26];
                // Leaves on the border of our interval
                const int nbLeafToProceed = leafsNeedOtherData.getSize();
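                // These border leaves miss some neighbors locally; their neighbor lists are
                // completed with the particles received from the other processes (otherP2Ptree)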

#pragma omp for schedule(static)
                for(int idxLeafs = 0 ; idxLeafs < nbLeafToProceed ; ++idxLeafs){
                    LeafData currentIter = leafsNeedOtherData[idxLeafs];

                    // need the current particles and the neighbors' particles
                    int counter = 0;
                    memset( neighbors, 0, sizeof(ContainerClass*) * 27);

                    // Take possible data
                    const int nbNeigh = currentIter.coord.getNeighborsIndexes(OctreeHeight, indexesNeighbors, indexArray);