// ===================================================================================
// Logiciel initial: ScalFmm Version 0.5
// Co-auteurs : Olivier Coulaud, Bérenger Bramas.
// Propriétaires : INRIA.
// Copyright © 2011-2012, diffusé sous les termes et conditions d’une licence propriétaire.
// Initial software: ScalFmm Version 0.5
// Co-authors: Olivier Coulaud, Bérenger Bramas.
// Owners: INRIA.
// Copyright © 2011-2012, spread under the terms and conditions of a proprietary license.
// ===================================================================================
#ifndef FFMMALGORITHMTHREADPROC_HPP
#define FFMMALGORITHMTHREADPROC_HPP


#include "../Utils/FAssertable.hpp"
#include "../Utils/FDebug.hpp"
#include "../Utils/FTrace.hpp"
#include "../Utils/FTic.hpp"
#include "../Utils/FGlobal.hpp"

#include "../Containers/FBoolArray.hpp"
#include "../Containers/FOctree.hpp"
#include "../Containers/FLightOctree.hpp"

#include "../Utils/FMpi.hpp"

#include <omp.h>


/**
* @author Berenger Bramas (berenger.bramas@inria.fr)
* @class FFmmAlgorithmThreadProc
* @brief
* Please read the license
*
 * This class is a threaded FMM algorithm with MPI.
 * It iterates on a tree and calls the kernels with the proper arguments.
 * It uses the inspector-executor model:
 * it iterates on the tree and builds an array, then works in parallel on that array.
 *
 * Of course this class does not deallocate the pointers given as arguments.
 *
 * Threaded & based on the inspector-executor model
 *
 * schedule(runtime) export OMP_NUM_THREADS=2
 * export OMPI_CXX=`which g++-4.4`
 * mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
 * --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
 * ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
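 *
 * Minimal usage sketch (illustrative only: the concrete octree/kernel types and
 * the way the FMpi::FComm communicator is obtained depend on the application):
 * @code
 * OctreeClass tree(...);                    // octree filled with the local particles
 * KernelClass kernels(...);                 // kernel instance, cloned once per thread
 * FFmmAlgorithmThreadProc<OctreeClass, ParticleClass, CellClass,
 *                         ContainerClass, KernelClass, LeafClass>
 *     algo(communicator, &tree, &kernels);
 * algo.execute();
 * @endcode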
*/
template<class OctreeClass, class ParticleClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
class FFmmAlgorithmThreadProc : protected FAssertable {


    OctreeClass* const tree;                 //< The octree to work on
    KernelClass** kernels;                   //< The kernels

    typename OctreeClass::Iterator* iterArray;
    int numberOfLeafs;                          //< Number of leaves in the local tree (also the size of iterArray)

    const int MaxThreads;               //< The maximum number of threads allowed by OpenMP

    const int nbProcess;                //< Number of MPI processes
    const int idProcess;                //< Id of current process

    const int OctreeHeight;

    struct Interval{
        MortonIndex min;
        MortonIndex max;
    };
    Interval*const intervals;
    Interval*const workingIntervalsPerLevel;


    Interval& getWorkingInterval(const int level, const int proc){
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }


public:

    Interval& getWorkingInterval(const int level){
        return getWorkingInterval(level, idProcess);
    }

    bool hasWorkAtLevel(const int level){
        return idProcess == 0 || getWorkingInterval(level, idProcess - 1).max < getWorkingInterval(level, idProcess).max;
    }

    /** The constructor needs the octree and the kernels used for computation
      * @param inTree the octree to work on
      * @param inKernels the kernels to call
      * An assert is raised if one of the arguments is null
      */
    FFmmAlgorithmThreadProc(const FMpi::FComm& comm, OctreeClass* const inTree, KernelClass* const inKernels)
        : tree(inTree) , kernels(0), numberOfLeafs(0),
          MaxThreads(omp_get_max_threads()), nbProcess(comm.processCount()), idProcess(comm.processId()),
          OctreeHeight(tree->getHeight()),intervals(new Interval[comm.processCount()]),
          workingIntervalsPerLevel(new Interval[comm.processCount() * tree->getHeight()]){

        fassert(tree, "tree cannot be null", __LINE__, __FILE__);

        this->kernels = new KernelClass*[MaxThreads];
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            this->kernels[idxThread] = new KernelClass(*inKernels);
        }

        FDEBUG(FDebug::Controller << "FFmmAlgorithmThreadProc\n");
        FDEBUG(FDebug::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");
    }

    /** Default destructor */
    virtual ~FFmmAlgorithmThreadProc(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;

        delete [] intervals;
        delete [] workingIntervalsPerLevel;
    }

    /**
      * To execute the fmm algorithm
      * Call this function to run the complete algorithm
      */
    void execute(){
        FTRACE( FTrace::FFunction functionTrace( __FUNCTION__, "Fmm" , __FILE__ , __LINE__ ) );

        // Count the local leaves
        this->numberOfLeafs = 0;
        {
            FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );

            Interval myLastInterval;
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                myLastInterval.min = octreeIterator.getCurrentGlobalIndex();
                do{
                    ++this->numberOfLeafs;
                } while(octreeIterator.moveRight());
                myLastInterval.max = octreeIterator.getCurrentGlobalIndex();
            }
            iterArray = new typename OctreeClass::Iterator[numberOfLeafs];
            fassert(iterArray, "iterArray bad alloc", __LINE__, __FILE__);

            // Gather the leaf-level min/max Morton indexes from every process
            FMpi::MpiAssert( MPI_Allgather( &myLastInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, MPI_COMM_WORLD),  __LINE__ );
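            // After the gather, intervals[p] holds the leaf-level [min,max] Morton range owned by process p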

            Interval myIntervals[OctreeHeight];
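            // A parent's Morton index is its child's index shifted right by 3 bits,
            // so each level's interval is deduced from the level below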
            myIntervals[OctreeHeight - 1] = myLastInterval;
            for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
                myIntervals[idxLevel].min = myIntervals[idxLevel+1].min >> 3;
                myIntervals[idxLevel].max = myIntervals[idxLevel+1].max >> 3;
            }
            if(idProcess != 0){
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                octreeIterator.moveUp();
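                // A cell shared with the previous process belongs to the lower rank:
                // advance this process's working minimum past the ancestors of that
                // process's last leaf, level by level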

                MortonIndex currentLimit = intervals[idProcess-1].max >> 3;

                for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 ; --idxLevel){
                    while(octreeIterator.getCurrentGlobalIndex() <= currentLimit){
                        if( !octreeIterator.moveRight() ) break;
                    }
                    myIntervals[idxLevel].min = octreeIterator.getCurrentGlobalIndex();
                    octreeIterator.moveUp();
                    currentLimit >>= 3;
                }
            }

            // Gather every process's working intervals for all levels
            FMpi::MpiAssert( MPI_Allgather( myIntervals, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE,
                                            workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, MPI_COMM_WORLD),  __LINE__ );
        }

        // Run the FMM passes
        bottomPass();

        upwardPass();

        transferPass();

        downardPass();

        directPass();

        // delete array
        delete [] iterArray;
        iterArray = 0;


    }

private:

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /** P2M Bottom Pass */
    void bottomPass(){
        FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
        FDEBUG( FDebug::Controller.write("\tStart Bottom Pass\n").write(FDebug::Flush) );
        FDEBUG(FTic counterTime);

        typename OctreeClass::Iterator octreeIterator(tree);

        // Iterate on leafs
        octreeIterator.gotoBottomLeft();
        int leafs = 0;
        do{
            iterArray[leafs++] = octreeIterator;
        } while(octreeIterator.moveRight());

        FDEBUG(FTic computationCounter);
#pragma omp parallel
        {
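            // One kernel clone per thread (allocated in the constructor), so P2M needs no locking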
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
#pragma omp for nowait
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
            }
        }
        FDEBUG(computationCounter.tac());


        FDEBUG( FDebug::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );

    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** M2M */
    void upwardPass(){
        FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
        FDEBUG( FDebug::Controller.write("\tStart Upward Pass\n").write(FDebug::Flush); );
        FDEBUG(FTic counterTime);
        FDEBUG(FTic computationCounter);
        FDEBUG(FTic prepareCounter);
        FDEBUG(FTic waitCounter);

        // Start from the leaf level - 1
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // This variable is the process responsible
        // for the shared cells
        int sendToProc = idProcess;

        // There is a maximum of 8-1 sends and 8-1 receptions
        MPI_Request requests[14];
        MPI_Status status[14];

        // Maximum data per message: 8 serialized children plus one state byte
        const int recvBufferOffset = 8 * CellClass::SerializedSizeUp + 1;
        char sendBuffer[recvBufferOffset];
        char recvBuffer[nbProcess * recvBufferOffset];
        CellClass recvBufferCells[8];

        int firstProcThatSend = idProcess + 1;

        // for each level, from the bottom up
        for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
            // No more work for me
            if(idProcess != 0
                    && getWorkingInterval((idxLevel+1), idProcess).max <= getWorkingInterval((idxLevel+1), idProcess - 1).max){
                break;
            }

            // copy cells to work with
            int numberOfCells = 0;
            // for each cell
            do{
                iterArray[numberOfCells++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveUp();
            octreeIterator = avoidGotoLeftIterator;

            // We may need to send something
            int iterRequests = 0;
            int cellsToSend = -1;

            while(iterArray[cellsToSend+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).min){
                ++cellsToSend;
            }
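            // iterArray[cellsToSend] is now the last cell below this process's working
            // interval: it is owned by a lower-rank process, so the children this
            // process holds for it are sent there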

            FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );

            FDEBUG(prepareCounter.tic());
            if(idProcess != 0
                    && (getWorkingInterval((idxLevel+1), idProcess).min >>3) <= (getWorkingInterval((idxLevel+1), idProcess - 1).max >>3)){

                char state = 0;
                int idxBuff = 1;
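                // Message layout: byte 0 is a bitmask, bit i set means child i's
                // serialized data follows in the buffer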

                const CellClass* const* const child = iterArray[cellsToSend].getCurrentChild();
                for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                    if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).min <= child[idxChild]->getMortonIndex() ){
                        child[idxChild]->serializeUp(&sendBuffer[idxBuff]);
                        idxBuff += CellClass::SerializedSizeUp;
                        state = char(state | (0x1 << idxChild));
                    }
                }
                sendBuffer[0] = state;

                while( sendToProc && iterArray[cellsToSend].getCurrentGlobalIndex() == getWorkingInterval(idxLevel , sendToProc - 1).max){
                    --sendToProc;
                }

                MPI_Isend(sendBuffer, idxBuff, MPI_BYTE, sendToProc, FMpi::TagFmmM2M, MPI_COMM_WORLD, &requests[iterRequests++]);
            }

            // We may need to receive something
            bool hasToReceive = false;
            int endProcThatSend = firstProcThatSend;

            if(idProcess != nbProcess - 1){
                while(firstProcThatSend < nbProcess
                      && getWorkingInterval((idxLevel+1), firstProcThatSend).max < getWorkingInterval((idxLevel+1), idProcess).max){
                    ++firstProcThatSend;
                }

                if(firstProcThatSend < nbProcess &&
                        (getWorkingInterval((idxLevel+1), firstProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3) ){

                    endProcThatSend = firstProcThatSend;

                    while( endProcThatSend < nbProcess &&
                           (getWorkingInterval((idxLevel+1) ,endProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3)){
                        ++endProcThatSend;
                    }


                    if(firstProcThatSend != endProcThatSend){
                        hasToReceive = true;

                        for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc ){

                            MPI_Irecv(&recvBuffer[idxProc * recvBufferOffset], recvBufferOffset, MPI_BYTE, idxProc, FMpi::TagFmmM2M, MPI_COMM_WORLD, &requests[iterRequests++]);
                        }
                    }
                }
            }
            FDEBUG(prepareCounter.tac());
            FTRACE( regionTrace.end() );

            // Compute
            FDEBUG(computationCounter.tic());
#pragma omp parallel
            {
                const int endIndex = (hasToReceive?numberOfCells-1:numberOfCells);
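                // If the last cell also receives children from higher-rank processes,
                // its M2M is postponed until those messages have arrived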
                KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
#pragma omp for
                for( int idxCell = cellsToSend + 1 ; idxCell < endIndex ; ++idxCell){
                    myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
            }
            FDEBUG(computationCounter.tac());

            // Are we sending or waiting anything?
            if(iterRequests){
                FDEBUG(waitCounter.tic());
                MPI_Waitall( iterRequests, requests, status);
                FDEBUG(waitCounter.tac());

                // we were receiving data
                if( hasToReceive ){
                    CellClass* currentChild[8];
                    memcpy(currentChild, iterArray[numberOfCells - 1].getCurrentChild(), 8 * sizeof(CellClass*));

                    // retrieve data and merge my children with the children received from others
                    for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc){
                        int state = int(recvBuffer[idxProc * recvBufferOffset]);

                        int position = 0;
                        int bufferIndex = 1;
                        while( state && position < 8){
                            while(!(state & 0x1)){
                                state >>= 1;
                                ++position;
                            }

                            fassert(!currentChild[position], "Already has a cell here", __LINE__, __FILE__);

                            recvBufferCells[position].deserializeUp(&recvBuffer[idxProc * recvBufferOffset + bufferIndex]);
                            bufferIndex += CellClass::SerializedSizeUp;

                            currentChild[position] = (CellClass*) &recvBufferCells[position];

                            state >>= 1;
                            ++position;
                        }
                    }

                    // Finally compute
                    FDEBUG(computationCounter.tic());
                    (*kernels[0]).M2M( iterArray[numberOfCells - 1].getCurrentCell() , currentChild, idxLevel);
                    FDEBUG(computationCounter.tac());


                    firstProcThatSend = endProcThatSend - 1;
                }
            }
        }


        FDEBUG( FDebug::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );

    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////

    /** M2L  */
    void transferPass(){
        FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );

        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
        FDEBUG(FTic counterTime);
        FDEBUG(FTic computationCounter);
        FDEBUG(FTic sendCounter);
        FDEBUG(FTic receiveCounter);
        FDEBUG(FTic prepareCounter);
        FDEBUG(FTic gatherCounter);

        //////////////////////////////////////////////////////////////////
        // First, determine what to send to whom
        //////////////////////////////////////////////////////////////////

        // Pointers to the cells to send, per (level, destination process) pair
        typename OctreeClass::Iterator* toSend[nbProcess * OctreeHeight];
        memset(toSend, 0, sizeof(typename OctreeClass::Iterator*) * nbProcess * OctreeHeight );
        int sizeToSend[nbProcess * OctreeHeight];
        memset(sizeToSend, 0, sizeof(int) * nbProcess * OctreeHeight);
        // index
        int indexToSend[nbProcess * OctreeHeight];
        memset(indexToSend, 0, sizeof(int) * nbProcess * OctreeHeight);

        // To know if a cell has already been added to a given process's send list
        bool alreadySent[nbProcess];

        FBoolArray* leafsNeedOther[OctreeHeight];
        memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);

        {
            FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );
            FDEBUG(prepareCounter.tic());

            typename OctreeClass::Iterator octreeIterator(tree);
            octreeIterator.moveDown();
            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
            // for each level
            for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                if(idProcess != 0
                        && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    continue;
                }

                int numberOfCells = 0;

                while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).min){
                    octreeIterator.moveRight();
                }

                // for each cell
                do{
                    iterArray[numberOfCells] = octreeIterator;
                    ++numberOfCells;
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);


                // Which cells potentially need data from other processes and, at the
                // same time, are potentially needed by them
                int neighborsPosition[189];
                MortonIndex neighborsIndexes[189];
                for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                    // Find the M2L neighbors of a cell
                    const int counter = getInteractionNeighbors(iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel, neighborsIndexes, neighborsPosition);

                    memset(alreadySent, false, sizeof(bool) * nbProcess);
                    bool needOther = false;
                    // Test each neighbor to know which ones do not belong to us
                    for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
                        if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).min
                                || getWorkingInterval(idxLevel , idProcess).max < neighborsIndexes[idxNeigh]){
                            int procToReceive = idProcess;
                            while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).min ){
                                --procToReceive;
                            }
                            while( procToReceive != nbProcess -1 && getWorkingInterval(idxLevel , procToReceive).max < neighborsIndexes[idxNeigh]){
                                ++procToReceive;
                            }
                            // Maybe already sent to that proc?
                            if( !alreadySent[procToReceive]
                                    && getWorkingInterval(idxLevel , procToReceive).min <= neighborsIndexes[idxNeigh]
                                    && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).max){

                                alreadySent[procToReceive] = true;

                                needOther = true;
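                                // Grow the (level, destination) send list (roughly x1.5)
                                // when it is full, then append the current cell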

                                if(indexToSend[idxLevel * nbProcess + procToReceive] ==  sizeToSend[idxLevel * nbProcess + procToReceive]){
                                    const int previousSize = sizeToSend[idxLevel * nbProcess + procToReceive];
                                    sizeToSend[idxLevel * nbProcess + procToReceive] = FMath::Max(int(10*sizeof(typename OctreeClass::Iterator)), int(sizeToSend[idxLevel * nbProcess + procToReceive] * 1.5));
                                    typename OctreeClass::Iterator* temp = toSend[idxLevel * nbProcess + procToReceive];
                                    toSend[idxLevel * nbProcess + procToReceive] = reinterpret_cast<typename OctreeClass::Iterator*>(new char[sizeof(typename OctreeClass::Iterator) * sizeToSend[idxLevel * nbProcess + procToReceive]]);
                                    memcpy(toSend[idxLevel * nbProcess + procToReceive], temp, previousSize * sizeof(typename OctreeClass::Iterator));
                                    delete[] reinterpret_cast<char*>(temp);
                                }

                                toSend[idxLevel * nbProcess + procToReceive][indexToSend[idxLevel * nbProcess + procToReceive]++] = iterArray[idxCell];
                            }
                        }
                    }
                    if(needOther){
                        leafsNeedOther[idxLevel]->set(idxCell,true);
                    }

                }

            }
            FDEBUG(prepareCounter.tac());

        }

        //////////////////////////////////////////////////////////////////
        // Gather this information
        //////////////////////////////////////////////////////////////////

        FDEBUG(gatherCounter.tic());
        // Every process tells the others
        // what it will send to whom
        int globalReceiveMap[nbProcess * nbProcess * OctreeHeight];
        memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess * OctreeHeight);
        FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_INT, MPI_COMM_WORLD),  __LINE__ );
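        // globalReceiveMap[src * nbProcess * OctreeHeight + idxLevel * nbProcess + dst]
        // is the number of cells that process src will send to process dst at that level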
        FDEBUG(gatherCounter.tac());


        //////////////////////////////////////////////////////////////////
        // Send and receive for real
        //////////////////////////////////////////////////////////////////

        FDEBUG(sendCounter.tic());
        // Then they can send and receive (because they know what they will receive)
        // To send in an asynchronous way
        MPI_Request requests[2 * nbProcess * OctreeHeight];
        MPI_Status status[2 * nbProcess * OctreeHeight];
        int iterRequest = 0;

        const int SizeOfCellToSend = sizeof(MortonIndex) + CellClass::SerializedSizeUp;

        char* sendBuffer[nbProcess * OctreeHeight];
        memset(sendBuffer, 0, sizeof(CellClass*) * nbProcess * OctreeHeight);

        char* recvBuffer[nbProcess * OctreeHeight];
        memset(recvBuffer, 0, sizeof(CellClass*) * nbProcess * OctreeHeight);


        for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                const int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                if(toSendAtProcAtLevel != 0){
                    sendBuffer[idxLevel * nbProcess + idxProc] = new char[toSendAtProcAtLevel * SizeOfCellToSend];

                    for(int idxLeaf = 0 ; idxLeaf < toSendAtProcAtLevel; ++idxLeaf){
                        const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
                        memcpy(&sendBuffer[idxLevel * nbProcess + idxProc][idxLeaf * SizeOfCellToSend],&cellIndex, sizeof(MortonIndex));
                        toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(&sendBuffer[idxLevel * nbProcess + idxProc][idxLeaf * SizeOfCellToSend] + sizeof(MortonIndex));
                    }

                    FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc], toSendAtProcAtLevel * SizeOfCellToSend , MPI_BYTE ,
                                                idxProc, FMpi::TagLast + idxLevel, MPI_COMM_WORLD, &requests[iterRequest++]) , __LINE__ );
                }

                const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                if(toReceiveFromProcAtLevel){
                    recvBuffer[idxLevel * nbProcess + idxProc] = new char[toReceiveFromProcAtLevel * SizeOfCellToSend];

                    FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc], toReceiveFromProcAtLevel * SizeOfCellToSend, MPI_BYTE,
                                               idxProc, FMpi::TagLast + idxLevel, MPI_COMM_WORLD, &requests[iterRequest++]) , __LINE__ );
                }
            }
        }
        FDEBUG(sendCounter.tac());

        //////////////////////////////////////////////////////////////////
        // Do M2L
        //////////////////////////////////////////////////////////////////

        {
            FTRACE( FTrace::FRegion regionTrace("Compute", __FUNCTION__ , __FILE__ , __LINE__) );
            typename OctreeClass::Iterator octreeIterator(tree);
            octreeIterator.moveDown();
            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
            // Now we can compute all the data
            // for each level
            for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                if(idProcess != 0
                        && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){

                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    continue;
                }

                int numberOfCells = 0;
                while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).min){
                    octreeIterator.moveRight();
                }
                // for each cell
                do{
                    iterArray[numberOfCells] = octreeIterator;
                    ++numberOfCells;
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                FDEBUG(computationCounter.tic());
                #pragma omp parallel
                {
                    KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                    const CellClass* neighbors[343];
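                    // 343 = 7x7x7: one slot per possible relative position of an M2L interaction neighbor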

                    #pragma omp for  schedule(dynamic) nowait
                    for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                        const int counter = tree->getInteractionNeighbors(neighbors,  iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel);
                        if(counter) myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
                    }
                }
                FDEBUG(computationCounter.tac());
            }
        }

        //////////////////////////////////////////////////////////////////
        // Wait for the received data and compute
        //////////////////////////////////////////////////////////////////

        // Wait until everything has been received (and sent)
        MPI_Waitall(iterRequest, requests, status);

        {
            FTRACE( FTrace::FRegion regionTrace("Compute Received data", __FUNCTION__ , __FILE__ , __LINE__) );
            FDEBUG(receiveCounter.tic());
            typename OctreeClass::Iterator octreeIterator(tree);
            octreeIterator.moveDown();
            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
            // compute a second time, now using the received cells
            // for each level
            for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                if(idProcess != 0
                        && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){

                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    continue;
                }

                // put the received data into a temporary tree
                FLightOctree tempTree;
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                    const char* const cells = recvBuffer[idxLevel * nbProcess + idxProc];
                    for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
                        MortonIndex cellIndex = 0;
                        memcpy(&cellIndex, &cells[idxCell * SizeOfCellToSend], sizeof(MortonIndex));
                        const char* const cellData = &cells[idxCell * SizeOfCellToSend] + sizeof(MortonIndex);
                        tempTree.insertCell(cellIndex, cellData, idxLevel);
                    }
                }

                // take cells from our octree only if they are
                // linked to received data
                int numberOfCells = 0;
                int realCellId = 0;

                while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).min){
                    octreeIterator.moveRight();
                }
                // for each cell
                do{
                    // copy cells that need data from others
                    if(leafsNeedOther[idxLevel]->get(realCellId++)){
                        iterArray[numberOfCells++] = octreeIterator;
                    }
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                delete leafsNeedOther[idxLevel];
                leafsNeedOther[idxLevel] = 0;

                // Compute these cells
                FDEBUG(computationCounter.tic());
                #pragma omp parallel
                {
                    KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                    MortonIndex neighborsIndex[189];
                    CellClass neighborsData[189];
                    int neighborsPosition[189];
                    const CellClass* neighbors[343];
                    // the array must start zeroed: only received neighbors are filled below
                    memset(neighbors, 0, 343 * sizeof(CellClass*));

                    #pragma omp for  schedule(dynamic) nowait
                    for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                        // compute indexes
                        const int counterNeighbors = getInteractionNeighbors(iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel, neighborsIndex, neighborsPosition);

                        int counter = 0;
                        // did we receive this index from someone?
                        for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){
                            if(neighborsIndex[idxNeig] < getWorkingInterval(idxLevel , idProcess).min
                                    || getWorkingInterval(idxLevel , idProcess).max < neighborsIndex[idxNeig]){
                                const void* const cellFromOtherProc = tempTree.getCell(neighborsIndex[idxNeig], idxLevel);
                                if(cellFromOtherProc){
                                    neighborsData[counter].deserializeUp(cellFromOtherProc);
                                    neighborsData[counter].setMortonIndex(neighborsIndex[idxNeig]);
                                    neighbors[ neighborsPosition[counter] ] = &neighborsData[counter];
                                    ++counter;
                                }
                            }
                        }
                        // need to compute
                        if(counter){
                            myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
                            memset(neighbors, 0, 343 * sizeof(CellClass*));
                        }
                    }
                }
                FDEBUG(computationCounter.tac());
            }
            FDEBUG(receiveCounter.tac());
        }

        for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){
            delete[] sendBuffer[idxComm];
            delete[] recvBuffer[idxComm];
            delete[] reinterpret_cast<char*>( toSend[idxComm] );
        }

        FDEBUG( FDebug::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
    }

    //////////////////////////////////////////////////////////////////
    // ---------------- L2L ---------------
    //////////////////////////////////////////////////////////////////

    void downardPass(){ // second L2L
        FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
        FDEBUG(FTic counterTime);
        FDEBUG(FTic computationCounter);
        FDEBUG(FTic prepareCounter);
        FDEBUG(FTic waitCounter);

        // Start from the top of the tree (level 2)
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.moveDown();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        MPI_Request requests[nbProcess];
        MPI_Status status[nbProcess];

        const int heightMinusOne = OctreeHeight - 1;

        char sendBuffer[CellClass::SerializedSizeDown];
        char recvBuffer[CellClass::SerializedSizeDown];

        // for each level except the leaf level
        for(int idxLevel = 2 ; idxLevel < heightMinusOne ; ++idxLevel ){
            if(idProcess != 0
                    && getWorkingInterval((idxLevel+1) , idProcess).max <= getWorkingInterval((idxLevel+1) , idProcess - 1).max){

                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                continue;
            }

            // copy cells to work with
            int numberOfCells = 0;
            // for each cells
            do{
                iterArray[numberOfCells++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveDown();
            octreeIterator = avoidGotoLeftIterator;

            int firstCellWork = -1;
            while(iterArray[firstCellWork+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).min){
                ++firstCellWork;
            }
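            // iterArray[firstCellWork], if any, is shared with a lower-rank process;
            // its L2L is performed separately once the parent data has been received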

            bool needToRecv = false;
            int iterRequests = 0;

            FDEBUG(prepareCounter.tic());

            // do we need to receive one cell or none?
            if(idProcess != 0
                    && (getWorkingInterval((idxLevel + 1) , idProcess).min >> 3 ) <= (getWorkingInterval((idxLevel+1) , idProcess - 1).max >> 3 ) ){
                needToRecv = true;

                MPI_Irecv( recvBuffer, CellClass::SerializedSizeDown, MPI_BYTE, MPI_ANY_SOURCE, FMpi::TagFmmL2L, MPI_COMM_WORLD, &requests[iterRequests++]);
            }


            if(idProcess != nbProcess - 1){
                int firstProcThatRecv = idProcess + 1;
                while( firstProcThatRecv < nbProcess &&
                       getWorkingInterval((idxLevel + 1) , firstProcThatRecv).max <= getWorkingInterval((idxLevel+1) , idProcess).max){
                    ++firstProcThatRecv;
                }

                int endProcThatRecv = firstProcThatRecv;
                while( endProcThatRecv < nbProcess &&
                       (getWorkingInterval((idxLevel + 1) , endProcThatRecv).min >> 3) <= (getWorkingInterval((idxLevel+1) , idProcess).max >> 3) ){
                    ++endProcThatRecv;
                }

                if(firstProcThatRecv != endProcThatRecv){
                    iterArray[numberOfCells - 1].getCurrentCell()->serializeDown(sendBuffer);
                    for(int idxProc = firstProcThatRecv ; idxProc < endProcThatRecv ; ++idxProc ){

                        MPI_Isend(sendBuffer, CellClass::SerializedSizeDown, MPI_BYTE, idxProc, FMpi::TagFmmL2L, MPI_COMM_WORLD, &requests[iterRequests++]);
                    }
                }
            }
            FDEBUG(prepareCounter.tac());

            FDEBUG(computationCounter.tic());
#pragma omp parallel
            {
                KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
#pragma omp for
                for(int idxCell = firstCellWork + 1 ; idxCell < numberOfCells ; ++idxCell){
                    myThreadkernels.L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
            }
            FDEBUG(computationCounter.tac());

            // are we sending or receiving?
            if(iterRequests){
                // wait for the pending sends/receives
                FDEBUG(waitCounter.tic());
                MPI_Waitall( iterRequests, requests, status);
                FDEBUG(waitCounter.tac());

                if(needToRecv){
                    // Need to compute
                    FDEBUG(computationCounter.tic());
                    iterArray[firstCellWork].getCurrentCell()->deserializeDown(recvBuffer);
                    kernels[0]->L2L( iterArray[firstCellWork].getCurrentCell() , iterArray[firstCellWork].getCurrentChild(), idxLevel);
                    FDEBUG(computationCounter.tac());
                }
            }
        }

        FDEBUG( FDebug::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        FDEBUG( FDebug::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
    }

