FFmmAlgorithmThreadBalanced.hpp 20.2 KB
Newer Older
Quentin Khan's avatar
Quentin Khan committed
1
// ===================================================================================
2 3 4 5
// Copyright ScalFmm 2016 INRIA, Olivier Coulaud, Bérenger Bramas,
// Matthias Messner olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the
// FMM.
6
//
7
// This software is governed by the CeCILL-C and LGPL licenses and
Quentin Khan's avatar
Quentin Khan committed
8
// abiding by the rules of distribution of free software.
9 10 11
// An extension to the license is given to allow static linking of scalfmm
// inside a proprietary application (no matter its license).
// See the main license file for more details.
Quentin Khan's avatar
Quentin Khan committed
12 13 14 15
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 17 18
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
Quentin Khan's avatar
Quentin Khan committed
19
// ===================================================================================
20 21 22 23 24 25 26

// ==== CMAKE ====
// Keep in private GIT
// @SCALFMM_PRIVATE



Quentin Khan's avatar
Quentin Khan committed
27 28 29 30
#ifndef FFMMALGORITHMTHREADBALANCED_HPP
#define FFMMALGORITHMTHREADBALANCED_HPP


31 32
#include "../Src/Utils/FAssert.hpp"
#include "../Src/Utils/FLog.hpp"
Quentin Khan's avatar
Quentin Khan committed
33

34 35 36
#include "../Src/Utils/FTic.hpp"
#include "../Src/Utils/FGlobal.hpp"
#include "../Src/Utils/FAlgorithmTimers.hpp"
Quentin Khan's avatar
Quentin Khan committed
37

38
#include "../Src/Containers/FOctree.hpp"
Quentin Khan's avatar
Quentin Khan committed
39

40 41
#include "../Src/Core/FCoreCommon.hpp"

42
#include "../Src/BalanceTree/FCoordColour.hpp"
43
#include "../Src/BalanceTree/FCostZones.hpp"
44
#include "../Utils/FEnv.hpp"
45

46

47
#include <vector>
Quentin Khan's avatar
Quentin Khan committed
48 49 50 51

#include <omp.h>

/**
52
 * \brief Implements a threaded FMM algorithm using OpenMP.
Quentin Khan's avatar
Quentin Khan committed
53
 *
54 55
 * \author Quentin Khan, original file: Berenger Bramas <berenger.bramas@inria.fr>
 * \copyright Please read the license.
Quentin Khan's avatar
Quentin Khan committed
56
 *
57
 * This class runs a threaded FMM algorithm. It just iterates on a tree and call
Quentin Khan's avatar
Quentin Khan committed
58 59 60 61
 * the kernels with good arguments.  The inspector-executor model is used : the
 * class iterates on the tree and builds an array and works in parallel on this
 * array.
 *
62 63 64 65 66 67 68
 * This algorithm uses the P2P in a thread safe manner, even if the kernel does
 * not initially take care of it. When working on a leaf, a kernel may want to
 * write to the leaf direct neighbours. To avoid any concurrent write, we use 27
 * colours (which is the maximum number of neighbours for a point in a 3D grid).
 * All leaves of a given colour are separated by a least 2 leaves. This means
 * that all threads can work on the same colour at the same time.
 *
69
 * For example, in 2D, one would have a grid looking like the following, where
70 71 72 73 74 75 76 77 78 79
 * each number represents a coloured cell. The grid has been cut to work on
 * cells which have a colour value of 4.
 *
 *     0 1 2 | 0 1 2
 *     3 4 5 | 3 4 5
 *     6 7 8 | 6 7 8
 *     ------+------
 *     0 1 2 | 0 1 2
 *     3 4 5 | 3 4 5
 *     6 7 8 | 6 7 8
Quentin Khan's avatar
Quentin Khan committed
80
 *
81
 * \note Upon destruction, this class does not free pointers given to its constructor.
Quentin Khan's avatar
Quentin Khan committed
82 83 84
 */
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
class FFmmAlgorithmThreadBalanced : public FAbstractAlgorithm, public FAlgorithmTimers{
85

86 87 88
    /// Shortened tree iterator class.
    using TreeIterator   = typename OctreeClass::Iterator;
    /// Factorisation of the class holding the zone bounds.
89
    using ZoneBoundClass = typename FCostZones<OctreeClass, CellClass>::BoundClass;
90

91 92
    OctreeClass* const tree;  ///< The octree to work on.
    KernelClass** kernels;    ///< The kernels.
Quentin Khan's avatar
Quentin Khan committed
93

94 95
    const int MaxThreads;     ///< The maximum number of threads.
    const int OctreeHeight;   ///< The height of the given tree.
Quentin Khan's avatar
Quentin Khan committed
96

97
    /// The vector containing the internal costzones
Quentin Khan's avatar
Quentin Khan committed
98
    const std::vector<std::vector<ZoneBoundClass>>& costzones;
99

100
    /// The vector containing the leaf level costzones
101
    const std::vector<std::vector<ZoneBoundClass>>& leafcostzones;
102
    
Quentin Khan's avatar
Quentin Khan committed
103
public:
104 105
    /** 
     * \brief Class constructor
Quentin Khan's avatar
Quentin Khan committed
106
     * 
107 108
     * The constructor needs the octree, the kernel used for computation and the
     * cost zones.
109 110 111 112 113 114 115
     *
     * \warning Internally, one kernel is built for each thread, and each works
     * on its own copy. This means the kernel cannot assume anything about the
     * parts of the tree it will be executed on.
     *
     * \param inTree      The octree to work on.
     * \param inKernels   The kernel to call.
116 117
     * \param internalCostZones The far-field cost zones for each thread.
     * \param leafCostZones The near-field cost zones for each thread.
Quentin Khan's avatar
Quentin Khan committed
118 119
     *
     * \except An exception is thrown if one of the arguments is NULL.
120 121
     * \except An assertion checks that the number of threads is the same as the
     * number of zones.
Quentin Khan's avatar
Quentin Khan committed
122
     */
123 124 125 126 127
    FFmmAlgorithmThreadBalanced(
        OctreeClass* const inTree,
        KernelClass* const inKernel,
        const std::vector<std::vector<ZoneBoundClass>>& internalCostzones,
        const std::vector<std::vector<ZoneBoundClass>>& leafCostzones) :
128 129
        tree(inTree) , 
        kernels(nullptr),
130
        MaxThreads(FEnv::GetValue("SCALFMM_ALGO_NUM_THREADS",omp_get_max_threads())),
131
        OctreeHeight(tree->getHeight()),
132 133
        costzones(internalCostzones),
        leafcostzones(leafCostzones) {
134
        
135
        FAssertLF(tree, "Tree cannot be null.");
136 137
        FAssertLF(internalCostzones.size() == static_cast<unsigned int>(MaxThreads),
                  std::string("Thread count is different from cost zone count (") +
138 139
                  std::to_string(MaxThreads) + std::string(" : ") +
                  std::to_string(internalCostzones.size()) + ")." );
Quentin Khan's avatar
Quentin Khan committed
140

141
        // Allocation of one kernel per thread (in case of impossible concurent use)
Quentin Khan's avatar
Quentin Khan committed
142
        this->kernels = new KernelClass*[MaxThreads];
143
        // Allocation is done so that kernels are closest to their core.
144 145
        #pragma omp parallel num_threads(MaxThreads)
        {
146
            #pragma omp critical (InitFFmmAlgorithmThreadBalanced)
Quentin Khan's avatar
Quentin Khan committed
147
            {
148
                this->kernels[omp_get_thread_num()] = new KernelClass(*inKernel);
Quentin Khan's avatar
Quentin Khan committed
149 150 151 152 153
            }
        }

        FAbstractAlgorithm::setNbLevelsInTree(OctreeHeight);

154
        FLOG(std::cerr << "FFmmAlgorithmThreadBalanced (Max Thread " << MaxThreads << ")\n");
Quentin Khan's avatar
Quentin Khan committed
155 156
    }

157
    /** \brief Destructor */
Quentin Khan's avatar
Quentin Khan committed
158 159 160 161 162 163 164 165 166
    virtual ~FFmmAlgorithmThreadBalanced(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;
    }

protected:
    /**
167
     * \brief Runs the complete algorithm.
168
     *
169
     * \param operationsToProceed A flag combinaison to specify the operators
170
     * to use. See FFmmOperations in FCoreCommon.hpp.
Quentin Khan's avatar
Quentin Khan committed
171 172 173 174
     */
    void executeCore(const unsigned operationsToProceed) override {

        Timers[P2MTimer].tic();
175 176
        if(operationsToProceed & FFmmP2M)
            bottomPass();
Quentin Khan's avatar
Quentin Khan committed
177 178 179
        Timers[P2MTimer].tac();

        Timers[M2MTimer].tic();
180 181
        if(operationsToProceed & FFmmM2M)
            upwardPass();
Quentin Khan's avatar
Quentin Khan committed
182 183 184
        Timers[M2MTimer].tac();

        Timers[M2LTimer].tic();
185 186
        if(operationsToProceed & FFmmM2L)
            transferPass();
Quentin Khan's avatar
Quentin Khan committed
187 188 189
        Timers[M2LTimer].tac();

        Timers[L2LTimer].tic();
190 191
        if(operationsToProceed & FFmmL2L)
            downardPass();
Quentin Khan's avatar
Quentin Khan committed
192 193 194
        Timers[L2LTimer].tac();

        Timers[NearTimer].tic();
195 196
        if( (operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P) )
            directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));
Quentin Khan's avatar
Quentin Khan committed
197 198 199 200 201 202 203 204
        Timers[NearTimer].tac();

    }

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

205 206 207 208 209 210 211
    /** \brief Runs the P2M kernel.
     *
     * We retrieve the far-field cost zones for the tree leaf level. Each thread
     * runs the P2M kernel on the cells within its zone.
     *
     * \note The lower level of the far-field zones is used.
     */
Quentin Khan's avatar
Quentin Khan committed
212
    void bottomPass(){
213 214
        FLOG( std::cerr << "\tStart Bottom Pass" << std::endl );
        FLOG( FTic timer );
215

216
        #pragma omp parallel num_threads(MaxThreads)
Quentin Khan's avatar
Quentin Khan committed
217
        {
218
            // Thread index ( = zone index )
219
            const int threadIdx = omp_get_thread_num();
220 221 222 223 224 225 226
            // Thread kernels
            KernelClass * const threadKernels = kernels[threadIdx];
            // Iterator to the zone's first cell
            TreeIterator zoneIterator = costzones.at(threadIdx).at(OctreeHeight-1).first;
            // Cell count in zone
            int zoneCellCount         = costzones.at(threadIdx).at(OctreeHeight-1).second;
            
227 228
            // Call P2M on cells
            while ( zoneCellCount-- > 0 ) {
229 230
                threadKernels->P2M(zoneIterator.getCurrentCell(),     // Cell
                                   zoneIterator.getCurrentListSrc()); // Particles
231
                zoneIterator.moveRight();
Quentin Khan's avatar
Quentin Khan committed
232
            }
233
        } // sync barrier
234

235 236 237
        FLOG( timer.tac() );
        FLOG( std::cerr << "\tFinished (@Bottom Pass (P2M) = "
                        << timer.elapsed() << " s)" << std::endl );
Quentin Khan's avatar
Quentin Khan committed
238
    }
239
    
Quentin Khan's avatar
Quentin Khan committed
240 241 242
    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////
243
    
244 245 246 247 248 249 250
    /** \brief Runs the M2M kernel.
     *
     * The M2M kernel is run on every node of the tree starting from the
     * lowest level up to the topmost. There is a synchronisation barrier
     * between each level.
     *
     */
251
    void upwardPass() {
252 253
        FLOG( std::cerr << "\tStart Upward Pass" << std::endl );
        FLOG( FTic timer );
Quentin Khan's avatar
Quentin Khan committed
254

255
        // Running the kernel from the lowest up to the topmost
256
        for(int level = FAbstractAlgorithm::lowerWorkingLevel-2;
257 258
            level >= FAbstractAlgorithm::upperWorkingLevel;
            level--) {
259

260 261
            FLOG( FTic levelTimer );
            FLOG( std::cerr << "\t\t>> Level " << level << std::flush);
262

263
            #pragma omp parallel num_threads(MaxThreads)
Quentin Khan's avatar
Quentin Khan committed
264
            {
265 266 267
                // Thread index ( = zone index)
                const int threadNum = omp_get_thread_num(); 
                // Thread kernels
268
                KernelClass * const myThreadkernels = kernels[threadNum];
269 270 271 272 273 274
                // Iterator to the zone's first cell (for this level)
                TreeIterator zoneIterator = costzones.at(threadNum).at(level).first;
                // Cell count in zone (for this level)
                int zoneCellCount         = costzones.at(threadNum).at(level).second;
               
                // Call M2M on cells
275
                while(zoneCellCount-- > 0) {
Quentin Khan's avatar
Quentin Khan committed
276 277
                    // We need the current cell and the child
                    // child is an array (of 8 child) that may be null
278 279 280 281
                    myThreadkernels->M2M( zoneIterator.getCurrentCell(),
                                          zoneIterator.getCurrentChild(),
                                          zoneIterator.level());
                    zoneIterator.moveRight();
Quentin Khan's avatar
Quentin Khan committed
282
                }
283
            } // barrier between levels
284

285 286 287
            FLOG( levelTimer.tac() );
            FLOG( std::cerr << " = "  << levelTimer.elapsed()
                                   << "s" << std::endl );
288
        }
Quentin Khan's avatar
Quentin Khan committed
289

290 291
        FLOG( std::cerr << "\tFinished (@Upward Pass (M2M) = "
                               << timer.tacAndElapsed() << " s)" << std::endl );
Quentin Khan's avatar
Quentin Khan committed
292 293 294 295 296 297
    }

    /////////////////////////////////////////////////////////////////////////////
    // Transfer
    /////////////////////////////////////////////////////////////////////////////

298 299 300 301 302 303 304 305 306 307 308 309 310 311
    /** \brief Runs the M2L kernel.
     *
     * The M2L kernel is run on every node of the tree starting from the
     * topmost level up to the lowest. There is a synchronisation barrier
     * between each level.
     */
    void transferPass() {
        FLOG( std::cerr << "\tStart Downward Pass (M2L)" << std::endl );
        FLOG( FTic timer );

        // Running the kernel from the topmost to the lowest
        for( int level = FAbstractAlgorithm::upperWorkingLevel ;
             level < FAbstractAlgorithm::lowerWorkingLevel ;
             ++level )
312
        {
313 314
            FLOG( std::cerr << "\t\t>> Level " << level << std::flush );
            FLOG( FTic levelTimer );
315

316
            #pragma omp parallel num_threads(MaxThreads)
Quentin Khan's avatar
Quentin Khan committed
317
            {
318 319 320 321

                // Thread index ( = zone index)
                const int threadNum = omp_get_thread_num(); 
                // Thread kernels
322
                KernelClass * const myThreadkernels = kernels[threadNum];
323 324 325 326 327
                // Iterator to the zone's first cell (for this level)
                TreeIterator zoneIterator = costzones.at(threadNum).at(level).first;
                // Cell count in zone (for this level)
                int zoneCellCount         = costzones.at(threadNum).at(level).second;
                // Array for cell neighbours
328 329
                const CellClass* neighbours[342];
                int neighborPositions[342];
330
                
331
                // Call M2L kernel on cells
332 333 334
                while(zoneCellCount-- > 0) {
                    const int counter =
                        tree->getInteractionNeighbors(
335
                            neighbours, neighborPositions,
336
                            zoneIterator.getCurrentGlobalCoordinate(),
337
                            level);
338 339 340 341
                    if(counter)
                        myThreadkernels->M2L(
                            zoneIterator.getCurrentCell(),
                            neighbours,
342
                            neighborPositions,
343
                            counter,
344
                            level);
345
                    zoneIterator.moveRight();
Quentin Khan's avatar
Quentin Khan committed
346 347
                }

348
                myThreadkernels->finishedLevelM2L(level);
349
            }
350

351 352 353
            FLOG( levelTimer.tac() );
            FLOG( std::cerr << " = " << levelTimer.elapsed()
                                   << " s"  << std::endl );
354 355
        }
                
356 357
        FLOG( std::cerr << "\tFinished (@Downward Pass (M2L) = "
              << timer.tacAndElapsed() << " s)\n" );
Quentin Khan's avatar
Quentin Khan committed
358 359 360 361 362 363
    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////

364 365 366 367 368 369
    /** \brief Runs the L2L kernel. 
     *
     * The L2L kernel is run on every node of the tree starting from the
     * topmost level up to the lowest. There is a synchronisation barrier
     * between each level.
     */
Quentin Khan's avatar
Quentin Khan committed
370
    void downardPass(){
371 372
        FLOG( std::cerr << "\tStart Downward Pass (L2L)" << std::endl );
        FLOG( FTic timer );
Quentin Khan's avatar
Quentin Khan committed
373 374

        // for each levels excepted leaf level
375 376 377
        for(int level = FAbstractAlgorithm::upperWorkingLevel ; 
            level < FAbstractAlgorithm::lowerWorkingLevel - 1;
            ++level ) 
378
        {
379 380
            FLOG( std::cerr << "\t\t>> Level " << level << std::flush);
            FLOG( FTic levelTimer );
381

382
            #pragma omp parallel num_threads(MaxThreads)
Quentin Khan's avatar
Quentin Khan committed
383
            {
384 385 386
                // Thread index ( = zone index)
                const int threadNum = omp_get_thread_num(); 
                // Thread kernels
387
                KernelClass * const myThreadkernels = kernels[threadNum];
388 389 390 391
                // Iterator to the zone's first cell (for this level)
                TreeIterator zoneIterator = costzones.at(threadNum).at(level).first;
                // Cell count in zone (for this level)
                int zoneCellCount         = costzones.at(threadNum).at(level).second;
392
                
393
                // Call L2L kernel on cells
394 395 396 397
                while( zoneCellCount-- > 0 ) {
                    myThreadkernels->L2L(
                        zoneIterator.getCurrentCell(),
                        zoneIterator.getCurrentChild(),
398
                        level);
399
                    zoneIterator.moveRight();
Quentin Khan's avatar
Quentin Khan committed
400 401
                }
            }
402

403 404
            FLOG(levelTimer.tac());
            FLOG( std::cerr << " = "  << levelTimer.elapsed() << " s\n" );
Quentin Khan's avatar
Quentin Khan committed
405 406
        }

407 408
        FLOG( std::cerr << "\tFinished (@Downward Pass (L2L) = " 
                               << timer.tacAndElapsed() << " s)\n" );
Quentin Khan's avatar
Quentin Khan committed
409 410 411 412 413 414 415 416
    }



    /////////////////////////////////////////////////////////////////////////////
    // Direct
    /////////////////////////////////////////////////////////////////////////////

417 418
    /**
     * \brief Runs the P2P & L2P kernels.
Quentin Khan's avatar
Quentin Khan committed
419 420 421 422 423
     *
     * \param p2pEnabled Run the P2P kernel.
     * \param l2pEnabled Run the L2P kernel.
     */
    void directPass(const bool p2pEnabled, const bool l2pEnabled){
424 425
        FLOG( std::cerr << "\tStart Direct Pass" << std::endl );
        FLOG( FTic timer );
Quentin Khan's avatar
Quentin Khan committed
426

427 428 429 430
        /** \brief Data handling structure
         *
         * This structure is used to store data related to a leaf.
         */
431
        struct LeafData {
432 433 434 435
            MortonIndex index;       ///< Leaf Morton index
            CellClass* cell;         ///< Pointer to the cell in the tree
            ContainerClass* targets; ///< Targets for L2P & P2P kernel
            ContainerClass* sources; ///< Sources for P2P kernel
Quentin Khan's avatar
Quentin Khan committed
436 437
        };

438
        #pragma omp parallel num_threads(MaxThreads)
Quentin Khan's avatar
Quentin Khan committed
439
        {
440 441 442 443 444 445 446 447 448
            // Thread index ( = zone index)
            const int threadNum = omp_get_thread_num(); 
            // Thread kernels
            KernelClass * const threadKernels = kernels[threadNum];
            // Iterator to the zone's first cell (for this level)
            TreeIterator zoneIterator;
            // Cell count in zone (for this level)
            int zoneCellCount;
            // Cell neighbours
449 450
            ContainerClass* neighbours[26];
            int neighborPositions[26];
451 452 453 454 455 456
            // Cell data
            LeafData leafdata;
            
            // The cells are coloured in order to make concurent work
            // possible. Cells in a colour can all be computed at the same time.
            for( int colourIdx = 0; colourIdx < FCoordColour::range; colourIdx++ ) {
457
                
458 459 460 461
                // Get the iterator of the first cell and cell count for this
                // thread ( = zone)
                zoneIterator  = leafcostzones.at(threadNum).at(colourIdx).first;
                zoneCellCount = leafcostzones.at(threadNum).at(colourIdx).second;
Quentin Khan's avatar
Quentin Khan committed
462

463 464
                // Skip through all the leaves, work only on those with the
                // right colour
465
                while( zoneCellCount > 0) {
466 467 468 469 470 471 472 473 474
                    if( FCoordColour::coord2colour(zoneIterator.getCurrentCell()
                                                   ->getCoordinate())
                        == colourIdx) {

                        // Store leaf data
                        leafdata.index   = zoneIterator.getCurrentGlobalIndex();
                        leafdata.cell    = zoneIterator.getCurrentCell();
                        leafdata.targets = zoneIterator.getCurrentListTargets();
                        leafdata.sources = zoneIterator.getCurrentListSrc();
475
                        
476
                        // Call L2P kernel
477
                        if( l2pEnabled ) {
478
                            threadKernels->L2P(leafdata.cell, leafdata.targets);
479 480
                        }

481 482 483
                        // Call P2P kernel
                        if( p2pEnabled ) {
                            // needs the current particles and neighbours particles
484
                            const int counter =
485
                                tree->getLeafsNeighbors(neighbours,neighborPositions,
486
                                                        leafdata.cell->getCoordinate(),
487
                                                        OctreeHeight - 1); // Leaf level
488

489 490 491 492
                            threadKernels->P2P(leafdata.cell->getCoordinate(),
                                                leafdata.targets,
                                                leafdata.sources,
                                                neighbours,
493
                                                neighborPositions,
494
                                                counter);
495 496
                        }

497
                        // Decrease cell count only if a cell of the right colour was encountered.
498 499 500
                        zoneCellCount--;
                    }

501
                    zoneIterator.moveRight();
502
                }
503
                // Barrier between computation on two different colours
504
                #pragma omp barrier
Quentin Khan's avatar
Quentin Khan committed
505
            }
506
            
Quentin Khan's avatar
Quentin Khan committed
507 508
        }

509 510 511
        FLOG( timer.tac() );
        FLOG( std::cerr << "\tFinished (@Direct Pass (L2P + P2P) = "
                               << timer.elapsed() << " s)" << std::endl );
Quentin Khan's avatar
Quentin Khan committed
512 513 514 515 516 517
    }

};


#endif //FFMMALGORITHMTHREADBALANCED_HPP