// Keep in private GIT
// @SCALFMM_PRIVATE
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"

#include "../../Utils/FMpi.hpp"

#include "FOutOfBlockInteraction.hpp"

#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
#include <starpu_mpi.h>

#include "../StarPUUtils/FStarPUUtils.hpp"
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"

#ifdef STARPU_USE_CPU
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
#include "../Cuda/FCudaEmptyCellSymb.hpp"
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
#include "../OpenCl/FEmptyOpenCLCode.hpp"
#endif


template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                                        FCudaGroupOfParticles<int, 0, 0, int>, FCudaGroupAttachedLeaf<int, 0, 0, int>, FCudaEmptyKernel<int> >
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
#endif
          >
class FGroupTaskStarPUMpiAlgorithm : public FAbstractAlgorithm {
protected:
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        , StarPUCudaWrapperClass
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , StarPUOpenClWrapperClass
#endif
    > ThisClass;

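    // Builds an MPI tag by packing (Morton index, level, mode): the Morton index is
    // shifted left by enough bits to hold any level index (shift = bit length of the
    // tree height), then 5 low bits are reserved for the communication mode.
    // For deep trees this packing can exceed the tag range an MPI implementation is
    // required to guarantee, so it implicitly assumes large tag support.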
    int getTag(const int inLevel, const MortonIndex mindex, const int mode) const{
        int shift = 0;
        int height = tree->getHeight();
        while(height) { shift += 1; height >>= 1; }
        return int((((mindex<<shift) + inLevel) << 5) + mode);
    }

    const FMpi::FComm& comm;

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };
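
    // Each cell block is exposed to StarPU through three handles: the symbolic part
    // (structure and Morton indices), the multipole part ("up"), and the local part
    // ("down"); particle blocks only need the symbolic part and a writable "down"
    // part that receives the results.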

    struct CellHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t up;
        starpu_data_handle_t down;
        int intervalSize;
    };

    struct ParticleHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t down;
        int intervalSize;
    };

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The tree
    KernelClass*const originalCpuKernel;

    std::vector<CellHandles>* cellHandles;
    std::vector<ParticleHandles> particleHandles;

    starpu_codelet p2m_cl;
    starpu_codelet m2m_cl[9];
    starpu_codelet l2l_cl[9];
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;
    starpu_codelet m2l_cl_inout;
    starpu_codelet m2l_cl_inout_mpi;

    starpu_codelet p2p_cl_in;
    starpu_codelet p2p_cl_inout;
    starpu_codelet p2p_cl_inout_mpi;

#ifdef STARPU_USE_CPU
    StarPUCpuWrapperClass cpuWrapper;
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    StarPUCudaWrapperClass cudaWrapper;
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    StarPUOpenClWrapperClass openclWrapper;
#endif

    FStarPUPtrInterface wrappers;
    FStarPUPtrInterface* wrapperptr;

#ifdef STARPU_SUPPORT_ARBITER
    starpu_arbiter_t arbiterPole;
    starpu_arbiter_t arbiterLocal;
    starpu_arbiter_t arbiterParticles;
#endif

public:
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
          cellHandles(nullptr),
#ifdef STARPU_USE_CPU
            cpuWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            cudaWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            openclWrapper(tree->getHeight()),
#endif
            wrapperptr(&wrappers){
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
        FStarPUFmmPriorities::Controller().init(&conf, tree->getHeight(), inKernels);
        FAssertLF(starpu_init(&conf) == 0);
        FAssertLF(starpu_mpi_init( 0, 0, 0 ) == 0);

        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&initMutex);

        starpu_pause();

        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];

        initCodelet();
        initCodeletMpi();
#ifdef STARPU_SUPPORT_ARBITER
        arbiterPole = starpu_arbiter_create();
        arbiterLocal = starpu_arbiter_create();
        arbiterParticles = starpu_arbiter_create();
#endif

        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
#ifdef STARPU_USE_CPU
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif
    }

    ~FGroupTaskStarPUMpiAlgorithm(){
        starpu_resume();

        cleanHandle();
        cleanHandleMpi();
        delete[] cellHandles;

        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);

#ifdef STARPU_SUPPORT_ARBITER
        starpu_arbiter_destroy(arbiterPole);
        starpu_arbiter_destroy(arbiterLocal);
        starpu_arbiter_destroy(arbiterParticles);
#endif
        starpu_mpi_shutdown();
        starpu_shutdown();
    }
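
    // Overview of a run: interaction lists and handles are (re)built, the receives
    // for remote blocks are posted up front, and the FMM passes are then inserted as
    // StarPU tasks; the *PassMpi variants are the ones that consume remote blocks,
    // so communication can overlap with the local passes.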

protected:
    /**
      * Runs the complete algorithm.
      */
    void executeCore(const unsigned operationsToProceed) override {
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );
        const bool directOnly = (tree->getHeight() <= 2);

        #pragma omp parallel
        #pragma omp single
        buildExternalInteractionVecs();
        buildHandles();

        #pragma omp parallel
        #pragma omp single
        buildRemoteInteractionsAndHandles();

        starpu_resume();
        postRecvAllocatedBlocks();

        if( operationsToProceed & FFmmP2P ) insertParticlesSend();

        if(operationsToProceed & FFmmP2M && !directOnly) bottomPass();

        if(operationsToProceed & FFmmM2M && !directOnly) upwardPass();
        if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend();

        if(operationsToProceed & FFmmM2L && !directOnly) transferPass();
        if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi();

        if(operationsToProceed & FFmmL2L && !directOnly) downardPass();

        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();

        if( operationsToProceed & FFmmL2P  && !directOnly) mergePass();

        starpu_task_wait_for_all();
        starpu_pause();
    }
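
    // Codelet setup: each codelet advertises the architectures supported by the
    // kernel and the access mode of every buffer; STARPU_COMMUTE_IF_SUPPORTED lets
    // StarPU reorder tasks that only accumulate into the same block.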

    void initCodelet(){
        memset(&p2m_cl, 0, sizeof(p2m_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
#endif
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
        p2m_cl.name = "p2m_cl";

        memset(m2m_cl, 0, sizeof(m2m_cl[0])*9);
        memset(l2l_cl, 0, sizeof(l2l_cl[0])*9);
        for(int idx = 0 ; idx < 9 ; ++idx){
#ifdef STARPU_USE_CPU
            if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
                m2m_cl[idx].cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_CPU;
            }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
                m2m_cl[idx].cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_CUDA;
            }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
                m2m_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_OPENCL;
            }
#endif
            m2m_cl[idx].nbuffers = (idx+2)*2;
            m2m_cl[idx].dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl[idx].nbuffers*sizeof(starpu_data_access_mode));
            m2m_cl[idx].dyn_modes[0] = STARPU_R;
            m2m_cl[idx].dyn_modes[1] = STARPU_RW;
            m2m_cl[idx].name = "m2m_cl";

#ifdef STARPU_USE_CPU
            if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
                l2l_cl[idx].cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_CPU;
            }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
                l2l_cl[idx].cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_CUDA;
            }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
                l2l_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_OPENCL;
            }
#endif
            l2l_cl[idx].nbuffers = (idx+2)*2;
            l2l_cl[idx].dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl[idx].nbuffers*sizeof(starpu_data_access_mode));
            l2l_cl[idx].dyn_modes[0] = STARPU_R;
            l2l_cl[idx].dyn_modes[1] = STARPU_R;
            l2l_cl[idx].name = "l2l_cl";

            for(int idxBuffer = 0 ; idxBuffer <= idx ; ++idxBuffer){
                m2m_cl[idx].dyn_modes[(idxBuffer*2)+2] = STARPU_R;
                m2m_cl[idx].dyn_modes[(idxBuffer*2)+3] = STARPU_R;
                l2l_cl[idx].dyn_modes[(idxBuffer*2)+2] = STARPU_R;
                l2l_cl[idx].dyn_modes[(idxBuffer*2)+3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
            }
        }

        memset(&l2p_cl, 0, sizeof(l2p_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
#endif
        l2p_cl.nbuffers = 4;
        l2p_cl.modes[0] = STARPU_R;
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        l2p_cl.name = "l2p_cl";

        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_in.name = "p2p_cl_in";

        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout.modes[2] = STARPU_R;
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout.name = "p2p_cl_inout";

        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
        m2l_cl_in.modes[1] = STARPU_R;
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_in.name = "m2l_cl_in";

        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_inout.nbuffers = 6;
        m2l_cl_inout.modes[0] = STARPU_R;
        m2l_cl_inout.modes[1] = STARPU_R;
        m2l_cl_inout.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.modes[3] = STARPU_R;
        m2l_cl_inout.modes[4] = STARPU_R;
        m2l_cl_inout.modes[5] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.name = "m2l_cl_inout";
    }

    /** Unregister from StarPU all the defined handles */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
            }
            cellHandles[idxLevel].clear();
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
            }
            particleHandles.clear();
        }
    }

    ////////////////////////////////////////////////////////////////////////////
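    // The MPI flavors of the P2P and M2L codelets work on blocks received from other
    // processes: remote buffers are read-only inputs (STARPU_R), while only locally
    // owned blocks are written (STARPU_RW, commutable where supported).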

    void initCodeletMpi(){
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
#endif
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";

        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
    }
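
    // Every process publishes one descriptor per block and per level (Morton
    // interval, owner, buffer sizes); after the allgather below, any process can
    // allocate a correctly sized mirror of any remote block it interacts with.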

    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    struct BlockDescriptor{
        MortonIndex firstIndex;
        MortonIndex lastIndex;
        int owner;
        int bufferSize;
        size_t bufferSizeSymb;
        size_t bufferSizeUp;
        size_t bufferSizeDown;
        size_t leavesBufferSize;
    };
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    std::vector<int> nbBlocksPerLevelAll;
    std::vector<int> nbBlocksBeforeMinPerLevel;

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;

    struct RemoteHandle{
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
        }

        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;

        int intervalSize;
    };

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;
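
    // Mirrors of other processes' blocks; they are allocated lazily, i.e. memory and
    // handles are only created for the remote blocks that appear in an interaction list.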

    void buildRemoteInteractionsAndHandles(){
        cleanHandleMpi();

        // We need information about all the other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
        nbBlocksPerLevel[0] = 0;
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Count and displacement for the MPI allgatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all the others
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
        // Prepare the remote pointers and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

        // From now on we know the number of blocks of every process,
        // and we also know their sizes, therefore we can create the handles we need.
        // We now detect the relations between our blocks and the others,
        // during the M2M (which is the same for the L2L),
        // during the M2L and during the P2P:
        // we need to insert the tasks that read my data or that write the data I need.
        // M2L
        externalInteractionsAllLevelMpi.clear();
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level on there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What is my Morton interval at this level
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;
                    const MortonIndex blockStartIdx = currentCells->getStartingIndex();
                    const MortonIndex blockEndIdx   = currentCells->getEndingIndex();

                    for(MortonIndex mindex = blockStartIdx ; mindex < blockEndIdx ; ++mindex){
                        if(currentCells->exists(mindex)){
                            const typename CellContainerClass::CompleteCellClass cell = currentCells->getCompleteCell(mindex);
                            FAssertLF(cell.getMortonIndex() == mindex);
                            MortonIndex interactionsIndexes[189];
                            int interactionsPosition[189];
                            const FTreeCoordinate coord(cell.getCoordinate());
                            int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                            for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                                // These interactions need a block owned by someone else
                                if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                    OutOfBlockInteraction property;
                                    property.insideIndex = mindex;
                                    property.outIndex    = interactionsIndexes[idxInter];
                                    property.outPosition = interactionsPosition[idxInter];
                                    outsideInteractions.push_back(property);
                                }
                            }
                        }
                    }

                    // Manage the out-of-block interactions
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

                        const MortonIndex blockStartIdxOther = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;

                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
                            lastOutInteraction += 1;
                        }

                        // Create the interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
                                #pragma omp critical(CreateM2LRemotes)
                                {
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
                                        const size_t nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb);
                                        const size_t nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp);
                                    }
                                }
                            }

                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
        // P2P
        // We create one big vector per block
        {
            const MortonIndex myFirstIndex = tree->getParticleGroup(0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getParticleGroup(tree->getNbParticleGroup()-1)->getEndingIndex();

            externalInteractionsLeafLevelMpi.clear();
            externalInteractionsLeafLevelMpi.resize(tree->getNbParticleGroup());
            for(int idxGroup = 0 ; idxGroup < tree->getNbParticleGroup() ; ++idxGroup){
                // Create the vector
                ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);

                std::vector<BlockInteractions<ParticleGroupClass>>* externalInteractions = &externalInteractionsLeafLevelMpi[idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, containers, externalInteractions)
                { // Can be a task(inout:iterCells)
                    std::vector<OutOfBlockInteraction> outsideInteractions;
                    const MortonIndex blockStartIdx = containers->getStartingIndex();
                    const MortonIndex blockEndIdx   = containers->getEndingIndex();

                    for(MortonIndex mindex = blockStartIdx ; mindex < blockEndIdx ; ++mindex){
                        // ParticleContainerClass particles = containers->template getLeaf<ParticleContainerClass>(mindex);
                        if(containers->exists(mindex)){
                            MortonIndex interactionsIndexes[26];
                            int interactionsPosition[26];
                            FTreeCoordinate coord(mindex, tree->getHeight()-1);
                            int counter = coord.getNeighborsIndexes(tree->getHeight(),interactionsIndexes,interactionsPosition);

                            for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                                if(interactionsIndexes[idxInter] < myFirstIndex ||
                                        myLastIndex <= interactionsIndexes[idxInter]){
                                    OutOfBlockInteraction property;
                                    property.insideIndex = mindex;
                                    property.outIndex    = interactionsIndexes[idxInter];
                                    property.outPosition = interactionsPosition[idxInter];
                                    outsideInteractions.push_back(property);
                                }
                            }
                        }
                    }

                    // Sort to match the external order
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[tree->getHeight()-1]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(tree->getHeight()-1);
                            if(idxOtherGroup == int(processesBlockInfos[tree->getHeight()-1].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size()));
                        }

                        const MortonIndex blockStartIdxOther = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].lastIndex;

                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
                            lastOutInteraction += 1;
                        }

                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
                            if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
                                #pragma omp critical(CreateM2LRemotes)
                                {
                                    if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
                                        const size_t nbBytesInBlock = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].leavesBufferSize;
                                        unsigned char* memoryBlock = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlock);
                                        remoteParticleGroupss[idxOtherGroup].ptrSymb = memoryBlock;
                                        starpu_variable_data_register(&remoteParticleGroupss[idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteParticleGroupss[idxOtherGroup].ptrSymb, nbBytesInBlock);
                                    }
                                }
                            }

                            externalInteractions->emplace_back();
                            BlockInteractions<ParticleGroupClass>* interactions = &externalInteractions->back();
                            //interactions->otherBlock = remoteParticleGroupss[idxOtherGroup].ptr;
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
    }
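
    // A dependency records that the block globalBlockId of a given level flows from
    // process src to process dest: receivers know what they need, and the gather in
    // postRecvAllocatedBlocks() tells each owner what it has to send.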
    struct MpiDependency{
        int src;
        int dest;
        int level;
        int globalBlockId;
    };

    std::vector<MpiDependency> toSend;

    void postRecvAllocatedBlocks(){
        std::vector<MpiDependency> toRecv;
        FAssertLF(tree->getHeight() == int(remoteCellGroups.size()));
        const bool directOnly = (tree->getHeight() <= 2);
        if(!directOnly){
            for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel){
                for(int idxHandle = 0 ; idxHandle < int(remoteCellGroups[idxLevel].size()) ; ++idxHandle){
                    if(remoteCellGroups[idxLevel][idxHandle].ptrSymb){
                        FAssertLF(remoteCellGroups[idxLevel][idxHandle].ptrUp);
                        FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                             " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0) << "\n");
                        FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                             " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1) << "\n");

                        starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleSymb,
                                                    processesBlockInfos[idxLevel][idxHandle].owner,
                                                    getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0),
                                                    comm.getComm(), 0, 0 );
                        starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleUp,
                                                    processesBlockInfos[idxLevel][idxHandle].owner,
                                                    getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1),
                                                    comm.getComm(), 0, 0 );

                        toRecv.push_back({processesBlockInfos[idxLevel][idxHandle].owner,
                                            comm.processId(), idxLevel, idxHandle});
                    }
                }
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(remoteParticleGroupss.size()) ; ++idxHandle){
                if(remoteParticleGroupss[idxHandle].ptrSymb){
                    FLOG(FLog::Controller << "[SMpi] Post a recv during P2P for Idx " << processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex <<
                         " and dest is " << processesBlockInfos[tree->getHeight()-1][idxHandle].owner << " tag " << getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0) << "\n");

                    starpu_mpi_irecv_detached( remoteParticleGroupss[idxHandle].handleSymb,
                                                processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                                getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0),
                                                comm.getComm(), 0, 0 );

                    toRecv.push_back({processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                        comm.processId(), tree->getHeight(), idxHandle});
                }
            }
        }

        FQuickSort<MpiDependency, int>::QsSequential(toRecv.data(),int(toRecv.size()),[](const MpiDependency& d1, const MpiDependency& d2){
            return d1.src <= d2.src;
        });

        std::unique_ptr<int[]> nbBlocksToRecvFromEach(new int[comm.processCount()]);
        memset(nbBlocksToRecvFromEach.get(), 0, sizeof(int)*comm.processCount());
        for(int idxDep = 0 ; idxDep < int(toRecv.size()) ; ++idxDep){
            nbBlocksToRecvFromEach[toRecv[idxDep].src] += 1;
        }

        FAssertLF(nbBlocksToRecvFromEach[comm.processId()] == 0);
        int offset = 0;

        for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
            if(idxProc == comm.processId()){
                // How much to send to each
                std::unique_ptr<int[]> nbBlocksToSendToEach(new int[comm.processCount()]);
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, nbBlocksToSendToEach.get(), 1,
                                 MPI_INT, idxProc, comm.getComm() ), __LINE__);

                std::unique_ptr<int[]> displs(new int[comm.processCount()]);
                displs[0] = 0;
                for(int idxProcOther = 1 ; idxProcOther < comm.processCount() ; ++idxProcOther){
                    displs[idxProcOther] = displs[idxProcOther-1] + nbBlocksToSendToEach[idxProcOther-1];
                }
                toSend.resize(displs[comm.processCount()-1] + nbBlocksToSendToEach[comm.processCount()-1]);

                // We work in bytes
                for(int idxProcOther = 0 ; idxProcOther < comm.processCount() ; ++idxProcOther){
                    nbBlocksToSendToEach[idxProcOther] *= int(sizeof(MpiDependency));
                    displs[idxProcOther] *= int(sizeof(MpiDependency));
                }

                FMpi::Assert(MPI_Gatherv( nullptr, 0, MPI_BYTE,
                                 toSend.data(),
                                 nbBlocksToSendToEach.get(), displs.get(),
                                 MPI_BYTE, idxProc, comm.getComm()), __LINE__);
            }
            else{
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, 0, 0, MPI_INT, idxProc, comm.getComm() ), __LINE__);
                FMpi::Assert(MPI_Gatherv(
                                 &toRecv[offset], int(nbBlocksToRecvFromEach[idxProc]*sizeof(MpiDependency)), MPI_BYTE,
                                 0, 0, 0, MPI_BYTE, idxProc, comm.getComm() ), __LINE__);

                offset += nbBlocksToRecvFromEach[idxProc];
            }
        }
    }
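
    // The owners answer the receive plan: for each entry of toSend, post the matching
    // detached isend with the same tag as the irecv posted above.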

    void insertParticlesSend(){
        for(int idxSd = 0 ; idxSd < int(toSend.size()) ; ++idxSd){
            const MpiDependency sd = toSend[idxSd];
            if(sd.level == tree->getHeight()){
                const int localId = sd.globalBlockId - nbBlocksBeforeMinPerLevel[tree->getHeight()-1];
                FAssertLF(sd.src == comm.processId());
                FAssertLF(0 <= localId);
                FAssertLF(localId < tree->getNbParticleGroup());

                FLOG(FLog::Controller << "[SMpi] Post a send during P2P for Idx " << tree->getParticleGroup(localId)->getStartingIndex() <<
                     " and dest is " << sd.dest << " tag " << getTag(tree->getHeight(),tree->getParticleGroup(localId)->getStartingIndex(), 0) <<  "\n");

                starpu_mpi_isend_detached( particleHandles[localId].symb, sd.dest,
                        getTag(tree->getHeight(),tree->getParticleGroup(localId)->getStartingIndex(), 0),