// Keep in private GIT
// @SCALFMM_PRIVATE
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"
#include "../../Utils/FAssert.hpp"

#include "../../Utils/FMpi.hpp"

#include "FOutOfBlockInteraction.hpp"

#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
#include <starpu_mpi.h>
#include "../StarPUUtils/FStarPUUtils.hpp"
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"

#ifdef STARPU_USE_CPU
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
#include "../Cuda/FCudaEmptyCellSymb.hpp"
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
#include "../OpenCl/FEmptyOpenCLCode.hpp"
#endif


template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                                        FCudaGroupOfParticles<int, 0, 0, int>, FCudaGroupAttachedLeaf<int, 0, 0, int>, FCudaEmptyKernel<int> >
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
#endif
          >
class FGroupTaskStarPUMpiAlgorithm : public FAbstractAlgorithm {
protected:
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        , StarPUCudaWrapperClass
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , StarPUOpenClWrapperClass
#endif
    > ThisClass;

    int getTag(const int inLevel, const MortonIndex mindex, const int mode) const{
        int shift = 0;
        int height = tree->getHeight();
        while(height) { shift += 1; height >>= 1; }
        return int((((mindex<<shift) + inLevel) << 5) + mode);
    }
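
    // A worked example of the tag layout (illustration, not part of the original code):
    // 'shift' is the number of bits needed to hold a level index for this tree height,
    // so the tag packs [mindex | level | mode], with 5 bits reserved for 'mode'.
    // Assuming a tree of height 5: shift = 3, and then
    //   getTag(2, 6, 1) = (((6 << 3) + 2) << 5) + 1 = (48 + 2) * 32 + 1 = 1601.
    // Distinct (block index, level, mode) triples thus map to distinct MPI tags.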

    const FMpi::FComm& comm;

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };

    struct CellHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t up;
        starpu_data_handle_t down;
        int intervalSize;
    };

    struct ParticleHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t down;
        int intervalSize;
    };

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The Tree
    KernelClass*const originalCpuKernel;

    std::vector<CellHandles>* cellHandles;
    std::vector<ParticleHandles> particleHandles;

    starpu_codelet p2m_cl;
    starpu_codelet m2m_cl[9];
    starpu_codelet l2l_cl[9];
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;
    starpu_codelet m2l_cl_inout;
    starpu_codelet m2l_cl_inout_mpi;

    starpu_codelet p2p_cl_in;
    starpu_codelet p2p_cl_inout;
    starpu_codelet p2p_cl_inout_mpi;

#ifdef STARPU_USE_CPU
    StarPUCpuWrapperClass cpuWrapper;
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    StarPUCudaWrapperClass cudaWrapper;
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    StarPUOpenClWrapperClass openclWrapper;
#endif

    FStarPUPtrInterface wrappers;
    FStarPUPtrInterface* wrapperptr;

#ifdef STARPU_SUPPORT_ARBITER
    starpu_arbiter_t arbiterPole;
    starpu_arbiter_t arbiterLocal;
    starpu_arbiter_t arbiterParticles;
#endif
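
    // Roughly speaking (a reading note, not original documentation): these three
    // arbiters, when the StarPU build supports them, manage the concurrent
    // commutative accesses requested below via STARPU_COMMUTE_IF_SUPPORTED on the
    // multipole, local, and particle buffers, letting StarPU run any ready task
    // among the commuting writers instead of imposing one fixed order.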
public:
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
          cellHandles(nullptr),
#ifdef STARPU_USE_CPU
            cpuWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            cudaWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            openclWrapper(tree->getHeight()),
#endif
            wrapperptr(&wrappers){
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
        FStarPUFmmPriorities::Controller().init(&conf, tree->getHeight(), inKernels);
        FAssertLF(starpu_init(&conf) == 0);
        FAssertLF(starpu_mpi_init ( 0, 0, 0 ) == 0);

        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&initMutex);

        starpu_pause();

        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];

        initCodelet();
        initCodeletMpi();
#ifdef STARPU_SUPPORT_ARBITER
        arbiterPole = starpu_arbiter_create();
        arbiterLocal = starpu_arbiter_create();
        arbiterParticles = starpu_arbiter_create();
#endif

        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
#ifdef STARPU_USE_CPU
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif
    }

    ~FGroupTaskStarPUMpiAlgorithm(){
        starpu_resume();

        cleanHandle();
        cleanHandleMpi();
        delete[] cellHandles;

        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);


#ifdef STARPU_SUPPORT_ARBITER
        starpu_arbiter_destroy(arbiterPole);
        starpu_arbiter_destroy(arbiterLocal);
        starpu_arbiter_destroy(arbiterParticles);
#endif
        starpu_mpi_shutdown();
        starpu_shutdown();
    }

protected:
    /**
      * Runs the complete algorithm.
      */
    void executeCore(const unsigned operationsToProceed) override {
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );
        const bool directOnly = (tree->getHeight() <= 2);

        #pragma omp parallel
        #pragma omp single
        buildExternalInteractionVecs();
        buildHandles();

        #pragma omp parallel
        #pragma omp single
        buildRemoteInteractionsAndHandles();

        starpu_resume();
        postRecvAllocatedBlocks();

        if( operationsToProceed & FFmmP2P ) insertParticlesSend();

        if(operationsToProceed & FFmmP2M && !directOnly) bottomPass();

        if(operationsToProceed & FFmmM2M && !directOnly) upwardPass();
        if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend();

        if(operationsToProceed & FFmmM2L && !directOnly) transferPass();
        if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi();

        if(operationsToProceed & FFmmL2L && !directOnly) downardPass();

        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();

        if( operationsToProceed & FFmmL2P  && !directOnly) mergePass();

        starpu_task_wait_for_all();
        starpu_pause();
    }
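
    // Summary of the MPI choreography above (annotation, not original text): the
    // receives for the mirrored remote blocks are posted first
    // (postRecvAllocatedBlocks), the matching sends are inserted as StarPU-MPI
    // tasks (insertParticlesSend / insertCellsSend), and each local pass is
    // followed by its *Mpi variant that consumes the received remote blocks
    // (transferPassMpi, directPassMpi).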


    void initCodelet(){
        memset(&p2m_cl, 0, sizeof(p2m_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
#endif
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
        p2m_cl.name = "p2m_cl";
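
        // Every codelet here follows the same pattern (annotation): a callback
        // per device type is attached only when the kernel declares support for
        // it, and the per-buffer access modes tell StarPU which dependencies to
        // enforce. For p2m, buffer 0 is the cells' symbolic part (read), buffer 1
        // the multipoles (read-write), buffer 2 the particles' symbolic part (read).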

        memset(m2m_cl, 0, sizeof(m2m_cl[0])*9);
        memset(l2l_cl, 0, sizeof(l2l_cl[0])*9);
        for(int idx = 0 ; idx < 9 ; ++idx){
#ifdef STARPU_USE_CPU
            if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
                m2m_cl[idx].cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_CPU;
            }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
                m2m_cl[idx].cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_CUDA;
            }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
                m2m_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_OPENCL;
            }
#endif
            m2m_cl[idx].nbuffers = (idx+2)*2;
            m2m_cl[idx].dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl[idx].nbuffers*sizeof(starpu_data_access_mode));
            m2m_cl[idx].dyn_modes[0] = STARPU_R;
            m2m_cl[idx].dyn_modes[1] = STARPU_RW;
            m2m_cl[idx].name = "m2m_cl";

#ifdef STARPU_USE_CPU
            if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
                l2l_cl[idx].cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_CPU;
            }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
                l2l_cl[idx].cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_CUDA;
            }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
                l2l_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_OPENCL;
            }
#endif
            l2l_cl[idx].nbuffers = (idx+2)*2;
            l2l_cl[idx].dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl[idx].nbuffers*sizeof(starpu_data_access_mode));
            l2l_cl[idx].dyn_modes[0] = STARPU_R;
            l2l_cl[idx].dyn_modes[1] = STARPU_R;
            l2l_cl[idx].name = "l2l_cl";

            for(int idxBuffer = 0 ; idxBuffer <= idx ; ++idxBuffer){
                m2m_cl[idx].dyn_modes[(idxBuffer*2)+2] = STARPU_R;
                m2m_cl[idx].dyn_modes[(idxBuffer*2)+3] = STARPU_R;
                l2l_cl[idx].dyn_modes[(idxBuffer*2)+2] = STARPU_R;
                l2l_cl[idx].dyn_modes[(idxBuffer*2)+3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
            }
            }
        }
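
        // Buffer-count arithmetic for m2m/l2l as configured above (annotation):
        // variant idx handles idx+1 child groups, so nbuffers = (idx+2)*2, i.e.
        // one (symbolic, multipole-or-local) handle pair for the parent level
        // plus one pair per child group; the dyn_modes loop fills the child pairs.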

        memset(&l2p_cl, 0, sizeof(l2p_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
#endif
        l2p_cl.nbuffers = 4;
        l2p_cl.modes[0] = STARPU_R;
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        l2p_cl.name = "l2p_cl";

        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_in.name = "p2p_cl_in";
        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout.modes[2] = STARPU_R;
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout.name = "p2p_cl_inout";
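
        // STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED marks accumulation buffers
        // (annotation): several P2P/M2L tasks may add into the same block in any
        // order, so letting the writes commute, when the StarPU version supports
        // it, removes an artificial serialization between neighbor-block tasks.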

        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
        m2l_cl_in.modes[1] = STARPU_R;
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_in.name = "m2l_cl_in";
        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_inout.nbuffers = 6;
        m2l_cl_inout.modes[0] = STARPU_R;
        m2l_cl_inout.modes[1] = STARPU_R;
        m2l_cl_inout.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.modes[3] = STARPU_R;
        m2l_cl_inout.modes[4] = STARPU_R;
        m2l_cl_inout.modes[5] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.name = "m2l_cl_inout";
    }

    /** Unregister (deallocate in the StarPU way) all the defined handles */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
            }
            cellHandles[idxLevel].clear();
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
            }
            particleHandles.clear();
        }
    }

    ////////////////////////////////////////////////////////////////////////////

    void initCodeletMpi(){
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
#endif
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";

        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
    }
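
    // Annotation: the *_mpi codelets mirror their local counterparts but take the
    // mirrored remote blocks as read-only extra buffers. A task may accumulate
    // into the local side of an interaction, while the copy received from the
    // remote owner is never written back, so no result travels over MPI twice.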

    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    struct BlockDescriptor{
        MortonIndex firstIndex;
        MortonIndex lastIndex;
        int owner;
        int bufferSize;
        size_t bufferSizeSymb;
        size_t bufferSizeUp;
        size_t bufferSizeDown;
        size_t leavesBufferSize;
    };
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    std::vector<int> nbBlocksPerLevelAll;
    std::vector<int> nbBlocksBeforeMinPerLevel;

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;

    struct RemoteHandle{
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
        }

        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;

        int intervalSize;
    };
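
    // Annotation: a RemoteHandle is the local mirror of a block owned by another
    // process. Its raw buffers are allocated lazily (see
    // buildRemoteInteractionsAndHandles) and registered with StarPU, so received
    // data can feed tasks exactly like a locally owned block.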

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;

    void buildRemoteInteractionsAndHandles(){
        cleanHandleMpi();

        // We need to have information about all other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
        nbBlocksPerLevel[0] = 0;
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Counts and displacements for the MPI Allgatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all the others
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
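        // Layout reminder (illustration, not original text): nbBlocksPerLevelAll
        // is indexed as [idxProc * height + idxLevel], and
        // processesBlockInfos[idxLevel] holds the BlockDescriptors of every
        // process ordered by rank, so rank 1's descriptors at a level start right
        // after rank 0's count for that level.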
        // Prepare remote ptrs and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

        // From now on we have the number of blocks of every process,
        // and we also have the sizes of the blocks, therefore we can
        // create the handles we need.
        // We will now detect the relations between our blocks and the others
        // during the M2M (which is the same for the L2L),
        // during the M2L, and during the P2P:
        // we need to insert the tasks that read our data or that write the data we need.
        // M2L
        externalInteractionsAllLevelMpi.clear();
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What is my Morton interval at this level?
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;

                    for(int idxCell = 0 ; idxCell < currentCells->getNumberOfCellsInBlock() ; ++idxCell){
                        const MortonIndex mindex = currentCells->getCellMortonIndex(idxCell);

                        MortonIndex interactionsIndexes[189];
                        int interactionsPosition[189];
                        const FTreeCoordinate coord(mindex, idxLevel);
                        int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                            // This interaction needs a block owned by someone else
                            if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                OutOfBlockInteraction property;
                                property.insideIndex = mindex;
                                property.outIndex    = interactionsIndexes[idxInter];
                                property.outPosition = interactionsPosition[idxInter];
                                property.insideIdxInBlock = idxCell;
                                outsideInteractions.push_back(property);
746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764
                            }
                        }
                    }

                    // Manage out-of-block interactions
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

                        const MortonIndex blockStartIdxOther = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;

                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
                            lastOutInteraction += 1;
                        }

                        // Create interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
                                #pragma omp critical(CreateM2LRemotes)
                                {
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
                                        const size_t nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb);
                                        const size_t nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp);
                                    }
                                }
                            }

                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
        // P2P
        // We create one big vector per block
        {
            const MortonIndex myFirstIndex = tree->getParticleGroup(0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getParticleGroup(tree->getNbParticleGroup()-1)->getEndingIndex();

            externalInteractionsLeafLevelMpi.clear();
            externalInteractionsLeafLevelMpi.resize(tree->getNbParticleGroup());
            for(int idxGroup = 0 ; idxGroup < tree->getNbParticleGroup() ; ++idxGroup){
                // Create the vector
                ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);

                std::vector<BlockInteractions<ParticleGroupClass>>* externalInteractions = &externalInteractionsLeafLevelMpi[idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, containers, externalInteractions)
                { // Can be a task(inout:iterCells)
                    std::vector<OutOfBlockInteraction> outsideInteractions;

                    for(int idxLeaf = 0 ; idxLeaf < containers->getNumberOfLeavesInBlock() ; ++idxLeaf){
                        // ParticleContainerClass particles = containers->template getLeaf<ParticleContainerClass>(mindex);
                        const MortonIndex mindex = containers->getLeafMortonIndex(idxLeaf);
                        if(containers->exists(mindex)){
                            MortonIndex interactionsIndexes[26];
                            int interactionsPosition[26];
                            FTreeCoordinate coord(mindex, tree->getHeight()-1);
                            int counter = coord.getNeighborsIndexes(tree->getHeight(),interactionsIndexes,interactionsPosition);

                            for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                                if(interactionsIndexes[idxInter] < myFirstIndex ||
                                        myLastIndex <= interactionsIndexes[idxInter]){
                                    OutOfBlockInteraction property;
                                    property.insideIndex = mindex;
                                    property.outIndex    = interactionsIndexes[idxInter];
                                    property.outPosition = interactionsPosition[idxInter];
                                    outsideInteractions.push_back(property);
                                }
                            }
                        }
                    }

                    // Sort to match external order
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[tree->getHeight()-1]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(tree->getHeight()-1);
                            if(idxOtherGroup == int(processesBlockInfos[tree->getHeight()-1].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size()));
                        }

                        const MortonIndex blockStartIdxOther = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].lastIndex;

                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
                            lastOutInteraction += 1;
                        }

                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
                            if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
                                #pragma omp critical(CreateM2LRemotes)
                                {
                                    if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
                                        const size_t nbBytesInBlock = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].leavesBufferSize;
                                        unsigned char* memoryBlock = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlock);
                                        remoteParticleGroupss[idxOtherGroup].ptrSymb = memoryBlock;
                                        starpu_variable_data_register(&remoteParticleGroupss[idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteParticleGroupss[idxOtherGroup].ptrSymb, nbBytesInBlock);
                                    }
                                }
                            }

                            externalInteractions->emplace_back();
                            BlockInteractions<ParticleGroupClass>* interactions = &externalInteractions->back();
                            //interactions->otherBlock = remoteParticleGroupss[idxOtherGroup].ptr;
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
    }

    struct MpiDependency{
        int src;
        int dest;
        int level;
        int globalBlockId;
    };

    std::vector<MpiDependency> toSend;

    void postRecvAllocatedBlocks(){
        std::vector<MpiDependency> toRecv;
        FAssertLF(tree->getHeight() == int(remoteCellGroups.size()));
        const bool directOnly = (tree->getHeight() <= 2);
        if(!directOnly){
            for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel){
                for(int idxHandle = 0 ; idxHandle < int(remoteCellGroups[idxLevel].size()) ; ++idxHandle){
                    if(remoteCellGroups[idxLevel][idxHandle].ptrSymb){
                        FAssertLF(remoteCellGroups[idxLevel][idxHandle].ptrUp);
                        FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                             " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0) << "\n");
                        FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                             " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1) << "\n");

                        starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleSymb,
                                                    processesBlockInfos[idxLevel][idxHandle].owner,
                                                    getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0),
                                                    comm.getComm(), 0, 0 );
                        starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleUp,
                                                    processesBlockInfos[idxLevel][idxHandle].owner,
                                                    getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1),
                                                    comm.getComm(), 0, 0 );

                        toRecv.push_back({processesBlockInfos[idxLevel][idxHandle].owner,
                                            comm.processId(), idxLevel, idxHandle});
                    }
                }
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(remoteParticleGroupss.size()) ; ++idxHandle){
                if(remoteParticleGroupss[idxHandle].ptrSymb){
                    FLOG(FLog::Controller << "[SMpi] Post a recv during P2P for Idx " << processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex <<
                         " and dest is " << processesBlockInfos[tree->getHeight()-1][idxHandle].owner << " tag " << getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0) << "\n");

                    starpu_mpi_irecv_detached( remoteParticleGroupss[idxHandle].handleSymb,
                                                processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                                getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0),
                                                comm.getComm(), 0, 0 );

                    toRecv.push_back({processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                        comm.processId(), tree->getHeight(), idxHandle});
                }
            }
        }

        FQuickSort<MpiDependency, int>::QsSequential(toRecv.data(),int(toRecv.size()),[](const MpiDependency& d1, const MpiDependency& d2){
            return d1.src <= d2.src;
        });

        std::unique_ptr<int[]> nbBlocksToRecvFromEach(new int[comm.processCount()]);
        memset(nbBlocksToRecvFromEach.get(), 0, sizeof(int)*comm.processCount());
        for(int idxDep = 0 ; idxDep < int(toRecv.size()) ; ++idxDep){
            nbBlocksToRecvFromEach[toRecv[idxDep].src] += 1;
        }

        FAssertLF(nbBlocksToRecvFromEach[comm.processId()] == 0);
        int offset = 0;

        for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
            if(idxProc == comm.processId()){
                // How many blocks to send to each process
                std::unique_ptr<int[]> nbBlocksToSendToEach(new int[comm.processCount()]);
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, nbBlocksToSendToEach.get(), 1,
                                 MPI_INT, idxProc, comm.getComm() ), __LINE__);

                std::unique_ptr<int[]> displs(new int[comm.processCount()]);
                displs[0] = 0;
                for(int idxProcOther = 1 ; idxProcOther < comm.processCount() ; ++idxProcOther){
                    displs[idxProcOther] = displs[idxProcOther-1] + nbBlocksToSendToEach[idxProcOther-1];
                }
                toSend.resize(displs[comm.processCount()-1] + nbBlocksToSendToEach[comm.processCount()-1]);

                // We work in bytes
                for(int idxProcOther = 0 ; idxProcOther < comm.processCount() ; ++idxProcOther){
                    nbBlocksToSendToEach[idxProcOther] *= int(sizeof(MpiDependency));
                    displs[idxProcOther] *= int(sizeof(MpiDependency));
                }

                FMpi::Assert(MPI_Gatherv( nullptr, 0, MPI_BYTE,
                                 toSend.data(),
                                 nbBlocksToSendToEach.get(), displs.get(),
                                 MPI_BYTE, idxProc, comm.getComm()), __LINE__);
            }
            else{
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, 0, 0, MPI_INT, idxProc, comm.getComm() ), __LINE__);
                FMpi::Assert(MPI_Gatherv(
                                 &toRecv[offset], int(nbBlocksToRecvFromEach[idxProc]*sizeof(MpiDependency)), MPI_BYTE,
                                 0, 0, 0, MPI_BYTE, idxProc, comm.getComm() ), __LINE__);

                offset += nbBlocksToRecvFromEach[idxProc];
            }
        }
    }
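
    // Annotation: the gather exchange above inverts the dependency direction.
    // Each process knows which remote blocks it will receive (toRecv), and the
    // MPI_Gather / MPI_Gatherv rounds tell every owner which of its blocks are
    // expected where, filling 'toSend' so the matching isends can be posted later.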

    void insertParticlesSend(){
        for(int idxSd = 0 ; idxSd < int(toSend.size()) ; ++idxSd){
            const MpiDependency sd = toSend[idxSd];
            if(sd.level == tree->getHeight()){
                const int localId = sd.globalBlockId - nbBlocksBeforeMinPerLevel[tree->getHeight()-1];
                FAssertLF(sd.src == comm.processId());
                FAssertLF(0 <= localId);
                FAssertLF(localId < tree->getNbParticleGroup());

                FLOG(FLog::Controller << "[SMpi] Post a send during P2P for Idx " << tree->getParticleGroup(localId)->getStartingIndex() <<
                     " and dest is " << sd.dest << " tag " << getTag(tree->getHeight(),tree->getParticleGroup(localId)->getStartingIndex(), 0) <<  "\n");

                starpu_mpi_isend_detached( particleHandles[localId].symb, sd.dest,
                        getTag(tree->getHeight(),tree->getParticleGroup(localId)->getStartingIndex(), 0),