FGroupTaskStarpuMpiAlgorithm.hpp 123 KB
Newer Older
1 2 3 4
// Keep in private GIT
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

5 6 7 8 9 10 11 12 13 14 15
#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"
#include "../../Utils/FAssert.hpp"

#include "../../Utils/FMpi.hpp"
16

17 18
#include "FOutOfBlockInteraction.hpp"

19 20 21 22 23 24
#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
25
#include <starpu_mpi.h>
26
#include "../StarPUUtils/FStarPUUtils.hpp"
27
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"
28

29
#ifdef STARPU_USE_CPU
30
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
31
#endif
32
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
33 34 35 36 37
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
38
#include "../Cuda/FCudaEmptyCellSymb.hpp"
39
#endif
40
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
41 42
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
43
#include "../OpenCl/FEmptyOpenCLCode.hpp"
44
#endif
45

BRAMAS Berenger's avatar
BRAMAS Berenger committed
46 47
#include "../StarPUUtils/FStarPUReduxCpu.hpp"

48

49
template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
BRAMAS Berenger's avatar
BRAMAS Berenger committed
50 51 52 53 54 55 56
          #ifdef SCALFMM_ENABLE_CUDA_KERNEL
          , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                                              FCudaGroupOfParticles<int, 0, 0, int>, FCudaGroupAttachedLeaf<int, 0, 0, int>, FCudaEmptyKernel<int> >
          #endif
          #ifdef SCALFMM_ENABLE_OPENCL_KERNEL
          , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
          #endif
57
          >
58
class FGroupTaskStarPUMpiAlgorithm : public FAbstractAlgorithm {
59
protected:
60
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
61
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
62
    , StarPUCudaWrapperClass
63
#endif
64
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
65
    , StarPUOpenClWrapperClass
66 67
#endif
    > ThisClass;
68

69
    int getTag(const int inLevel, const MortonIndex mindex, const int mode) const{
70 71 72
        int shift = 0;
        int height = tree->getHeight();
        while(height) { shift += 1; height >>= 1; }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
73
        return int((((mindex<<shift) + inLevel) << 5) + mode);
74 75
    }

76 77 78 79 80 81 82 83 84
    const FMpi::FComm& comm;

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };

85 86 87 88
    struct CellHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t up;
        starpu_data_handle_t down;
89
        int intervalSize;
90 91 92 93 94
    };

    struct ParticleHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t down;
95
        int intervalSize;
96 97
    };

98 99 100 101
    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The Tree
102
    KernelClass*const originalCpuKernel;
103

104 105
    std::vector<CellHandles>* cellHandles;
    std::vector<ParticleHandles> particleHandles;
106 107

    starpu_codelet p2m_cl;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
108 109
    starpu_codelet m2m_cl;
    starpu_codelet l2l_cl;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
110
    starpu_codelet l2l_cl_nocommute;
111 112 113 114
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;
    starpu_codelet m2l_cl_inout;
115
    starpu_codelet m2l_cl_inout_mpi;
116 117 118

    starpu_codelet p2p_cl_in;
    starpu_codelet p2p_cl_inout;
119
    starpu_codelet p2p_cl_inout_mpi;
120

BRAMAS Berenger's avatar
BRAMAS Berenger committed
121 122 123 124 125 126 127

#ifdef STARPU_USE_REDUX
    starpu_codelet p2p_redux_init;
    starpu_codelet p2p_redux_perform;
    starpu_codelet p2p_redux_read;
#endif

128
#ifdef STARPU_USE_CPU
129
    StarPUCpuWrapperClass cpuWrapper;
130
#endif
131
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
132 133
    StarPUCudaWrapperClass cudaWrapper;
#endif
134
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
135 136
    StarPUOpenClWrapperClass openclWrapper;
#endif
137 138 139

    FStarPUPtrInterface wrappers;
    FStarPUPtrInterface* wrapperptr;
140

141
#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
142
    starpu_arbiter_t arbiterGlobal;
143
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
144 145 146 147 148 149 150 151 152 153 154 155

#ifdef STARPU_USE_TASK_NAME
    std::vector<std::unique_ptr<char[]>> m2mTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lOuterTaskNames;
    std::vector<std::unique_ptr<char[]>> l2lTaskNames;
    std::unique_ptr<char[]> p2mTaskNames;
    std::unique_ptr<char[]> l2pTaskNames;
    std::unique_ptr<char[]> p2pTaskNames;
    std::unique_ptr<char[]> p2pOuterTaskNames;
#endif

156
public:
157 158
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
159
          cellHandles(nullptr),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
160 161 162 163 164 165 166 167 168 169
      #ifdef STARPU_USE_CPU
          cpuWrapper(tree->getHeight()),
      #endif
      #ifdef SCALFMM_ENABLE_CUDA_KERNEL
          cudaWrapper(tree->getHeight()),
      #endif
      #ifdef SCALFMM_ENABLE_OPENCL_KERNEL
          openclWrapper(tree->getHeight()),
      #endif
          wrapperptr(&wrappers){
170 171 172
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

173 174
        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

175 176
        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
177
#ifdef SCALFMM_STARPU_USE_PRIO
178
        FStarPUFmmPriorities::Controller().init(&conf, tree->getHeight(), inKernels);
179
#endif
180
        FAssertLF(starpu_init(&conf) == 0);
181
        FAssertLF(starpu_mpi_init ( 0, 0, 0 ) == 0);
182

BRAMAS Berenger's avatar
BRAMAS Berenger committed
183 184
        starpu_malloc_set_align(32);

185 186
        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
187
#ifdef STARPU_USE_CPU
188 189 190 191 192
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
193
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
194
#endif
195
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
196 197 198 199 200 201 202
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
203
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
204 205 206 207 208 209
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
210
#endif
211 212
        starpu_pthread_mutex_destroy(&initMutex);

213 214
        starpu_pause();

215
        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];
216

217
#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
218
        arbiterGlobal = starpu_arbiter_create();
219
#endif
220

BRAMAS Berenger's avatar
BRAMAS Berenger committed
221 222
        initCodelet();
        initCodeletMpi();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
223
        rebuildInteractions();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
224

BRAMAS Berenger's avatar
BRAMAS Berenger committed
225
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
226
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
227
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
228
#endif
229
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
230 231
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
232
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
233 234
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265

        buildTaskNames();
    }

    void buildTaskNames(){
#ifdef STARPU_USE_TASK_NAME
        const int namesLength = 128;
        m2mTaskNames.resize(tree->getHeight());
        m2lTaskNames.resize(tree->getHeight());
        m2lOuterTaskNames.resize(tree->getHeight());
        l2lTaskNames.resize(tree->getHeight());
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            m2mTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2mTaskNames[idxLevel].get(), namesLength, "M2M-level-%d", idxLevel);
            m2lTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2lTaskNames[idxLevel].get(), namesLength, "M2L-level-%d", idxLevel);
            m2lOuterTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2lOuterTaskNames[idxLevel].get(), namesLength, "M2L-out-level-%d", idxLevel);
            l2lTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(l2lTaskNames[idxLevel].get(), namesLength, "L2L-level-%d", idxLevel);
        }

        p2mTaskNames.reset(new char[namesLength]);
        snprintf(p2mTaskNames.get(), namesLength, "P2M");
        l2pTaskNames.reset(new char[namesLength]);
        snprintf(l2pTaskNames.get(), namesLength, "L2P");
        p2pTaskNames.reset(new char[namesLength]);
        snprintf(p2pTaskNames.get(), namesLength, "P2P");
        p2pOuterTaskNames.reset(new char[namesLength]);
        snprintf(p2pOuterTaskNames.get(), namesLength, "P2P-out");
#endif
266 267
    }

268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
    void syncData(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].up);
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].down);
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R);
                starpu_data_release(particleHandles[idxHandle].symb);
                starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R);
                starpu_data_release(particleHandles[idxHandle].down);
            }
        }
    }

289
    ~FGroupTaskStarPUMpiAlgorithm(){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
290 291
        starpu_resume();

292
        cleanHandle();
293
        cleanHandleMpi();
294
        delete[] cellHandles;
295

BRAMAS Berenger's avatar
BRAMAS Berenger committed
296 297 298 299 300 301 302 303 304 305
        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
306
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
307 308 309 310 311 312 313
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
314
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
315 316 317 318 319 320 321 322 323
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);

324 325

#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
326
        starpu_arbiter_destroy(arbiterGlobal);
327
#endif
328
        starpu_mpi_shutdown();
329 330 331
        starpu_shutdown();
    }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
332
    void rebuildInteractions(){
333
        FAssertLF(getenv("OMP_WAIT_POLICY") == nullptr
BRAMAS Berenger's avatar
BRAMAS Berenger committed
334 335
                || strcmp(getenv("OMP_WAIT_POLICY"), "PASSIVE") == 0
                  || strcmp(getenv("OMP_WAIT_POLICY"), "passive") == 0);
336

BRAMAS Berenger's avatar
BRAMAS Berenger committed
337 338
#pragma omp parallel
#pragma omp single
339 340 341
        buildExternalInteractionVecs();
        buildHandles();

BRAMAS Berenger's avatar
BRAMAS Berenger committed
342 343
#pragma omp parallel
#pragma omp single
344
        buildRemoteInteractionsAndHandles();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
345 346 347
    }

protected:
348

BRAMAS Berenger's avatar
BRAMAS Berenger committed
349 350 351 352 353 354 355 356 357 358 359
    /**
      * Runs the complete algorithm.
      */
    void executeCore(const unsigned operationsToProceed) override {
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );
        const bool directOnly = (tree->getHeight() <= 2);

#ifdef STARPU_USE_CPU
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.start());
#endif

360
        starpu_resume();
361
        postRecvAllocatedBlocks();
362

363
        if( operationsToProceed & FFmmP2P ) insertParticlesSend();
364

365
        if(operationsToProceed & FFmmP2M && !directOnly) bottomPass();
366

BRAMAS Berenger's avatar
BRAMAS Berenger committed
367
        if(operationsToProceed & FFmmM2M && !directOnly) upwardPass();
368
        if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend();
369

BRAMAS Berenger's avatar
BRAMAS Berenger committed
370 371 372
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::upperWorkingLevel, FAbstractAlgorithm::lowerWorkingLevel-1 , true, true);
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::lowerWorkingLevel-1, FAbstractAlgorithm::lowerWorkingLevel, false, false);
        if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi();
373

BRAMAS Berenger's avatar
BRAMAS Berenger committed
374
        if(operationsToProceed & FFmmL2L && !directOnly) downardPass();
375

BRAMAS Berenger's avatar
BRAMAS Berenger committed
376
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::lowerWorkingLevel-1, FAbstractAlgorithm::lowerWorkingLevel, true, true);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
377

378 379
        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
380

BRAMAS Berenger's avatar
BRAMAS Berenger committed
381 382 383 384
        if( operationsToProceed & FFmmL2P && !directOnly) mergePass();
#ifdef STARPU_USE_REDUX
        if( operationsToProceed & FFmmL2P && !directOnly) readParticle();
#endif
385 386

        starpu_task_wait_for_all();
387 388 389 390 391

        FLOG( FTic timerSync; );
        syncData();
        FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );

392
        starpu_pause();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
393 394 395 396 397

#ifdef STARPU_USE_CPU
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.end());
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.saveToDisk("/tmp/taskstime-FGroupTaskStarPUAlgorithm.txt"));
#endif
398 399
    }

400

401 402
    void initCodelet(){
        memset(&p2m_cl, 0, sizeof(p2m_cl));
403
#ifdef STARPU_USE_CPU
404
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
405 406 407
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
408
#endif
409
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
410
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
411 412 413 414
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
415
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
416
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
417 418 419
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
420
#endif
421 422 423 424
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
425
        p2m_cl.name = "p2m_cl";
426

BRAMAS Berenger's avatar
BRAMAS Berenger committed
427
        memset(&m2m_cl, 0, sizeof(m2m_cl));
428
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
429 430 431 432
        if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
            m2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CPU;
        }
433
#endif
434
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
435 436 437 438
        if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
            m2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CUDA;
        }
439
#endif
440
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
441 442 443 444
        if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
            m2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_OPENCL;
        }
445
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
446 447 448 449 450 451 452 453 454
        m2m_cl.nbuffers = 4;
        m2m_cl.dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl.nbuffers*sizeof(starpu_data_access_mode));
        m2m_cl.dyn_modes[0] = STARPU_R;
        m2m_cl.dyn_modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2m_cl.name = "m2m_cl";
        m2m_cl.dyn_modes[2] = STARPU_R;
        m2m_cl.dyn_modes[3] = STARPU_R;

        memset(&l2l_cl, 0, sizeof(l2l_cl));
455
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
456 457 458 459
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CPU;
        }
460
#endif
461
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
462 463 464 465
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CUDA;
        }
466
#endif
467
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
468 469 470
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_OPENCL;
471
        }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
472 473 474 475 476 477 478 479
#endif
        l2l_cl.nbuffers = 4;
        l2l_cl.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl.dyn_modes[0] = STARPU_R;
        l2l_cl.dyn_modes[1] = STARPU_R;
        l2l_cl.name = "l2l_cl";
        l2l_cl.dyn_modes[2] = STARPU_R;
        l2l_cl.dyn_modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
480

BRAMAS Berenger's avatar
BRAMAS Berenger committed
481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507
        memset(&l2l_cl_nocommute, 0, sizeof(l2l_cl_nocommute));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl_nocommute.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl_nocommute.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl_nocommute.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_OPENCL;
        }
#endif
        l2l_cl_nocommute.nbuffers = 4;
        l2l_cl_nocommute.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl_nocommute.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl_nocommute.dyn_modes[0] = STARPU_R;
        l2l_cl_nocommute.dyn_modes[1] = STARPU_R;
        l2l_cl_nocommute.name = "l2l_cl";
        l2l_cl_nocommute.dyn_modes[2] = STARPU_R;
        l2l_cl_nocommute.dyn_modes[3] = STARPU_RW;

508
        memset(&l2p_cl, 0, sizeof(l2p_cl));
509
#ifdef STARPU_USE_CPU
510
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
511 512 513
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
514
#endif
515
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
516
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
517 518 519 520
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
521
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
522
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
523 524 525
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
526
#endif
527
        l2p_cl.nbuffers = 4;
528
        l2p_cl.modes[0] = STARPU_R;
529 530
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
531 532 533
#ifdef STARPU_USE_REDUX
        l2p_cl.modes[3] = STARPU_REDUX;
#else
534
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
535
#endif
536
        l2p_cl.name = "l2p_cl";
537 538

        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
539
#ifdef STARPU_USE_CPU
540
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
541 542 543
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
544
#endif
545
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
546
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
547 548 549 550
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
551
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
552
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
553 554 555
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
556
#endif
557 558
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
559 560 561
#ifdef STARPU_USE_REDUX
        p2p_cl_in.modes[1] = STARPU_REDUX;
#else
562
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
563
#endif
564
        p2p_cl_in.name = "p2p_cl_in";
565
        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
566
#ifdef STARPU_USE_CPU
567
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
568 569 570
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
571
#endif
572
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
573
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
574 575 576 577
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
578
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
579
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
580 581 582
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
583
#endif
584 585
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
586 587 588
#ifdef STARPU_USE_REDUX
        p2p_cl_inout.modes[1] = STARPU_REDUX;
#else
589
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
590
#endif
591
        p2p_cl_inout.modes[2] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
592 593 594
#ifdef STARPU_USE_REDUX
        p2p_cl_inout.modes[3] = STARPU_REDUX;
#else
595
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
596
#endif
597
        p2p_cl_inout.name = "p2p_cl_inout";
598 599

        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
600
#ifdef STARPU_USE_CPU
601
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
602 603 604
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
605
#endif
606
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
607
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
608 609 610 611
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
612
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
613
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
614 615 616
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
617
#endif
618 619
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
620
        m2l_cl_in.modes[1] = STARPU_R;
621
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
622
        m2l_cl_in.name = "m2l_cl_in";
BRAMAS Berenger's avatar
BRAMAS Berenger committed
623

624
        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
625
#ifdef STARPU_USE_CPU
626
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
627 628 629
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
630
#endif
631
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
632
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
633 634 635 636
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
637
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
638
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
639 640 641
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
642
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
643
        m2l_cl_inout.nbuffers = 4;
644
        m2l_cl_inout.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
645 646
        m2l_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.modes[2] = STARPU_R;
647
        m2l_cl_inout.modes[3] = STARPU_R;
648
        m2l_cl_inout.name = "m2l_cl_inout";
BRAMAS Berenger's avatar
BRAMAS Berenger committed
649

650

BRAMAS Berenger's avatar
BRAMAS Berenger committed
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693
#ifdef STARPU_USE_REDUX
        memset(&p2p_redux_init, 0, sizeof(p2p_redux_init));
#ifdef STARPU_USE_CPU
        p2p_redux_init.cpu_funcs[0] = FStarPUReduxCpu::InitData<typename ParticleGroupClass::ParticleDataType>;
        p2p_redux_init.where |= STARPU_CPU;
#endif
        p2p_redux_init.nbuffers = 1;
        p2p_redux_init.modes[0] = STARPU_RW;
        p2p_redux_init.name = "p2p_redux_init";

        memset(&p2p_redux_perform, 0, sizeof(p2p_redux_perform));
#ifdef STARPU_USE_CPU
        p2p_redux_perform.cpu_funcs[0] = FStarPUReduxCpu::ReduceData<typename ParticleGroupClass::ParticleDataType>;
        p2p_redux_perform.where |= STARPU_CPU;
#endif
        p2p_redux_perform.nbuffers = 2;
        p2p_redux_perform.modes[0] = STARPU_RW;
        p2p_redux_perform.modes[1] = STARPU_R;
        p2p_redux_perform.name = "p2p_redux_perform";

        memset(&p2p_redux_read, 0, sizeof(p2p_redux_read));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            p2p_redux_read.cpu_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            p2p_redux_read.cuda_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            p2p_redux_read.opencl_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_OPENCL;
        }
#endif
        p2p_redux_read.nbuffers = 1;
        p2p_redux_read.modes[0] = STARPU_R;
        p2p_redux_read.name = "p2p_redux_read";
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
694
    }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
695

696 697 698
    /** dealloc in a starpu way all the defined handles */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
699 700 701 702
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
703
            }
704
            cellHandles[idxLevel].clear();
705 706
        }
        {
707 708 709
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
710
            }
711
            particleHandles.clear();
712 713 714 715
        }
    }

    ////////////////////////////////////////////////////////////////////////////
716 717

    void initCodeletMpi(){
718
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
719
#ifdef STARPU_USE_CPU
720
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
721 722 723
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
724
#endif
725
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
726
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
727 728 729 730
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
731
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
732
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
733 734 735
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
736
#endif
737 738
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
739
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
740
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
741
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";
742

743
        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
744
#ifdef STARPU_USE_CPU
745
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
746 747 748
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
749
#endif
750
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
751
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
752 753 754 755
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
756
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
757
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
758 759 760
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
761
#endif
762 763
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
764
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
765 766
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
767
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
768 769
    }

770 771 772 773 774 775
    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    struct BlockDescriptor{
        MortonIndex firstIndex;
        MortonIndex lastIndex;
        int owner;
        int bufferSize;
776 777 778 779
        size_t bufferSizeSymb;
        size_t bufferSizeUp;
        size_t bufferSizeDown;
        size_t leavesBufferSize;
780 781 782 783 784 785 786 787
    };
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    std::vector<int> nbBlocksPerLevelAll;
    std::vector<int> nbBlocksBeforeMinPerLevel;

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;

788
    struct RemoteHandle{
789 790 791 792
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
793 794
        }

795 796 797 798 799 800
        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
801 802

        int intervalSize;
803 804 805 806 807
    };

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;

808
    void buildRemoteInteractionsAndHandles(){
809 810
        cleanHandleMpi();

811 812
        // We need to have information about all other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
813
        nbBlocksPerLevel[0] = 0;
814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Count and displacement for the MPI all gatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
857 858 859
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();
860 861 862 863 864 865 866 867 868 869 870 871 872

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all other
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
873 874 875 876 877 878 879
        // Prepare remate ptr and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

880 881 882 883 884 885 886 887
        // From now we have the number of blocks for all process
        // we also have the size of the blocks therefor we can
        // create the handles we need
        // We will now detect the relation between our blocks and others
        // During the M2M (which is the same for the L2L)
        // During the M2L and during the P2P
        // I need to insert the task that read my data or that write the data I need.
        // M2L
888
        externalInteractionsAllLevelMpi.clear();
889 890 891 892 893 894 895 896 897 898 899 900 901 902
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What are my morton interval at this level
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
903
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);
904 905 906

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

BRAMAS Berenger's avatar
BRAMAS Berenger committed
907
#pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
908 909 910
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;

911 912 913 914 915 916 917 918 919 920 921 922 923 924
                    for(int idxCell = 0 ; idxCell < currentCells->getNumberOfCellsInBlock() ; ++idxCell){
                        const MortonIndex mindex = currentCells->getCellMortonIndex(idxCell);

                        MortonIndex interactionsIndexes[189];
                        int interactionsPosition[189];
                        const FTreeCoordinate coord(mindex, idxLevel);
                        int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                            // This interactions need a block owned by someoneelse
                            if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                OutOfBlockInteraction property;
                                property.insideIndex = mindex;
                                property.outIndex    = interactionsIndexes[idxInter];
BRAMAS Berenger's avatar
BRAMAS Berenger committed
925
                                property.relativeOutPosition = interactionsPosition[idxInter];
926
                                property.insideIdxInBlock = idxCell;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
927
                                property.outsideIdxInBlock = -1;
928
                                outsideInteractions.push_back(property);
929 930 931 932 933 934 935 936 937
                            }
                        }
                    }

                    // Manage outofblock interaction
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
BRAMAS Berenger's avatar
BRAMAS Berenger committed
938
                        && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
939 940 941 942 943 944 945 946 947
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
948 949
                        const MortonIndex blockStartIdxOther = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;
950

BRAMAS Berenger's avatar
BRAMAS Berenger committed
951
                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
952 953 954 955
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
956
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
957 958 959 960 961 962
                            lastOutInteraction += 1;
                        }

                        // Create interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
963
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
964
#pragma omp critical(CreateM2LRemotes)
965
                                {
966
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
967
                                        const size_t nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
968
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
969 970 971
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
972
                                        const size_t nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
973
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
974 975 976
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp);
977 978
                                    }
                                }
979 980
                            }

981 982
                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
983
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
        // P2P
        // We create one big vector per block
        {
            const MortonIndex myFirstIndex = tree->getParticleGroup(0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getParticleGroup(tree->getNbParticleGroup()-1)->getEndingIndex();

1002
            externalInteractionsLeafLevelMpi.clear();
1003 1004 1005 1006 1007 1008 1009
            externalInteractionsLeafLevelMpi.resize(tree->getNbParticleGroup());
            for(int idxGroup = 0 ; idxGroup < tree->getNbParticleGroup() ; ++idxGroup){
                // Create the vector
                ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);

                std::vector<BlockInteractions<ParticleGroupClass>>* externalInteractions = &externalInteractionsLeafLevelMpi[idxGroup];

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1010
#pragma omp task default(none) firstprivate(idxGroup, containers, externalInteractions)
1011 1012 1013
                { // Can be a task(inout:iterCells)
                    std::vector<OutOfBlockInteraction> outsideInteractions;

1014
                    for(int idxLeaf = 0 ; idxLeaf < containers->getNumberOfLeavesInBlock() ; ++idxLeaf){
1015
                        // ParticleContainerClass particles = containers->template getLeaf<ParticleContainerClass>(mindex);
1016
                        const MortonIndex mindex = containers->getLeafMortonIndex(idxLeaf);
1017
                        if(containers->exists(mindex)){
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028
                            MortonIndex interactionsIndexes[26];
                            int interactionsPosition[26];
                            FTreeCoordinate coord(mindex, tree->getHeight()-1);
                            int counter = coord.getNeighborsIndexes(tree->getHeight(),interactionsIndexes,interactionsPosition);

                            for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                                if(interactionsIndexes[idxInter] < myFirstIndex ||
                                        myLastIndex <= interactionsIndexes[idxInter]){
                                    OutOfBlockInteraction property;
                                    property.insideIndex = mindex;
                                    property.outIndex    = interactionsIndexes[idxInter];
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1029
                                    property.relativeOutPosition = interactionsPosition[idxInter];
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1030
                                    property.outsideIdxInBlock = -1;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1031
                                    property.insideIdxInBlock = idxLeaf;
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042
                                    outsideInteractions.push_back(property);
                                }
                            }
                        }
                    }

                    // Sort to match external order
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size())
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1043
                        && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
1044 1045 1046 1047 1048 1049 1050 1051 1052
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[tree->getHeight()-1]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(tree->getHeight()-1);
                            if(idxOtherGroup == int(processesBlockInfos[tree->getHeight()-1].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size()));
                        }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1053 1054
                        const MortonIndex blockStartIdxOther = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].lastIndex;
1055

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1056
                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
1057 1058 1059 1060
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1061
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
1062 1063 1064 1065 1066
                            lastOutInteraction += 1;
                        }

                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
1067
                            if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1068
#pragma omp critical(CreateM2LRemotes)
1069
                                {
1070
                                    if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1071
                                        const