FGroupTaskStarpuMpiAlgorithm.hpp 111 KB
Newer Older
1 2 3 4 5
// Keep in private GIT
// @SCALFMM_PRIVATE
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

6 7 8 9 10 11 12 13 14 15 16
#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"
#include "../../Utils/FAssert.hpp"

#include "../../Utils/FMpi.hpp"
17

18 19
#include "FOutOfBlockInteraction.hpp"

20 21 22 23 24 25
#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
26
#include <starpu_mpi.h>
27
#include "../StarPUUtils/FStarPUUtils.hpp"
28
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"
29

30
#ifdef STARPU_USE_CPU
31
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
32
#endif
33
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
34 35 36 37 38
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
39
#include "../Cuda/FCudaEmptyCellSymb.hpp"
40
#endif
41
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
42 43
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
44
#include "../OpenCl/FEmptyOpenCLCode.hpp"
45
#endif
46

47

48
template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
49
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
50
    , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
BRAMAS Berenger's avatar
BRAMAS Berenger committed
51
                                                        FCudaGroupOfParticles<int, 0, 0, int>, FCudaGroupAttachedLeaf<int, 0, 0, int>, FCudaEmptyKernel<int> >
52
#endif
53
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
54
    , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
55 56
#endif
          >
57
class FGroupTaskStarPUMpiAlgorithm : public FAbstractAlgorithm {
58
protected:
59
    // Alias of this (heavily parameterized) class, with the optional CUDA/OpenCL
    // wrapper parameters repeated under the same #ifdef's as in the class template head.
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        , StarPUCudaWrapperClass
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , StarPUOpenClWrapperClass
#endif
    > ThisClass;
67

68
    /** Builds an MPI tag from (level, morton index, mode).
      * Layout (low to high bits): 5 bits for the mode, then enough bits for the
      * level, then the morton index. The level field width is the bit-length of
      * the tree height, so any level in [0, height) fits without overlapping.
      * NOTE(review): the result is truncated to int — assumes tags stay small
      * enough for the MPI implementation's tag range. */
    int getTag(const int inLevel, const MortonIndex mindex, const int mode) const{
        // levelBits = bit-length of the tree height.
        int levelBits = 0;
        for(int remaining = tree->getHeight() ; remaining != 0 ; remaining >>= 1){
            levelBits += 1;
        }
        return int((((mindex<<levelBits) + inLevel) << 5) + mode);
    }

75 76 77 78 79 80 81 82 83
    const FMpi::FComm& comm; //< MPI communicator used for all inter-process exchanges

    /** Interactions of the current block with one other block:
      * the other block, its index, and the list of out-of-block interactions. */
    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;                     //< the paired block
        int otherBlockId;                                //< index of that block at its level
        std::vector<OutOfBlockInteraction> interactions; //< pairwise interactions to process
    };

84 85 86 87
    /** StarPU data handles registered for one group/block of cells.
      * Three separate handles are kept so codelets can declare distinct access
      * modes on each part (symb is always accessed read-only by the codelets
      * below; up/down are the written parts). */
    struct CellHandles{
        starpu_data_handle_t symb; //< symbolic/shared part of the cell block
        starpu_data_handle_t up;   //< "up" data (written by P2M/M2M)
        starpu_data_handle_t down; //< "down" data (written by M2L/L2L)
        int intervalSize;          //< number of intervals covered by the block
    };

    /** StarPU data handles registered for one group/block of particles. */
    struct ParticleHandles{
        starpu_data_handle_t symb; //< symbolic/shared part (read-only in codelets)
        starpu_data_handle_t down; //< results written by P2P/L2P
        int intervalSize;
    };

97 98 99 100
    // Per level, per block: interactions with other LOCAL cell blocks.
    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    // Per leaf block: P2P interactions with other LOCAL particle blocks.
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The Tree
    KernelClass*const originalCpuKernel; //< kernel used to query capabilities (supportXxx) and priorities

    std::vector<CellHandles>* cellHandles;        //< array (size = tree height) of per-level handle vectors
    std::vector<ParticleHandles> particleHandles; //< handles of the leaf/particle blocks
105 106

    // --- StarPU codelets for the local FMM passes (filled by initCodelet()) ---
    starpu_codelet p2m_cl;
    starpu_codelet m2m_cl;
    starpu_codelet l2l_cl;
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;        //< M2L inside one block
    starpu_codelet m2l_cl_inout;     //< M2L between two local blocks
    starpu_codelet m2l_cl_inout_mpi; //< M2L against a block received from another process (initCodeletMpi())

    starpu_codelet p2p_cl_in;        //< P2P inside one block
    starpu_codelet p2p_cl_inout;     //< P2P between two local blocks
    starpu_codelet p2p_cl_inout_mpi; //< P2P against a remote block (initCodeletMpi())

    // --- Per-architecture kernel wrappers, registered into `wrappers` ---
#ifdef STARPU_USE_CPU
    StarPUCpuWrapperClass cpuWrapper;
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    StarPUCudaWrapperClass cudaWrapper;
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    StarPUOpenClWrapperClass openclWrapper;
#endif

    FStarPUPtrInterface wrappers;    //< dispatch table: arch index -> wrapper instance
    FStarPUPtrInterface* wrapperptr; //< passed as cl_arg to every task

#ifdef STARPU_SUPPORT_ARBITER
    starpu_arbiter_t arbiterGlobal; //< arbiter serializing commutative accesses
#endif

#ifdef STARPU_USE_TASK_NAME
    // Human-readable task names (one per level for level-dependent passes);
    // owned here because StarPU keeps only the char* we hand it.
    std::vector<std::unique_ptr<char[]>> m2mTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lOuterTaskNames;
    std::vector<std::unique_ptr<char[]>> l2lTaskNames;
    std::unique_ptr<char[]> p2mTaskNames;
    std::unique_ptr<char[]> l2pTaskNames;
    std::unique_ptr<char[]> p2pTaskNames;
    std::unique_ptr<char[]> p2pOuterTaskNames;
#endif

147
public:
148 149
    /** Initializes StarPU (+ StarPU-MPI), the per-architecture kernel wrappers,
      * the codelets and the task names. Workers are paused at the end of the
      * constructor; executeCore() resumes them when tasks are submitted.
      * @param inComm   MPI communicator used by the algorithm
      * @param inTree   the group octree (must not be null)
      * @param inKernels kernel prototype, cloned/initialized once per worker (must not be null)
      */
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
          cellHandles(nullptr),
#ifdef STARPU_USE_CPU
            cpuWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            cudaWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            openclWrapper(tree->getHeight()),
#endif
            wrapperptr(&wrappers){
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

        // Configure priorities BEFORE starpu_init so the scheduler sees them.
        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
        FStarPUFmmPriorities::Controller().init(&conf, tree->getHeight(), inKernels);
        FAssertLF(starpu_init(&conf) == 0);
        FAssertLF(starpu_mpi_init ( 0, 0, 0 ) == 0);

        // initKernel is serialized with a mutex: one kernel copy per worker,
        // but the kernel constructor may not be thread-safe.
        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&initMutex);

        // Keep workers idle until executeCore() calls starpu_resume().
        starpu_pause();

        // One vector of cell handles per level.
        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];

#ifdef STARPU_SUPPORT_ARBITER
        arbiterGlobal = starpu_arbiter_create();
#endif

        initCodelet();
        initCodeletMpi();

        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
#ifdef STARPU_USE_CPU
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif

        buildTaskNames();
    }

    /** Allocates and fills the human-readable task labels used when
      * STARPU_USE_TASK_NAME is enabled (one label per level for the
      * level-dependent passes, a single label for the others).
      * No-op otherwise. */
    void buildTaskNames(){
#ifdef STARPU_USE_TASK_NAME
        const int labelCapacity = 128;
        const int treeHeight = tree->getHeight();

        m2mTaskNames.resize(treeHeight);
        m2lTaskNames.resize(treeHeight);
        m2lOuterTaskNames.resize(treeHeight);
        l2lTaskNames.resize(treeHeight);

        for(int level = 0 ; level < treeHeight ; ++level){
            m2mTaskNames[level].reset(new char[labelCapacity]);
            snprintf(m2mTaskNames[level].get(), labelCapacity, "M2M-level-%d", level);

            m2lTaskNames[level].reset(new char[labelCapacity]);
            snprintf(m2lTaskNames[level].get(), labelCapacity, "M2L-level-%d", level);

            m2lOuterTaskNames[level].reset(new char[labelCapacity]);
            snprintf(m2lOuterTaskNames[level].get(), labelCapacity, "M2L-out-level-%d", level);

            l2lTaskNames[level].reset(new char[labelCapacity]);
            snprintf(l2lTaskNames[level].get(), labelCapacity, "L2L-level-%d", level);
        }

        p2mTaskNames.reset(new char[labelCapacity]);
        snprintf(p2mTaskNames.get(), labelCapacity, "P2M");
        l2pTaskNames.reset(new char[labelCapacity]);
        snprintf(l2pTaskNames.get(), labelCapacity, "L2P");
        p2pTaskNames.reset(new char[labelCapacity]);
        snprintf(p2pTaskNames.get(), labelCapacity, "P2P");
        p2pOuterTaskNames.reset(new char[labelCapacity]);
        snprintf(p2pOuterTaskNames.get(), labelCapacity, "P2P-out");
#endif
    }

    /** Releases all StarPU resources in the reverse order of construction:
      * resume workers (they were left paused), unregister the data handles,
      * release the per-worker kernels, destroy the arbiter, then shut down
      * StarPU-MPI and StarPU. */
    ~FGroupTaskStarPUMpiAlgorithm(){
        // Workers must be running for unregister/shutdown to proceed.
        starpu_resume();

        cleanHandle();
        cleanHandleMpi();
        delete[] cellHandles;

        // releaseKernel is serialized, mirroring the locked initKernel calls
        // in the constructor.
        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);


#ifdef STARPU_SUPPORT_ARBITER
        starpu_arbiter_destroy(arbiterGlobal);
#endif
        // MPI layer must be shut down before the StarPU core.
        starpu_mpi_shutdown();
        starpu_shutdown();
    }

297 298 299 300 301
protected:
    /**
      * Runs the complete algorithm.
      */
    void executeCore(const unsigned operationsToProceed) override {
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );
        // With height <= 2 there are no internal levels: only the P2P passes run.
        const bool directOnly = (tree->getHeight() <= 2);

        // Interaction lists are built by tasks spawned inside a single region.
        #pragma omp parallel
        #pragma omp single
        buildExternalInteractionVecs();
        buildHandles();

        #pragma omp parallel
        #pragma omp single
        buildRemoteInteractionsAndHandles();

        // Workers were paused since the constructor; start them and pre-post
        // the receives before any send is inserted.
        starpu_resume();
        postRecvAllocatedBlocks();

        if( operationsToProceed & FFmmP2P ) insertParticlesSend();

        if(operationsToProceed & FFmmP2M && !directOnly) bottomPass();

        if(operationsToProceed & FFmmM2M && !directOnly) upwardPass();
        // Cells must be sent after the upward pass filled them.
        if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend();

        if(operationsToProceed & FFmmM2L && !directOnly) transferPass();
        if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi();

        if(operationsToProceed & FFmmL2L && !directOnly) downardPass();

        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();

        if( operationsToProceed & FFmmL2P  && !directOnly) mergePass();

        // Drain all submitted tasks, then leave the workers paused again.
        starpu_task_wait_for_all();
        starpu_pause();
    }

338

339 340
    /** Fills every local-pass codelet: for each architecture that is compiled in
      * AND that the kernel reports supporting (supportXxx), the matching wrapper
      * callback is installed and the architecture is added to `where`.
      * Buffer access modes follow the same convention everywhere:
      * symbolic data is STARPU_R, written data is STARPU_RW (with
      * STARPU_COMMUTE_IF_SUPPORTED when several tasks may accumulate into it). */
    void initCodelet(){
        // P2M: (particles symb R, cells up RW, cells symb R)
        memset(&p2m_cl, 0, sizeof(p2m_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
#endif
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
        p2m_cl.name = "p2m_cl";

        // M2M: 4 buffers, dynamic modes (parent up is the commutative RW one).
        memset(&m2m_cl, 0, sizeof(m2m_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
            m2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
            m2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
            m2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_OPENCL;
        }
#endif
        // dyn_modes is malloc'ed (not new[]) because StarPU owns/frees it.
        m2m_cl.nbuffers = 4;
        m2m_cl.dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl.nbuffers*sizeof(starpu_data_access_mode));
        m2m_cl.dyn_modes[0] = STARPU_R;
        m2m_cl.dyn_modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2m_cl.name = "m2m_cl";
        m2m_cl.dyn_modes[2] = STARPU_R;
        m2m_cl.dyn_modes[3] = STARPU_R;

        // L2L: mirror of M2M, the child down buffer is the commutative RW one.
        memset(&l2l_cl, 0, sizeof(l2l_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_OPENCL;
        }
#endif
        l2l_cl.nbuffers = 4;
        l2l_cl.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl.dyn_modes[0] = STARPU_R;
        l2l_cl.dyn_modes[1] = STARPU_R;
        l2l_cl.name = "l2l_cl";
        l2l_cl.dyn_modes[2] = STARPU_R;
        l2l_cl.dyn_modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);

        // L2P: (cells symb R, cells down R, particles symb R, particles down RW).
        memset(&l2p_cl, 0, sizeof(l2p_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
#endif
        l2p_cl.nbuffers = 4;
        l2p_cl.modes[0] = STARPU_R;
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        l2p_cl.name = "l2p_cl";

        // P2P inside a single block.
        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_in.name = "p2p_cl_in";
        // P2P between two local blocks: both "down" buffers are written.
        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout.modes[2] = STARPU_R;
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout.name = "p2p_cl_inout";

        // M2L inside a single block.
        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
        m2l_cl_in.modes[1] = STARPU_R;
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_in.name = "m2l_cl_in";

        // M2L between two local blocks (only the first block's down is written;
        // the symmetric update is submitted as a second task).
        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_inout.nbuffers = 4;
        m2l_cl_inout.modes[0] = STARPU_R;
        m2l_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.modes[2] = STARPU_R;
        m2l_cl_inout.modes[3] = STARPU_R;
        m2l_cl_inout.name = "m2l_cl_inout";
    }

    /** dealloc in a starpu way all the defined handles */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
549 550 551 552
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
553
            }
554
            cellHandles[idxLevel].clear();
555 556
        }
        {
557 558 559
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
560
            }
561
            particleHandles.clear();
562 563 564 565
        }
    }

    ////////////////////////////////////////////////////////////////////////////
566 567

    /** Fills the MPI-specific codelets: same architecture/capability wiring as
      * initCodelet(), but these operate on a LOCAL block (read/write) against a
      * block RECEIVED from another process (read-only), so only the local side
      * is written. */
    void initCodeletMpi(){
        // P2P vs remote block: (local symb R, local down RW, remote symb R).
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
#endif
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";

        // M2L vs remote block: (local symb R, local down RW, remote symb R, remote up R).
        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
    }

620 621 622 623 624 625
    // Presumably the Morton interval assigned to each process, per level —
    // filled outside this chunk; TODO confirm against the caller.
    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    // Compact description of one cell block, exchanged between ranks as raw
    // bytes (MPI_BYTE with sizeof(BlockDescriptor) in
    // buildRemoteInteractionsAndHandles) — it must therefore stay trivially
    // copyable with an identical layout on every process.
    struct BlockDescriptor{
        MortonIndex firstIndex;   // first Morton index covered by the block
        MortonIndex lastIndex;    // ending Morton index (exclusive: tests use `lastIndex <= idx`)
        int owner;                // rank that owns the block
        int bufferSize;           // NOTE(review): never written in the visible code — confirm before relying on it
        size_t bufferSizeSymb;    // bytes of the block's symbolic buffer (getBufferSizeInByte)
        size_t bufferSizeUp;      // bytes of the multipole/upward buffer
        size_t bufferSizeDown;    // bytes of the local/downward buffer
        size_t leavesBufferSize;  // particle buffer bytes at the leaf level, 0 elsewhere
    };
    // processesBlockInfos[level][globalBlockIdx]: descriptors of ALL ranks' blocks.
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    // Row-major (processCount x height) table: blocks per level for each rank.
    std::vector<int> nbBlocksPerLevelAll;
    // Per level: number of blocks owned by ranks with a smaller id than mine.
    std::vector<int> nbBlocksBeforeMinPerLevel;

    // Interactions that reach into blocks owned by other processes:
    // per level, per local group (M2L), and per particle group (P2P).
    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;

638
    struct RemoteHandle{
639 640 641 642
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
643 644
        }

645 646 647 648 649 650
        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
651 652

        int intervalSize;
653 654 655 656 657
    };

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;

658
    void buildRemoteInteractionsAndHandles(){
659 660
        cleanHandleMpi();

661 662
        // We need to have information about all other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
663
        nbBlocksPerLevel[0] = 0;
664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Count and displacement for the MPI all gatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
707 708 709
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();
710 711 712 713 714 715 716 717 718 719 720 721 722

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all other
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
723 724 725 726 727 728 729
        // Prepare remate ptr and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

730 731 732 733 734 735 736 737
        // From now we have the number of blocks for all process
        // we also have the size of the blocks therefor we can
        // create the handles we need
        // We will now detect the relation between our blocks and others
        // During the M2M (which is the same for the L2L)
        // During the M2L and during the P2P
        // I need to insert the task that read my data or that write the data I need.
        // M2L
738
        externalInteractionsAllLevelMpi.clear();
739 740 741 742 743 744 745 746 747 748 749 750 751 752
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What are my morton interval at this level
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
753
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);
754 755 756 757 758 759 760

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;

761 762 763 764 765 766 767 768 769 770 771 772 773 774
                    for(int idxCell = 0 ; idxCell < currentCells->getNumberOfCellsInBlock() ; ++idxCell){
                        const MortonIndex mindex = currentCells->getCellMortonIndex(idxCell);

                        MortonIndex interactionsIndexes[189];
                        int interactionsPosition[189];
                        const FTreeCoordinate coord(mindex, idxLevel);
                        int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                            // This interactions need a block owned by someoneelse
                            if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                OutOfBlockInteraction property;
                                property.insideIndex = mindex;
                                property.outIndex    = interactionsIndexes[idxInter];
BRAMAS Berenger's avatar
BRAMAS Berenger committed
775
                                property.relativeOutPosition = interactionsPosition[idxInter];
776 777
                                property.insideIdxInBlock = idxCell;
                                outsideInteractions.push_back(property);
778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796
                            }
                        }
                    }

                    // Manage outofblock interaction
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
797 798
                        const MortonIndex blockStartIdxOther = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;
799

BRAMAS Berenger's avatar
BRAMAS Berenger committed
800
                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
801 802 803 804
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
805
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
806 807 808 809 810 811
                            lastOutInteraction += 1;
                        }

                        // Create interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
812
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
813 814
                                #pragma omp critical(CreateM2LRemotes)
                                {
815
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
816
                                        const size_t nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
817
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
818 819 820
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
821
                                        const size_t nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
822
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
823 824 825
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp);
826 827
                                    }
                                }
828 829
                            }

830 831
                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
832
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
        // P2P
        // We create one big vector per block
        {
            const MortonIndex myFirstIndex = tree->getParticleGroup(0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getParticleGroup(tree->getNbParticleGroup()-1)->getEndingIndex();

851
            externalInteractionsLeafLevelMpi.clear();
852 853 854 855 856 857 858 859 860 861 862
            externalInteractionsLeafLevelMpi.resize(tree->getNbParticleGroup());
            for(int idxGroup = 0 ; idxGroup < tree->getNbParticleGroup() ; ++idxGroup){
                // Create the vector
                ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);

                std::vector<BlockInteractions<ParticleGroupClass>>* externalInteractions = &externalInteractionsLeafLevelMpi[idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, containers, externalInteractions)
                { // Can be a task(inout:iterCells)
                    std::vector<OutOfBlockInteraction> outsideInteractions;

863
                    for(int idxLeaf = 0 ; idxLeaf < containers->getNumberOfLeavesInBlock() ; ++idxLeaf){
864
                        // ParticleContainerClass particles = containers->template getLeaf<ParticleContainerClass>(mindex);
865
                        const MortonIndex mindex = containers->getLeafMortonIndex(idxLeaf);
866
                        if(containers->exists(mindex)){
867 868 869 870 871 872 873 874 875 876 877
                            MortonIndex interactionsIndexes[26];
                            int interactionsPosition[26];
                            FTreeCoordinate coord(mindex, tree->getHeight()-1);
                            int counter = coord.getNeighborsIndexes(tree->getHeight(),interactionsIndexes,interactionsPosition);

                            for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                                if(interactionsIndexes[idxInter] < myFirstIndex ||
                                        myLastIndex <= interactionsIndexes[idxInter]){
                                    OutOfBlockInteraction property;
                                    property.insideIndex = mindex;
                                    property.outIndex    = interactionsIndexes[idxInter];
BRAMAS Berenger's avatar
BRAMAS Berenger committed
878
                                    property.relativeOutPosition = interactionsPosition[idxInter];
879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899
                                    outsideInteractions.push_back(property);
                                }
                            }
                        }
                    }

                    // Sort to match external order
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[tree->getHeight()-1]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(tree->getHeight()-1);
                            if(idxOtherGroup == int(processesBlockInfos[tree->getHeight()-1].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size()));
                        }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
900 901
                        const MortonIndex blockStartIdxOther = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].lastIndex;
902

BRAMAS Berenger's avatar
BRAMAS Berenger committed
903
                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
904 905 906 907
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
908
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
909 910 911 912 913
                            lastOutInteraction += 1;
                        }

                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
914
                            if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
915 916
                                #pragma omp critical(CreateM2LRemotes)
                                {
917
                                    if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
918
                                        const size_t nbBytesInBlock = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].leavesBufferSize;
919
                                        unsigned char* memoryBlock = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlock);
920 921 922
                                        remoteParticleGroupss[idxOtherGroup].ptrSymb = memoryBlock;
                                        starpu_variable_data_register(&remoteParticleGroupss[idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteParticleGroupss[idxOtherGroup].ptrSymb, nbBytesInBlock);
923 924
                                    }
                                }
925 926
                            }

927 928
                            externalInteractions->emplace_back();
                            BlockInteractions<ParticleGroupClass>* interactions = &externalInteractions->back();
929
                            //interactions->otherBlock = remoteParticleGroupss[idxOtherGroup].ptr;
930 931 932 933 934 935 936 937 938 939 940 941 942
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
    }
943

944 945 946 947 948 949 950 951 952 953 954
    // One "block must travel from src to dest" record. Exchanged between ranks
    // as raw bytes (MPI_Gatherv over sizeof(MpiDependency)), so the layout
    // must be identical on every process.
    struct MpiDependency{
        int src;            // rank that owns the block
        int dest;           // rank that mirrors (receives) the block
        int level;          // tree level; value == tree height marks a particle (P2P) block
        int globalBlockId;  // index of the block in the global per-level numbering
    };

    // Filled by postRecvAllocatedBlocks(): the blocks this rank must send out.
    std::vector<MpiDependency> toSend;

    void postRecvAllocatedBlocks(){
        std::vector<MpiDependency> toRecv;
955
        FAssertLF(tree->getHeight() == int(remoteCellGroups.size()));
956 957
        const bool directOnly = (tree->getHeight() <= 2);
        if(!directOnly){
958
            for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel){
959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978
                for(int idxHandle = 0 ; idxHandle < int(remoteCellGroups[idxLevel].size()) ; ++idxHandle){
                    if(remoteCellGroups[idxLevel][idxHandle].ptrSymb){
                        FAssertLF(remoteCellGroups[idxLevel][idxHandle].ptrUp);
                        FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                             " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0) << "\n");
                        FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                             " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1) << "\n");

                        starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleSymb,
                                                    processesBlockInfos[idxLevel][idxHandle].owner,
                                                    getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0),
                                                    comm.getComm(), 0, 0 );
                        starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleUp,
                                                    processesBlockInfos[idxLevel][idxHandle].owner,
                                                    getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1),
                                                    comm.getComm(), 0, 0 );

                        toRecv.push_back({processesBlockInfos[idxLevel][idxHandle].owner,
                                            comm.processId(), idxLevel, idxHandle});
                    }
979 980 981 982 983
                }
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(remoteParticleGroupss.size()) ; ++idxHandle){
984
                if(remoteParticleGroupss[idxHandle].ptrSymb){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
985
                    FLOG(FLog::Controller << "[SMpi] Post a recv during P2P for Idx " << processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex <<
986
                         " and dest is " << processesBlockInfos[tree->getHeight()-1][idxHandle].owner << " tag " << getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0) << "\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
987

988
                    starpu_mpi_irecv_detached( remoteParticleGroupss[idxHandle].handleSymb,
989
                                                processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
990
                                                getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0),
991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
                                                comm.getComm(), 0, 0 );

                    toRecv.push_back({processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                        comm.processId(), tree->getHeight(), idxHandle});
                }
            }
        }

        FQuickSort<MpiDependency, int>::QsSequential(toRecv.data(),int(toRecv.size()),[](const MpiDependency& d1, const MpiDependency& d2){
            return d1.src <= d2.src;
        });

        std::unique_ptr<int[]> nbBlocksToRecvFromEach(new int[comm.processCount()]);
        memset(nbBlocksToRecvFromEach.get(), 0, sizeof(int)*comm.processCount());
        for(int idxDep = 0 ; idxDep < int(toRecv.size()) ; ++idxDep){
            nbBlocksToRecvFromEach[toRecv[idxDep].src] += 1;
        }

        FAssertLF(nbBlocksToRecvFromEach[comm.processId()] == 0);
        int offset = 0;

        for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
            if(idxProc == comm.processId()){
                // How much to send to each
                std::unique_ptr<int[]> nbBlocksToSendToEach(new int[comm.processCount()]);
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, nbBlocksToSendToEach.get(), 1,
                                 MPI_INT, idxProc, comm.getComm() ), __LINE__);

                std::unique_ptr<int[]> displs(new int[comm.processCount()]);
                displs[0] = 0;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1022 1023
                for(int idxProcOther = 1 ; idxProcOther < comm.processCount() ; ++idxProcOther){
                    displs[idxProcOther] = displs[idxProcOther-1] + nbBlocksToSendToEach[idxProcOther-1];
1024
                }
1025
                toSend.resize(displs[comm.processCount()-1] + nbBlocksToSendToEach[comm.processCount()-1]);
1026

1027
                // We work in bytes
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1028 1029 1030
                for(int idxProcOther = 0 ; idxProcOther < comm.processCount() ; ++idxProcOther){
                    nbBlocksToSendToEach[idxProcOther] *= int(sizeof(MpiDependency));
                    displs[idxProcOther] *= int(sizeof(MpiDependency));
1031
                }
1032

1033 1034
                FMpi::Assert(MPI_Gatherv( nullptr, 0, MPI_BYTE,
                                 toSend.data(),
1035 1036 1037 1038 1039 1040 1041
                                 nbBlocksToSendToEach.get(), displs.get(),
                                 MPI_BYTE, idxProc, comm.getComm()), __LINE__);
            }
            else{
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, 0, 0, MPI_INT, idxProc, comm.getComm() ), __LINE__);
                FMpi::Assert(MPI_Gatherv(
1042
                                 &toRecv[offset], int(nbBlocksToRecvFromEach[idxProc]*sizeof(MpiDependency)), MPI_BYTE,
1043
                                 0, 0, 0, MPI_BYTE, idxProc, comm.getComm() ), __LINE__);
1044

1045 1046 1047 1048 1049 1050 1051 1052 1053
                offset += nbBlocksToRecvFromEach[idxProc];
            }
        }
    }

    void insertParticlesSend(){
        for(int idxSd = 0 ; idxSd < int(toSend.size()) ; ++idxSd){
            const MpiDependency sd = toSend[idxSd];
            if(sd.level == tree->getHeight()){