// Keep in private GIT
// @SCALFMM_PRIVATE
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"
#include "../../Utils/FAssert.hpp"

#include "../../Utils/FMpi.hpp"
17

18 19
#include "FOutOfBlockInteraction.hpp"

20 21 22 23 24 25
#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
26
#include <starpu_mpi.h>
27
#include "../StarPUUtils/FStarPUUtils.hpp"
28
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"
29

#ifdef STARPU_USE_CPU
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
#include "../Cuda/FCudaEmptyCellSymb.hpp"
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
#include "../OpenCl/FEmptyOpenCLCode.hpp"
#endif


template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                                        FCudaGroupOfParticles<int, 0, 0, int>, FCudaGroupAttachedLeaf<int, 0, 0, int>, FCudaEmptyKernel<int> >
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
#endif
          >
class FGroupTaskStarPUMpiAlgorithm : public FAbstractAlgorithm {
protected:
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        , StarPUCudaWrapperClass
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , StarPUOpenClWrapperClass
#endif
    > ThisClass;

    // Builds a per-block MPI tag from the level, the block's starting Morton
    // index and the communication mode (e.g. 0 for the symbolic part, 1 for
    // the multipole part); the mode uses the low 5 bits, the level the bits
    // just above it.
    int getTag(const int inLevel, const MortonIndex mindex, const int mode) const{
        // Number of bits needed to store a level index (bit length of the height)
        int shift = 0;
        int height = tree->getHeight();
        while(height) { shift += 1; height >>= 1; }
        return int((((mindex<<shift) + inLevel) << 5) + mode);
    }
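    // Worked example (added, not from the original source): with a tree of
    // height 5 the loop above gives shift = 3 (bit length of 5), so
    // getTag(2, 6, 1) returns (((6 << 3) + 2) << 5) + 1 = 1601. Only 5 bits
    // are reserved for the mode; the packed value is assumed to fit in an int.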

    const FMpi::FComm& comm;

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };

    struct CellHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t up;
        starpu_data_handle_t down;
        int intervalSize;
    };

    struct ParticleHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t down;
        int intervalSize;
    };
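
    // Note (added): each block is split into separate StarPU handles
    // (symbolic data, multipole part "up", local part "down") so that a task
    // depends only on the pieces it really accesses; e.g. the M2M tasks touch
    // symb/up but never the local part.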

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The Tree
    KernelClass*const originalCpuKernel;

    std::vector<CellHandles>* cellHandles;
    std::vector<ParticleHandles> particleHandles;

    starpu_codelet p2m_cl;
    starpu_codelet m2m_cl[9];
    starpu_codelet l2l_cl[9];
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;
    starpu_codelet m2l_cl_inout;
    starpu_codelet m2l_cl_inout_mpi;

    starpu_codelet p2p_cl_in;
    starpu_codelet p2p_cl_inout;
    starpu_codelet p2p_cl_inout_mpi;

#ifdef STARPU_USE_CPU
    StarPUCpuWrapperClass cpuWrapper;
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    StarPUCudaWrapperClass cudaWrapper;
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    StarPUOpenClWrapperClass openclWrapper;
#endif

    FStarPUPtrInterface wrappers;
    FStarPUPtrInterface* wrapperptr;

#ifdef STARPU_SUPPORT_ARBITER
    starpu_arbiter_t arbiterGlobal;
#endif
public:
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
            cellHandles(nullptr),
#ifdef STARPU_USE_CPU
            cpuWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            cudaWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            openclWrapper(tree->getHeight()),
#endif
            wrapperptr(&wrappers){
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
        FStarPUFmmPriorities::Controller().init(&conf, tree->getHeight(), inKernels);
        FAssertLF(starpu_init(&conf) == 0);
        FAssertLF(starpu_mpi_init( 0, 0, 0 ) == 0);

        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&initMutex);

        starpu_pause();

        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];

#ifdef STARPU_SUPPORT_ARBITER
        arbiterGlobal = starpu_arbiter_create();
#endif

        initCodelet();
        initCodeletMpi();

        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
#ifdef STARPU_USE_CPU
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif
    }

    ~FGroupTaskStarPUMpiAlgorithm(){
        starpu_resume();

        cleanHandle();
        cleanHandleMpi();
        delete[] cellHandles;

        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);

#ifdef STARPU_SUPPORT_ARBITER
        starpu_arbiter_destroy(arbiterGlobal);
#endif
        starpu_mpi_shutdown();
        starpu_shutdown();
    }

protected:
    /**
      * Runs the complete algorithm.
      */
    void executeCore(const unsigned operationsToProceed) override {
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );
        const bool directOnly = (tree->getHeight() <= 2);
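        // Note (added): with a height of at most 2 there are no internal cell
        // levels to process, so all the far-field passes below are guarded by
        // !directOnly and only the P2P passes run.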

        #pragma omp parallel
        #pragma omp single
        buildExternalInteractionVecs();
        buildHandles();

        #pragma omp parallel
        #pragma omp single
        buildRemoteInteractionsAndHandles();

        starpu_resume();
        postRecvAllocatedBlocks();

        if( operationsToProceed & FFmmP2P ) insertParticlesSend();

        if(operationsToProceed & FFmmP2M && !directOnly) bottomPass();

        if(operationsToProceed & FFmmM2M && !directOnly) upwardPass();
        if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend();

        if(operationsToProceed & FFmmM2L && !directOnly) transferPass();
        if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi();

        if(operationsToProceed & FFmmL2L && !directOnly) downardPass();

        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();

        if( operationsToProceed & FFmmL2P  && !directOnly) mergePass();

        starpu_task_wait_for_all();
        starpu_pause();
    }


    void initCodelet(){
        memset(&p2m_cl, 0, sizeof(p2m_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
#endif
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
        p2m_cl.name = "p2m_cl";

        memset(m2m_cl, 0, sizeof(m2m_cl[0])*9);
        memset(l2l_cl, 0, sizeof(l2l_cl[0])*9);
        for(int idx = 0 ; idx < 9 ; ++idx){
#ifdef STARPU_USE_CPU
            if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
                m2m_cl[idx].cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_CPU;
            }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
                m2m_cl[idx].cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_CUDA;
            }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
                m2m_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_OPENCL;
            }
#endif
            m2m_cl[idx].nbuffers = (idx+2)*2;
            m2m_cl[idx].dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl[idx].nbuffers*sizeof(starpu_data_access_mode));
            m2m_cl[idx].dyn_modes[0] = STARPU_R;
            m2m_cl[idx].dyn_modes[1] = STARPU_RW;
            m2m_cl[idx].name = "m2m_cl";

#ifdef STARPU_USE_CPU
            if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
                l2l_cl[idx].cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_CPU;
            }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
                l2l_cl[idx].cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_CUDA;
            }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
                l2l_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_OPENCL;
            }
#endif
            l2l_cl[idx].nbuffers = (idx+2)*2;
            l2l_cl[idx].dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl[idx].nbuffers*sizeof(starpu_data_access_mode));
            l2l_cl[idx].dyn_modes[0] = STARPU_R;
            l2l_cl[idx].dyn_modes[1] = STARPU_R;
            l2l_cl[idx].name = "l2l_cl";

            for(int idxBuffer = 0 ; idxBuffer <= idx ; ++idxBuffer){
                m2m_cl[idx].dyn_modes[(idxBuffer*2)+2] = STARPU_R;
                m2m_cl[idx].dyn_modes[(idxBuffer*2)+3] = STARPU_R;
                l2l_cl[idx].dyn_modes[(idxBuffer*2)+2] = STARPU_R;
                l2l_cl[idx].dyn_modes[(idxBuffer*2)+3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
            }
        }
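        // Reading note (added): variant idx handles idx+1 child blocks.
        // Buffers 0/1 are the parent block (symbolic part, then multipole for
        // M2M or local for L2L) and every child block contributes one more
        // (symbolic, multipole/local) pair, hence nbuffers = (idx+2)*2.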

        memset(&l2p_cl, 0, sizeof(l2p_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
#endif
        l2p_cl.nbuffers = 4;
        l2p_cl.modes[0] = STARPU_R;
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        l2p_cl.name = "l2p_cl";

        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_in.name = "p2p_cl_in";

        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout.modes[2] = STARPU_R;
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout.name = "p2p_cl_inout";

        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
        m2l_cl_in.modes[1] = STARPU_R;
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_in.name = "m2l_cl_in";

        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_inout.nbuffers = 6;
        m2l_cl_inout.modes[0] = STARPU_R;
        m2l_cl_inout.modes[1] = STARPU_R;
        m2l_cl_inout.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.modes[3] = STARPU_R;
        m2l_cl_inout.modes[4] = STARPU_R;
        m2l_cl_inout.modes[5] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.name = "m2l_cl_inout";
    }

    /** Unregister, through StarPU, all the handles registered so far. */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
            }
            cellHandles[idxLevel].clear();
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
            }
            particleHandles.clear();
        }
    }

    ////////////////////////////////////////////////////////////////////////////

    void initCodeletMpi(){
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
#endif
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";

        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
    }
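
    // Note (added): compared with the non-MPI inout codelets, these variants
    // access the remote block buffers with STARPU_R only: data received from
    // another process is read, never written back.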


    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    struct BlockDescriptor{
        MortonIndex firstIndex;
        MortonIndex lastIndex;
        int owner;
        int bufferSize;
        size_t bufferSizeSymb;
        size_t bufferSizeUp;
        size_t bufferSizeDown;
        size_t leavesBufferSize;
    };
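    // Note (added): BlockDescriptor is exchanged as raw MPI_BYTE buffers in
    // buildRemoteInteractionsAndHandles(), so it must stay trivially copyable
    // with the same layout on every process.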
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    std::vector<int> nbBlocksPerLevelAll;
    std::vector<int> nbBlocksBeforeMinPerLevel;

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;

    struct RemoteHandle{
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
        }

        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;

        int intervalSize;
    };

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;

    void buildRemoteInteractionsAndHandles(){
        cleanHandleMpi();

        // We need information about all the other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
        nbBlocksPerLevel[0] = 0;
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
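        // Layout note (added): nbBlocksPerLevelAll is indexed as
        // [idxProc*height + idxLevel]; each process contributed a contiguous
        // run of `height` counters to the allgather above.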
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Counts and displacements for the MPI allgatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all the others
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
        // Prepare the remote pointers and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

        // From now on we know the number of blocks of every process,
        // and also the size of the blocks, therefore we can
        // create the handles we need.
        // We now detect the relations between our blocks and the others,
        // during the M2M (which is the same for the L2L),
        // during the M2L and during the P2P:
        // we need to insert the tasks that read our data or that write the data we need.
        // M2L
        externalInteractionsAllLevelMpi.clear();
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level upward there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What is my Morton interval at this level
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;

                    for(int idxCell = 0 ; idxCell < currentCells->getNumberOfCellsInBlock() ; ++idxCell){
                        const MortonIndex mindex = currentCells->getCellMortonIndex(idxCell);

                        MortonIndex interactionsIndexes[189];
                        int interactionsPosition[189];
                        const FTreeCoordinate coord(mindex, idxLevel);
                        int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                            // This interaction needs a block owned by someone else
                            if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                OutOfBlockInteraction property;
                                property.insideIndex = mindex;
                                property.outIndex    = interactionsIndexes[idxInter];
                                property.outPosition = interactionsPosition[idxInter];
                                property.insideIdxInBlock = idxCell;
                                outsideInteractions.push_back(property);
                            }
                        }
                    }
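
                    // Note (added): the 189 bound above is the classic 3D FMM
                    // interaction-list size, 6^3 - 3^3: the same-level cells
                    // covered by the parent's neighborhood minus the 27 near
                    // neighbors.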

                    // Manage out-of-block interactions
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

                        const MortonIndex blockStartIdxOther = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;

                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
                            lastOutInteraction += 1;
                        }

                        // Create interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
                                #pragma omp critical(CreateM2LRemotes)
                                {
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
                                        const size_t nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb);
                                        const size_t nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp);
                                    }
                                }
                            }

                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
        // P2P
        // We create one big vector per block
        {
            const MortonIndex myFirstIndex = tree->getParticleGroup(0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getParticleGroup(tree->getNbParticleGroup()-1)->getEndingIndex();

            externalInteractionsLeafLevelMpi.clear();
            externalInteractionsLeafLevelMpi.resize(tree->getNbParticleGroup());
            for(int idxGroup = 0 ; idxGroup < tree->getNbParticleGroup() ; ++idxGroup){
                // Create the vector
                ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);

                std::vector<BlockInteractions<ParticleGroupClass>>* externalInteractions = &externalInteractionsLeafLevelMpi[idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, containers, externalInteractions)
                { // Can be a task(inout:iterCells)
                    std::vector<OutOfBlockInteraction> outsideInteractions;

                    for(int idxLeaf = 0 ; idxLeaf < containers->getNumberOfLeavesInBlock() ; ++idxLeaf){
                        // ParticleContainerClass particles = containers->template getLeaf<ParticleContainerClass>(mindex);
                        const MortonIndex mindex = containers->getLeafMortonIndex(idxLeaf);
                        if(containers->exists(mindex)){
                            MortonIndex interactionsIndexes[26];
                            int interactionsPosition[26];
                            FTreeCoordinate coord(mindex, tree->getHeight()-1);
                            int counter = coord.getNeighborsIndexes(tree->getHeight(),interactionsIndexes,interactionsPosition);

                            for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                                if(interactionsIndexes[idxInter] < myFirstIndex ||
                                        myLastIndex <= interactionsIndexes[idxInter]){
                                    OutOfBlockInteraction property;
                                    property.insideIndex = mindex;
                                    property.outIndex    = interactionsIndexes[idxInter];
                                    property.outPosition = interactionsPosition[idxInter];
                                    outsideInteractions.push_back(property);
                                }
                            }
                        }
                    }
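
                    // Note (added): the 26 bound above is the number of
                    // same-level neighbors of a leaf in 3D, 3^3 - 1.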

                    // Sort to match external order
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[tree->getHeight()-1]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(tree->getHeight()-1);
                            if(idxOtherGroup == int(processesBlockInfos[tree->getHeight()-1].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size()));
                        }

                        const MortonIndex blockStartIdxOther = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].lastIndex;

                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
                            lastOutInteraction += 1;
                        }

                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
                            if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
                                #pragma omp critical(CreateM2LRemotes)
                                {
                                    if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
                                        const size_t nbBytesInBlock = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].leavesBufferSize;
                                        unsigned char* memoryBlock = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlock);
                                        remoteParticleGroupss[idxOtherGroup].ptrSymb = memoryBlock;
                                        starpu_variable_data_register(&remoteParticleGroupss[idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteParticleGroupss[idxOtherGroup].ptrSymb, nbBytesInBlock);
                                    }
                                }
                            }

                            externalInteractions->emplace_back();
                            BlockInteractions<ParticleGroupClass>* interactions = &externalInteractions->back();
                            //interactions->otherBlock = remoteParticleGroupss[idxOtherGroup].ptr;
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
    }

    struct MpiDependency{
        int src;
        int dest;
        int level;
        int globalBlockId;
    };

    std::vector<MpiDependency> toSend;
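    // Note (added): toSend lists the blocks owned by this process for which
    // other processes have posted receives; it is filled by the gather
    // exchange in postRecvAllocatedBlocks() and later drives the send
    // insertions (e.g. insertParticlesSend()).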

    void postRecvAllocatedBlocks(){
        std::vector<MpiDependency> toRecv;
        FAssertLF(tree->getHeight() == int(remoteCellGroups.size()));
        const bool directOnly = (tree->getHeight() <= 2);
        if(!directOnly){
            for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel){
                for(int idxHandle = 0 ; idxHandle < int(remoteCellGroups[idxLevel].size()) ; ++idxHandle){
                    if(remoteCellGroups[idxLevel][idxHandle].ptrSymb){
                        FAssertLF(remoteCellGroups[idxLevel][idxHandle].ptrUp);
                        FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                             " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0) << "\n");
                        FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                             " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1) << "\n");

                        starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleSymb,
                                                    processesBlockInfos[idxLevel][idxHandle].owner,
                                                    getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0),
                                                    comm.getComm(), 0, 0 );
                        starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleUp,
                                                    processesBlockInfos[idxLevel][idxHandle].owner,
                                                    getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1),
                                                    comm.getComm(), 0, 0 );

                        toRecv.push_back({processesBlockInfos[idxLevel][idxHandle].owner,
                                            comm.processId(), idxLevel, idxHandle});
                    }
                }
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(remoteParticleGroupss.size()) ; ++idxHandle){
                if(remoteParticleGroupss[idxHandle].ptrSymb){
                    FLOG(FLog::Controller << "[SMpi] Post a recv during P2P for Idx " << processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex <<
                         " and dest is " << processesBlockInfos[tree->getHeight()-1][idxHandle].owner << " tag " << getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0) << "\n");

                    starpu_mpi_irecv_detached( remoteParticleGroupss[idxHandle].handleSymb,
                                                processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                                getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0),
                                                comm.getComm(), 0, 0 );

                    toRecv.push_back({processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                        comm.processId(), tree->getHeight(), idxHandle});
                }
            }
        }

        FQuickSort<MpiDependency, int>::QsSequential(toRecv.data(),int(toRecv.size()),[](const MpiDependency& d1, const MpiDependency& d2){
            return d1.src <= d2.src;
        });
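
        // Note (added): the exchange below is a reverse notification. Each
        // process gathers, per owner, the list of blocks it will receive, so
        // that every owner ends up with a toSend list matching the receives
        // already posted; sorting toRecv by source rank keeps each
        // destination's chunk contiguous for MPI_Gatherv.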

        std::unique_ptr<int[]> nbBlocksToRecvFromEach(new int[comm.processCount()]);
        memset(nbBlocksToRecvFromEach.get(), 0, sizeof(int)*comm.processCount());
        for(int idxDep = 0 ; idxDep < int(toRecv.size()) ; ++idxDep){
            nbBlocksToRecvFromEach[toRecv[idxDep].src] += 1;
        }

        FAssertLF(nbBlocksToRecvFromEach[comm.processId()] == 0);
        int offset = 0;

        for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
            if(idxProc == comm.processId()){
                // How much to send to each
                std::unique_ptr<int[]> nbBlocksToSendToEach(new int[comm.processCount()]);
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, nbBlocksToSendToEach.get(), 1,
                                 MPI_INT, idxProc, comm.getComm() ), __LINE__);

                std::unique_ptr<int[]> displs(new int[comm.processCount()]);
                displs[0] = 0;
                for(int idxProcOther = 1 ; idxProcOther < comm.processCount() ; ++idxProcOther){
                    displs[idxProcOther] = displs[idxProcOther-1] + nbBlocksToSendToEach[idxProcOther-1];
                }
                toSend.resize(displs[comm.processCount()-1] + nbBlocksToSendToEach[comm.processCount()-1]);

                // We work in bytes
                for(int idxProcOther = 0 ; idxProcOther < comm.processCount() ; ++idxProcOther){
                    nbBlocksToSendToEach[idxProcOther] *= int(sizeof(MpiDependency));
                    displs[idxProcOther] *= int(sizeof(MpiDependency));
                }

                FMpi::Assert(MPI_Gatherv( nullptr, 0, MPI_BYTE,
                                 toSend.data(),
                                 nbBlocksToSendToEach.get(), displs.get(),
                                 MPI_BYTE, idxProc, comm.getComm()), __LINE__);
            }
            else{
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, 0, 0, MPI_INT, idxProc, comm.getComm() ), __LINE__);
                FMpi::Assert(MPI_Gatherv(
                                 &toRecv[offset], int(nbBlocksToRecvFromEach[idxProc]*sizeof(MpiDependency)), MPI_BYTE,
                                 0, 0, 0, MPI_BYTE, idxProc, comm.getComm() ), __LINE__);

                offset += nbBlocksToRecvFromEach[idxProc];
            }
        }
    }

    void insertParticlesSend(){
        for(int idxSd = 0 ; idxSd < int(toSend.size()) ; ++idxSd){
            const MpiDependency sd = toSend[idxSd];
            if(sd.level == tree->getHeight()){
                const int localId = sd.globalBlockId - nbBlocksBeforeMinPerLevel[tree->getHeight()-1];
                FAssertLF(sd.src == comm.processId());
                FAssertLF(0 <= localId);
                FAssertLF(localId < tree->getNbParticleGroup());

                FLOG(FLog::Controller << "[SMpi] Post a send during P2P for Idx " << tree->getParticleGroup(localId)->getStartingIndex() <<
                     " and dest is " << sd.dest << " tag " << getTag(tree->getHeight(),tree->