// Keep in private GIT
// @SCALFMM_PRIVATE
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"

#include "../../Utils/FMpi.hpp"

#include "FOutOfBlockInteraction.hpp"

#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
#include <starpu_mpi.h>
#include "../StarPUUtils/FStarPUUtils.hpp"
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"

#ifdef STARPU_USE_CPU
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
#include "../Cuda/FCudaEmptyCellSymb.hpp"
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
#include "../OpenCl/FEmptyOpenCLCode.hpp"
#endif

template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                                        FCudaGroupOfParticles<0, 0, int>, FCudaGroupAttachedLeaf<0, 0, int>, FCudaEmptyKernel>
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
#endif
          >
class FGroupTaskStarPUMpiAlgorithm {
protected:
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        , StarPUCudaWrapperClass
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    , StarPUOpenClWrapperClass
#endif
    > ThisClass;
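    // getTag() below builds the MPI tag of a block transfer: the low 5 bits
    // carry the message kind (0 = symbolic part, 1 = multipoles, as used by
    // the send/recv calls further down), the next bits carry the level, and
    // the remaining high bits carry the first Morton index of the block.
    // Worked example for a tree of height 5 (3 bits for the level): level 2,
    // Morton index 6, mode 1 gives (((6 << 3) + 2) << 5) + 1 = 1601.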

    int getTag(const int inLevel, const MortonIndex mindex, const int mode) const{
        int shift = 0;
        int height = tree->getHeight();
        while(height) { shift += 1; height >>= 1; }
        return (((mindex<<shift) + inLevel) << 5) + mode;
    }

    const FMpi::FComm& comm;

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };
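    // Each block of the tree is exposed to StarPU through separate handles:
    // the symbolic part (structure, read in every kernel), the multipoles
    // ("up") and the local expansions ("down"). The split keeps the data
    // dependencies of the different FMM passes as narrow as possible.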

    struct CellHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t up;
        starpu_data_handle_t down;
        int intervalSize;
    };

    struct ParticleHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t down;
        int intervalSize;
    };

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The Tree
    KernelClass*const originalCpuKernel;

    std::vector<CellHandles>* cellHandles;
    std::vector<ParticleHandles> particleHandles;

    starpu_codelet p2m_cl;
    starpu_codelet m2m_cl[9];
    starpu_codelet l2l_cl[9];
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;
    starpu_codelet m2l_cl_inout;
    starpu_codelet m2l_cl_inout_mpi;

    starpu_codelet p2p_cl_in;
    starpu_codelet p2p_cl_inout;
    starpu_codelet p2p_cl_inout_mpi;

#ifdef STARPU_USE_CPU
    StarPUCpuWrapperClass cpuWrapper;
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    StarPUCudaWrapperClass cudaWrapper;
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    StarPUOpenClWrapperClass openclWrapper;
#endif

    FStarPUPtrInterface wrappers;
    FStarPUPtrInterface* wrapperptr;

public:
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
          cellHandles(nullptr),
#ifdef STARPU_USE_CPU
            cpuWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            cudaWrapper(tree->getHeight()),
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            openclWrapper(tree->getHeight()),
#endif
            wrapperptr(&wrappers){
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
        FStarPUFmmPriorities::Controller().init(&conf, tree->getHeight(), inKernels);
        FAssertLF(starpu_init(&conf) == 0);
        FAssertLF(starpu_mpi_init ( 0, 0, 0 ) == 0);

        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&initMutex);
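        // The workers stay paused outside of execute(): tasks are only
        // submitted between the starpu_resume()/starpu_pause() pair there,
        // and the destructor resumes them one last time before shutdown.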

        starpu_pause();

        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];

        initCodelet();
        initCodeletMpi();

        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
#ifdef STARPU_USE_CPU
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif
    }

    ~FGroupTaskStarPUMpiAlgorithm(){
        starpu_resume();

        cleanHandle();
        cleanHandleMpi();
        delete[] cellHandles;

        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);

        starpu_mpi_shutdown();
        starpu_shutdown();
    }

    void execute(const unsigned operationsToProceed = FFmmNearAndFarFields){
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );

        #pragma omp parallel
        #pragma omp single
        buildExternalInteractionVecs();
        buildHandles();

        #pragma omp parallel
        #pragma omp single
        buildRemoteInteractionsAndHandles();

        starpu_resume();
        postRecvAllocatedBlocks();

        if( operationsToProceed & FFmmP2P ) insertParticlesSend();

        if(operationsToProceed & FFmmP2M) bottomPass();

        if(operationsToProceed & FFmmM2M) upwardPass();
        if(operationsToProceed & FFmmM2L) insertCellsSend();

        if(operationsToProceed & FFmmM2L) transferPass();
        if(operationsToProceed & FFmmM2L) transferPassMpi();

        if(operationsToProceed & FFmmL2L) downardPass();

        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();

        if( operationsToProceed & FFmmL2P ) mergePass();

        starpu_task_wait_for_all();
        starpu_pause();
    }
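    // A minimal calling sketch (illustrative only: the concrete octree,
    // kernel and wrapper types are assumptions that depend on the chosen
    // FMM kernel):
    //   FMpi mpiComm(argc, argv);
    //   OctreeClass groupedTree(...);  // group tree already filled
    //   KernelClass kernels(...);
    //   FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass,
    //       KernelClass, ParticleGroupClass, StarPUCpuWrapperClass>
    //       algo(mpiComm.global(), &groupedTree, &kernels);
    //   algo.execute();  // runs the passes listed above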

protected:
    void initCodelet(){
        memset(&p2m_cl, 0, sizeof(p2m_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
#endif
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
        p2m_cl.name = "p2m_cl";
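        // The three p2m buffers correspond, by construction of the handles
        // (see buildHandles() below), to: the leaf cell block symbolic part
        // (R), its multipoles (RW), and the matching particle block symbolic
        // part (R).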

        memset(m2m_cl, 0, sizeof(m2m_cl[0])*9);
        memset(l2l_cl, 0, sizeof(l2l_cl[0])*9);
        for(int idx = 0 ; idx < 9 ; ++idx){
#ifdef STARPU_USE_CPU
            if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
                m2m_cl[idx].cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_CPU;
            }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
                m2m_cl[idx].cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_CUDA;
            }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
                m2m_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
                m2m_cl[idx].where |= STARPU_OPENCL;
            }
#endif
            m2m_cl[idx].nbuffers = (idx+2)*2;
            m2m_cl[idx].dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl[idx].nbuffers*sizeof(starpu_data_access_mode));
            m2m_cl[idx].dyn_modes[0] = STARPU_R;
            m2m_cl[idx].dyn_modes[1] = STARPU_RW;
            m2m_cl[idx].name = "m2m_cl";

#ifdef STARPU_USE_CPU
            if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
                l2l_cl[idx].cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_CPU;
            }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
            if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
                l2l_cl[idx].cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_CUDA;
            }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
            if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
                l2l_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
                l2l_cl[idx].where |= STARPU_OPENCL;
            }
#endif
            l2l_cl[idx].nbuffers = (idx+2)*2;
            l2l_cl[idx].dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl[idx].nbuffers*sizeof(starpu_data_access_mode));
            l2l_cl[idx].dyn_modes[0] = STARPU_R;
            l2l_cl[idx].dyn_modes[1] = STARPU_R;
            l2l_cl[idx].name = "l2l_cl";

            for(int idxBuffer = 0 ; idxBuffer <= idx ; ++idxBuffer){
                m2m_cl[idx].dyn_modes[(idxBuffer*2)+2] = STARPU_R;
                m2m_cl[idx].dyn_modes[(idxBuffer*2)+3] = STARPU_R;
                l2l_cl[idx].dyn_modes[(idxBuffer*2)+2] = STARPU_R;
                l2l_cl[idx].dyn_modes[(idxBuffer*2)+3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
            }
            }
        }
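        // Note on the loop above: m2m_cl[idx] and l2l_cl[idx] work on one
        // parent group plus (idx+1) child groups, hence nbuffers = (idx+2)*2.
        // The buffers come in (symbolic, multipole-or-local) pairs and are
        // described through dyn_modes because the buffer count varies with idx.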

        memset(&l2p_cl, 0, sizeof(l2p_cl));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
#endif
        l2p_cl.nbuffers = 4;
        l2p_cl.modes[0] = STARPU_R;
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        l2p_cl.name = "l2p_cl";

        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        p2p_cl_in.name = "p2p_cl_in";

        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        p2p_cl_inout.modes[2] = STARPU_R;
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        p2p_cl_inout.name = "p2p_cl_inout";

        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
        m2l_cl_in.modes[1] = STARPU_R;
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        m2l_cl_in.name = "m2l_cl_in";

        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
#endif
        m2l_cl_inout.nbuffers = 6;
        m2l_cl_inout.modes[0] = STARPU_R;
        m2l_cl_inout.modes[1] = STARPU_R;
        m2l_cl_inout.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        m2l_cl_inout.modes[3] = STARPU_R;
        m2l_cl_inout.modes[4] = STARPU_R;
        m2l_cl_inout.modes[5] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        m2l_cl_inout.name = "m2l_cl_inout";
    }
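    // For reference, a sketch of how a codelet defined above is typically
    // turned into a task later in this file (not an exact call site; the
    // STARPU_VALUE arguments depend on what each wrapper callback unpacks):
    //   starpu_insert_task(&p2m_cl,
    //       STARPU_VALUE, &wrapperptr, sizeof(wrapperptr),
    //       STARPU_R,  cellHandles[tree->getHeight()-1][idxGroup].symb,
    //       STARPU_RW, cellHandles[tree->getHeight()-1][idxGroup].up,
    //       STARPU_R,  particleHandles[idxGroup].symb,
    //       0);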

    /** Unregister all the currently defined handles from StarPU */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
            }
            cellHandles[idxLevel].clear();
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
            }
            particleHandles.clear();
        }
    }

    ////////////////////////////////////////////////////////////////////////////

    void initCodeletMpi(){
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
#endif
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";

        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE);
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
    }
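    // The *_mpi codelets mirror p2p_cl_inout/m2l_cl_inout but take the data
    // received from another process as extra read-only buffers: the remote
    // owner updates its own blocks, so only the local block is written here.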

    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    struct BlockDescriptor{
        MortonIndex firstIndex;
        MortonIndex lastIndex;
        int owner;
        int bufferSize;
        size_t bufferSizeSymb;
        size_t bufferSizeUp;
        size_t bufferSizeDown;
        size_t leavesBufferSize;
    };
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    std::vector<int> nbBlocksPerLevelAll;
    std::vector<int> nbBlocksBeforeMinPerLevel;

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;
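    // processesBlockInfos describes every block of every process, indexed by
    // [level][global block id]; nbBlocksBeforeMinPerLevel gives, per level,
    // the offset of our first block inside that global numbering. The *Mpi
    // interaction vectors play the same role as their local counterparts
    // above, but for interactions that cross process boundaries.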

    struct RemoteHandle{
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
        }

        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;
    };

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;

    void buildRemoteInteractionsAndHandles(){
        cleanHandleMpi();

        // We need to have information about all other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
        nbBlocksPerLevel[0] = 0;
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Count and displacement for the MPI all gatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all the other processes
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
        // Prepare the remote pointers and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

        // From now on we know the number of blocks of every process,
        // and we also know their sizes, so we can create the handles we need.
        // We now detect the relations between our blocks and the others,
        // during the M2M (which is the same for the L2L), the M2L and the P2P:
        // we have to insert the tasks that read our data or write the data we need.
        // M2L
        externalInteractionsAllLevelMpi.clear();
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What are my morton interval at this level
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;
                    const MortonIndex blockStartIdx = currentCells->getStartingIndex();
                    const MortonIndex blockEndIdx   = currentCells->getEndingIndex();

                    for(MortonIndex mindex = blockStartIdx ; mindex < blockEndIdx ; ++mindex){
                        if(currentCells->exists(mindex)){
                            const typename CellContainerClass::CompleteCellClass cell = currentCells->getCompleteCell(mindex);
                            FAssertLF(cell.getMortonIndex() == mindex);
                            MortonIndex interactionsIndexes[189];
                            int interactionsPosition[189];
                            const FTreeCoordinate coord(cell.getCoordinate());
                            int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                            for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                                // These interactions need a block owned by someone else
                                if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                    OutOfBlockInteraction property;
                                    property.insideIndex = mindex;
                                    property.outIndex    = interactionsIndexes[idxInter];
                                    property.outPosition = interactionsPosition[idxInter];
                                    outsideInteractions.push_back(property);
                                }
                            }
                        }
                    }

                    // Manage outofblock interaction
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

                        const MortonIndex blockStartIdx = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdx   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;

                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdx){
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdx){
                            lastOutInteraction += 1;
                        }

                        // Create interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
                                #pragma omp critical(CreateM2LRemotes)
                                {
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
                                        const int nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb);
                                        const int nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp);
                                    }
                                }
                            }

                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
        // P2P
        // We create one big vector per block
        {
            const MortonIndex myFirstIndex = tree->getParticleGroup(0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getParticleGroup(tree->getNbParticleGroup()-1)->getEndingIndex();

            externalInteractionsLeafLevelMpi.clear();
            externalInteractionsLeafLevelMpi.resize(tree->getNbParticleGroup());
            for(int idxGroup = 0 ; idxGroup < tree->getNbParticleGroup() ; ++idxGroup){
                // Create the vector
                ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);

                std::vector<BlockInteractions<ParticleGroupClass>>* externalInteractions = &externalInteractionsLeafLevelMpi[idxGroup];

                #pragma omp task default(none) firstprivate(idxGroup, containers, externalInteractions)
                { // Can be a task(inout:iterCells)
                    std::vector<OutOfBlockInteraction> outsideInteractions;
                    const MortonIndex blockStartIdx = containers->getStartingIndex();
                    const MortonIndex blockEndIdx   = containers->getEndingIndex();

                    for(MortonIndex mindex = blockStartIdx ; mindex < blockEndIdx ; ++mindex){
                        // ParticleContainerClass particles = containers->template getLeaf<ParticleContainerClass>(mindex);
                        if(containers->exists(mindex)){
                            MortonIndex interactionsIndexes[26];
                            int interactionsPosition[26];
                            FTreeCoordinate coord(mindex, tree->getHeight()-1);
                            int counter = coord.getNeighborsIndexes(tree->getHeight(),interactionsIndexes,interactionsPosition);

                            for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                                if(interactionsIndexes[idxInter] < myFirstIndex ||
                                        myLastIndex <= interactionsIndexes[idxInter]){
                                    OutOfBlockInteraction property;
                                    property.insideIndex = mindex;
                                    property.outIndex    = interactionsIndexes[idxInter];
                                    property.outPosition = interactionsPosition[idxInter];
                                    outsideInteractions.push_back(property);
                                }
                            }
                        }
                    }

                    // Sort to match external order
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size())
                                                && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[tree->getHeight()-1]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(tree->getHeight()-1);
                            if(idxOtherGroup == int(processesBlockInfos[tree->getHeight()-1].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[tree->getHeight()-1].size()));
                        }

                        const MortonIndex blockStartIdx = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdx   = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].lastIndex;

                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdx){
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdx){
                            lastOutInteraction += 1;
                        }

                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
                            if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
                                #pragma omp critical(CreateM2LRemotes)
                                {
                                    if(remoteParticleGroupss[idxOtherGroup].ptrSymb == nullptr){
                                        const int nbBytesInBlock = processesBlockInfos[tree->getHeight()-1][idxOtherGroup].leavesBufferSize;
                                        unsigned char* memoryBlock = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlock);
                                        remoteParticleGroupss[idxOtherGroup].ptrSymb = memoryBlock;
                                        starpu_variable_data_register(&remoteParticleGroupss[idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteParticleGroupss[idxOtherGroup].ptrSymb, nbBytesInBlock);
                                    }
                                }
                            }

                            externalInteractions->emplace_back();
                            BlockInteractions<ParticleGroupClass>* interactions = &externalInteractions->back();
                            //interactions->otherBlock = remoteParticleGroupss[idxOtherGroup].ptr;
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
    }
    struct MpiDependency{
        int src;
        int dest;
        int level;
        int globalBlockId;
    };

    std::vector<MpiDependency> toSend;
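    // An MpiDependency (above) records that the block globalBlockId of the
    // given level has to travel from src to dest; a level equal to
    // tree->getHeight() denotes a particle block. toSend is filled in
    // postRecvAllocatedBlocks() from the lists gathered from the other
    // processes and is consumed by insertParticlesSend()/insertCellsSend().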

    void postRecvAllocatedBlocks(){
        std::vector<MpiDependency> toRecv;
        FAssertLF(tree->getHeight() == int(remoteCellGroups.size()));
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(remoteCellGroups[idxLevel].size()) ; ++idxHandle){
                if(remoteCellGroups[idxLevel][idxHandle].ptrSymb){
                    FAssertLF(remoteCellGroups[idxLevel][idxHandle].ptrUp);
                    FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                         " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0) << "\n");
                    FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex <<
                         " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1) << "\n");

                    starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleSymb,
                                                processesBlockInfos[idxLevel][idxHandle].owner,
                                                getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0),
                                                comm.getComm(), 0, 0 );
                    starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleUp,
                                                processesBlockInfos[idxLevel][idxHandle].owner,
                                                getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1),
                                                comm.getComm(), 0, 0 );

                    toRecv.push_back({processesBlockInfos[idxLevel][idxHandle].owner,
                                        comm.processId(), idxLevel, idxHandle});
                }
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(remoteParticleGroupss.size()) ; ++idxHandle){
                if(remoteParticleGroupss[idxHandle].ptrSymb){
                    FLOG(FLog::Controller << "[SMpi] Post a recv during P2P for Idx " << processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex <<
                         " and dest is " << processesBlockInfos[tree->getHeight()-1][idxHandle].owner << " tag " << getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0) << "\n");

                    starpu_mpi_irecv_detached( remoteParticleGroupss[idxHandle].handleSymb,
                                                processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                                getTag(tree->getHeight(),processesBlockInfos[tree->getHeight()-1][idxHandle].firstIndex, 0),
                                                comm.getComm(), 0, 0 );

                    toRecv.push_back({processesBlockInfos[tree->getHeight()-1][idxHandle].owner,
                                        comm.processId(), tree->getHeight(), idxHandle});
                }
            }
        }

        FQuickSort<MpiDependency, int>::QsSequential(toRecv.data(),int(toRecv.size()),[](const MpiDependency& d1, const MpiDependency& d2){
            return d1.src <= d2.src;
        });

        std::unique_ptr<int[]> nbBlocksToRecvFromEach(new int[comm.processCount()]);
        memset(nbBlocksToRecvFromEach.get(), 0, sizeof(int)*comm.processCount());
        for(int idxDep = 0 ; idxDep < int(toRecv.size()) ; ++idxDep){
            nbBlocksToRecvFromEach[toRecv[idxDep].src] += 1;
        }

        FAssertLF(nbBlocksToRecvFromEach[comm.processId()] == 0);
        int offset = 0;

        for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
            if(idxProc == comm.processId()){
                // How much to send to each
                std::unique_ptr<int[]> nbBlocksToSendToEach(new int[comm.processCount()]);
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, nbBlocksToSendToEach.get(), 1,
                                 MPI_INT, idxProc, comm.getComm() ), __LINE__);

                std::unique_ptr<int[]> displs(new int[comm.processCount()]);
                displs[0] = 0;
                for(int idxProc = 1 ; idxProc < comm.processCount() ; ++idxProc){
                    displs[idxProc] = displs[idxProc-1] + nbBlocksToSendToEach[idxProc-1];
                }
                toSend.resize(displs[comm.processCount()-1] + nbBlocksToSendToEach[comm.processCount()-1]);
                // We work in bytes
                for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                    nbBlocksToSendToEach[idxProc] *= sizeof(MpiDependency);
                    displs[idxProc] *= sizeof(MpiDependency);
                }
                FMpi::Assert(MPI_Gatherv( nullptr, 0, MPI_BYTE,
                                 toSend.data(),
                                 nbBlocksToSendToEach.get(), displs.get(),
                                 MPI_BYTE, idxProc, comm.getComm()), __LINE__);
            }
            else{
                FMpi::Assert(MPI_Gather(&nbBlocksToRecvFromEach[idxProc], 1,
                                 MPI_INT, 0, 0, MPI_INT, idxProc, comm.getComm() ), __LINE__);
                FMpi::Assert(MPI_Gatherv(
                                 &toRecv[offset], int(nbBlocksToRecvFromEach[idxProc]*sizeof(MpiDependency)), MPI_BYTE,
                                 0, 0, 0, MPI_BYTE, idxProc, comm.getComm() ), __LINE__);
                offset += nbBlocksToRecvFromEach[idxProc];
            }
        }
    }
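    // Note that the exchange above is receiver-driven: every process first
    // posts detached receives for the mirror blocks it allocated, then each
    // process gathers in turn the list of blocks the others expect from it,
    // which yields the toSend entries used by the two insert*Send() methods.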

    void insertParticlesSend(){
        for(int idxSd = 0 ; idxSd < int(toSend.size()) ; ++idxSd){
            const MpiDependency sd = toSend[idxSd];
            if(sd.level == tree->getHeight()){
                const int localId = sd.globalBlockId - nbBlocksBeforeMinPerLevel[tree->getHeight()-1];
                FAssertLF(sd.src == comm.processId());
                FAssertLF(0 <= localId);
                FAssertLF(localId < tree->getNbParticleGroup());

                FLOG(FLog::Controller << "[SMpi] Post a send during P2P for Idx " << tree->getParticleGroup(localId)->getStartingIndex() <<
                     " and dest is " << sd.dest << " tag " << getTag(tree->getHeight(),tree->getParticleGroup(localId)->getStartingIndex(), 0) <<  "\n");
                starpu_mpi_isend_detached( particleHandles[localId].symb, sd.dest,
                        getTag(tree->getHeight(),tree->getParticleGroup(localId)->getStartingIndex(), 0),
                        comm.getComm(), 0/*callback*/, 0/*arg*/ );
            }
        }
    }

    void insertCellsSend(){
        for(int idxSd = 0 ; idxSd < int(toSend.size()) ; ++idxSd){
            const MpiDependency sd = toSend[idxSd];
            if(sd.level != tree->getHeight()){
                const int localId = sd.globalBlockId - nbBlocksBeforeMinPerLevel[sd.level];
                FAssertLF(sd.src == comm.processId());
                FAssertLF(0 <= localId);
                FAssertLF(localId < tree->getNbCellGroupAtLevel(sd.level));

                FLOG(FLog::Controller << "[SMpi] " << sd.level << " Post a send during M2L for Idx " << tree->getCellGroup(sd.level, localId)->getStartingIndex() <<
                     " and dest is " << sd.dest << " tag " << getTag(sd.level,tree->getCellGroup(sd.level, localId)->getStartingIndex(), 0) << "\n");
                FLOG(FLog::Controller << "[SMpi] " << sd.level << " Post a send during M2L for Idx " << tree->getCellGroup(sd.level, localId)->getStartingIndex() <<
                     " and dest is " << sd.dest << " tag " << getTag(sd.level,tree->getCellGroup(sd.level, localId)->getStartingIndex(), 1) << "\n");
                starpu_mpi_isend_detached( cellHandles[sd.level][localId].symb, sd.dest,
                        getTag(sd.level,tree->getCellGroup(sd.level, localId)->getStartingIndex(), 0),
                        comm.getComm(), 0/*callback*/, 0/*arg*/ );
                starpu_mpi_isend_detached( cellHandles[sd.level][localId].up, sd.dest,
                        getTag(sd.level,tree->getCellGroup(sd.level, localId)->getStartingIndex(), 1),
                        comm.getComm(), 0/*callback*/, 0/*arg*/ );
            }
        }
    }

    void cleanHandleMpi(){
        for(int idxLevel = 0 ; idxLevel < int(remoteCellGroups.size()) ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(remoteCellGroups[idxLevel].size()) ; ++idxHandle){
                if(remoteCellGroups[idxLevel][idxHandle].ptrSymb){
                    starpu_data_unregister(remoteCellGroups[idxLevel][idxHandle].handleSymb);
                    starpu_data_unregister(remoteCellGroups[idxLevel][idxHandle].handleUp);
                    FAlignedMemory::DeallocBytes(remoteCellGroups[idxLevel][idxHandle].ptrSymb);
                    FAlignedMemory::DeallocBytes(remoteCellGroups[idxLevel][idxHandle].ptrUp);

                    if(remoteCellGroups[idxLevel][idxHandle].ptrDown){
                        starpu_data_unregister(remoteCellGroups[idxLevel][idxHandle].handleDown);
                        FAlignedMemory::DeallocBytes(remoteCellGroups[idxLevel][idxHandle].ptrDown);
                    }
                }
            }
            remoteCellGroups[idxLevel].clear();
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(remoteParticleGroupss.size()) ; ++idxHandle){
                if(remoteParticleGroupss[idxHandle].ptrSymb){
                    starpu_data_unregister(remoteParticleGroupss[idxHandle].handleSymb);
                    FAlignedMemory::DeallocBytes(remoteParticleGroupss[idxHandle].ptrSymb);
                }
            }
            remoteParticleGroupss.clear();
        }
    }

    ////////////////////////////////////////////////////////////////////////////

    /** Reset the handles array and create new ones to define
     * in a starpu way each block of data
     */
    void buildHandles(){
        cleanHandle();

        for(int idxLevel = 2 ; idxLevel < tree->getHeight() ; ++idxLevel){
            cellHandles[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                const CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);