FGroupTaskStarpuMpiAlgorithm.hpp 138 KB
Newer Older
1 2 3 4
// Keep in private GIT
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

5 6 7 8 9 10 11 12 13
#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"
#include "../../Utils/FAssert.hpp"
14
#include "../../Utils/FEnv.hpp"
15 16

#include "../../Utils/FMpi.hpp"
17

18 19
#include "FOutOfBlockInteraction.hpp"

20 21 22 23 24 25
#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
26
#include <starpu_mpi.h>
27
#include "../StarPUUtils/FStarPUUtils.hpp"
28
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"
29 30
#include "../StarPUUtils/FStarPUFmmPrioritiesV2.hpp"
#include "../StarPUUtils/FStarPUReduxCpu.hpp"
31

32
#ifdef STARPU_USE_CPU
33
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
34
#endif
35
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
36 37 38 39 40
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
41
#include "../Cuda/FCudaEmptyCellSymb.hpp"
42
#endif
43
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
44 45
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
46
#include "../OpenCl/FEmptyOpenCLCode.hpp"
47
#endif
48

BRAMAS Berenger's avatar
BRAMAS Berenger committed
49 50
#include "../StarPUUtils/FStarPUReduxCpu.hpp"

51
#ifdef SCALFMM_SIMGRID_TASKNAMEPARAMS
Martin Khannouz's avatar
Martin Khannouz committed
52
#include "../StarPUUtils/FStarPUTaskNameParams.hpp"
53
#endif
54

55
template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
BRAMAS Berenger's avatar
BRAMAS Berenger committed
56 57 58 59 60 61 62
          #ifdef SCALFMM_ENABLE_CUDA_KERNEL
          , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                                              FCudaGroupOfParticles<int, 0, 0, int>, FCudaGroupAttachedLeaf<int, 0, 0, int>, FCudaEmptyKernel<int> >
          #endif
          #ifdef SCALFMM_ENABLE_OPENCL_KERNEL
          , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
          #endif
63
          >
64
class FGroupTaskStarPUMpiAlgorithm : public FAbstractAlgorithm {
65
protected:
66
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
67
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
68
    , StarPUCudaWrapperClass
69
#endif
70
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
71
    , StarPUOpenClWrapperClass
72 73
#endif
    > ThisClass;
74

75
    int getTag(const int inLevel, const MortonIndex mindex, const int mode) const{
76 77 78
        int shift = 0;
        int height = tree->getHeight();
        while(height) { shift += 1; height >>= 1; }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
79
        return int((((mindex<<shift) + inLevel) << 5) + mode);
80 81
    }

82 83 84 85 86 87 88 89 90
    const FMpi::FComm& comm;

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };

91 92 93 94
    struct CellHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t up;
        starpu_data_handle_t down;
95
        int intervalSize;
96 97 98 99 100
    };

    struct ParticleHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t down;
101
        int intervalSize;
102 103
    };

104 105 106 107
    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The Tree
108
    KernelClass*const originalCpuKernel;
109

110 111
    std::vector<CellHandles>* cellHandles;
    std::vector<ParticleHandles> particleHandles;
112 113

    starpu_codelet p2m_cl;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
114 115
    starpu_codelet m2m_cl;
    starpu_codelet l2l_cl;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
116
    starpu_codelet l2l_cl_nocommute;
117 118 119 120
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;
    starpu_codelet m2l_cl_inout;
121
    starpu_codelet m2l_cl_inout_mpi;
122 123 124

    starpu_codelet p2p_cl_in;
    starpu_codelet p2p_cl_inout;
125
    starpu_codelet p2p_cl_inout_mpi;
126

BRAMAS Berenger's avatar
BRAMAS Berenger committed
127 128 129 130 131 132 133

#ifdef STARPU_USE_REDUX
    starpu_codelet p2p_redux_init;
    starpu_codelet p2p_redux_perform;
    starpu_codelet p2p_redux_read;
#endif

134 135 136
    const bool noCommuteAtLastLevel;
    const bool noCommuteBetweenLevel;

137
#ifdef STARPU_USE_CPU
138
    StarPUCpuWrapperClass cpuWrapper;
139
#endif
140
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
141 142
    StarPUCudaWrapperClass cudaWrapper;
#endif
143
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
144 145
    StarPUOpenClWrapperClass openclWrapper;
#endif
146 147 148

    FStarPUPtrInterface wrappers;
    FStarPUPtrInterface* wrapperptr;
149

150
#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
151
    starpu_arbiter_t arbiterGlobal;
152
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
153 154

#ifdef STARPU_USE_TASK_NAME
155 156 157 158 159 160 161 162 163 164
#ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
    std::vector<std::unique_ptr<char[]>> m2mTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lOuterTaskNames;
    std::vector<std::unique_ptr<char[]>> l2lTaskNames;
    std::unique_ptr<char[]> p2mTaskNames;
    std::unique_ptr<char[]> l2pTaskNames;
    std::unique_ptr<char[]> p2pTaskNames;
    std::unique_ptr<char[]> p2pOuterTaskNames;
#else
Martin Khannouz's avatar
Martin Khannouz committed
165
    FStarPUTaskNameParams taskNames;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
166
#endif
167
#endif
168 169 170
#ifdef SCALFMM_STARPU_USE_PRIO
    typedef FStarPUFmmPrioritiesV2 PrioClass;// FStarPUFmmPriorities
#endif
171
	std::list<char*> taskName;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
172

173
public:
174 175
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
176
          cellHandles(nullptr),
177 178
          noCommuteAtLastLevel(FEnv::GetBool("SCALFMM_NO_COMMUTE_LAST_L2L", true)),
          noCommuteBetweenLevel(FEnv::GetBool("SCALFMM_NO_COMMUTE_M2L_L2L", false)),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
179 180 181 182 183 184 185 186
      #ifdef STARPU_USE_CPU
          cpuWrapper(tree->getHeight()),
      #endif
      #ifdef SCALFMM_ENABLE_CUDA_KERNEL
          cudaWrapper(tree->getHeight()),
      #endif
      #ifdef SCALFMM_ENABLE_OPENCL_KERNEL
          openclWrapper(tree->getHeight()),
Martin Khannouz's avatar
Martin Khannouz committed
187 188
      #endif
	  #ifdef STARPU_USE_TASK_NAME
189 190 191
	  #ifdef SCALFMM_SIMGRID_TASKNAMEPARAMS
         taskNames(inComm.processId(), inComm.processCount()),
      #endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
192 193
      #endif
          wrapperptr(&wrappers){
194 195 196
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

197 198
        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

199 200
        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
201
#ifdef SCALFMM_STARPU_USE_PRIO
202
        PrioClass::Controller().init(&conf, tree->getHeight(), inKernels);
203
#endif
204
        FAssertLF(starpu_init(&conf) == 0);
205
        FAssertLF(starpu_mpi_init ( 0, 0, 0 ) == 0);
206

BRAMAS Berenger's avatar
BRAMAS Berenger committed
207 208
        starpu_malloc_set_align(32);

209 210
        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
211
#ifdef STARPU_USE_CPU
212 213 214 215 216
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
217
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
218
#endif
219
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
220 221 222 223 224 225 226
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
227
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
228 229 230 231 232 233
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
234
#endif
235 236
        starpu_pthread_mutex_destroy(&initMutex);

237 238
        starpu_pause();

239
        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];
240

241
#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
242
        arbiterGlobal = starpu_arbiter_create();
243
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
244 245
        initCodelet();
        initCodeletMpi();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
246
        rebuildInteractions();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
247

BRAMAS Berenger's avatar
BRAMAS Berenger committed
248
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
249
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
250
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
251
#endif
252
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
253 254
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
255
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
256 257
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif
258 259
        FLOG(FLog::Controller << "SCALFMM_NO_COMMUTE_LAST_L2L " << noCommuteAtLastLevel << "\n");
        FLOG(FLog::Controller << "SCALFMM_NO_COMMUTE_M2L_L2L " << noCommuteBetweenLevel << "\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
260 261 262 263 264

        buildTaskNames();
    }

    void buildTaskNames(){
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
#ifdef STARPU_USE_TASK_NAME
#ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
        const int namesLength = 128;
        m2mTaskNames.resize(tree->getHeight());
        m2lTaskNames.resize(tree->getHeight());
        m2lOuterTaskNames.resize(tree->getHeight());
        l2lTaskNames.resize(tree->getHeight());
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            m2mTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2mTaskNames[idxLevel].get(), namesLength, "M2M-level-%d", idxLevel);
            m2lTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2lTaskNames[idxLevel].get(), namesLength, "M2L-level-%d", idxLevel);
            m2lOuterTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2lOuterTaskNames[idxLevel].get(), namesLength, "M2L-out-level-%d", idxLevel);
            l2lTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(l2lTaskNames[idxLevel].get(), namesLength, "L2L-level-%d", idxLevel);
        }

        p2mTaskNames.reset(new char[namesLength]);
        snprintf(p2mTaskNames.get(), namesLength, "P2M");
        l2pTaskNames.reset(new char[namesLength]);
        snprintf(l2pTaskNames.get(), namesLength, "L2P");
        p2pTaskNames.reset(new char[namesLength]);
        snprintf(p2pTaskNames.get(), namesLength, "P2P");
        p2pOuterTaskNames.reset(new char[namesLength]);
        snprintf(p2pOuterTaskNames.get(), namesLength, "P2P-out");
#endif
#endif
293 294
    }

295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
    void syncData(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].up);
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].down);
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R);
                starpu_data_release(particleHandles[idxHandle].symb);
                starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R);
                starpu_data_release(particleHandles[idxHandle].down);
            }
        }
    }

316
    ~FGroupTaskStarPUMpiAlgorithm(){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
317 318
        starpu_resume();

319
        cleanHandle();
320
        cleanHandleMpi();
321
        delete[] cellHandles;
322

BRAMAS Berenger's avatar
BRAMAS Berenger committed
323 324 325 326 327 328 329 330 331 332
        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
333
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
334 335 336 337 338 339 340
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
341
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
342 343 344 345 346 347 348 349 350
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);

351 352

#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
353
        starpu_arbiter_destroy(arbiterGlobal);
354
#endif
355 356
		for(char* ptr : taskName)
			free(ptr);
357
        starpu_mpi_shutdown();
358 359 360
        starpu_shutdown();
    }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
361
    void rebuildInteractions(){
362
        FAssertLF(getenv("OMP_WAIT_POLICY") == nullptr
BRAMAS Berenger's avatar
BRAMAS Berenger committed
363 364
                || strcmp(getenv("OMP_WAIT_POLICY"), "PASSIVE") == 0
                  || strcmp(getenv("OMP_WAIT_POLICY"), "passive") == 0);
365

BRAMAS Berenger's avatar
BRAMAS Berenger committed
366 367
#pragma omp parallel
#pragma omp single
368 369 370
        buildExternalInteractionVecs();
        buildHandles();

BRAMAS Berenger's avatar
BRAMAS Berenger committed
371 372
#pragma omp parallel
#pragma omp single
373
        buildRemoteInteractionsAndHandles();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
374 375
    }

376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419
#ifdef STARPU_USE_CPU
    void forEachCpuWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, func);
        starpu_pause();
    }

    void forEachCpuWorker(std::function<void(KernelClass*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            func(cpuWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    void forEachCudaWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, func);
        starpu_pause();
    }
    void forEachCudaWorker(std::function<void(void*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            func(cudaWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    void forEachOpenCLWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, func);
        starpu_pause();
    }
    void forEachOpenCLWorker(std::function<void(void*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            func(openclWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif

BRAMAS Berenger's avatar
BRAMAS Berenger committed
420
protected:
421

BRAMAS Berenger's avatar
BRAMAS Berenger committed
422 423 424 425 426 427 428 429 430 431 432
    /**
      * Runs the complete algorithm.
      */
    void executeCore(const unsigned operationsToProceed) override {
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );
        const bool directOnly = (tree->getHeight() <= 2);

#ifdef STARPU_USE_CPU
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.start());
#endif

433 434
        FLOG(FTic timerSoumission);

435
        starpu_resume();
436
        postRecvAllocatedBlocks();
437

438
        if( operationsToProceed & FFmmP2P ) insertParticlesSend();
439

440 441 442
        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();

443
        if(operationsToProceed & FFmmP2M && !directOnly) bottomPass();
444

BRAMAS Berenger's avatar
BRAMAS Berenger committed
445
        if(operationsToProceed & FFmmM2M && !directOnly) upwardPass();
446
        if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend();
447

BRAMAS Berenger's avatar
BRAMAS Berenger committed
448 449 450
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::upperWorkingLevel, FAbstractAlgorithm::lowerWorkingLevel-1 , true, true);
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::lowerWorkingLevel-1, FAbstractAlgorithm::lowerWorkingLevel, false, false);
        if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi();
451

BRAMAS Berenger's avatar
BRAMAS Berenger committed
452
        if(operationsToProceed & FFmmL2L && !directOnly) downardPass();
453

BRAMAS Berenger's avatar
BRAMAS Berenger committed
454
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::lowerWorkingLevel-1, FAbstractAlgorithm::lowerWorkingLevel, true, true);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
455

BRAMAS Berenger's avatar
BRAMAS Berenger committed
456 457 458 459
        if( operationsToProceed & FFmmL2P && !directOnly) mergePass();
#ifdef STARPU_USE_REDUX
        if( operationsToProceed & FFmmL2P && !directOnly) readParticle();
#endif
460

461
        FLOG( FLog::Controller << "\t\t Submitting the tasks took " << timerSoumission.tacAndElapsed() << "s\n" );
462
        starpu_task_wait_for_all();
463 464 465 466 467

        FLOG( FTic timerSync; );
        syncData();
        FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );

468
        starpu_pause();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
469 470 471 472 473

#ifdef STARPU_USE_CPU
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.end());
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.saveToDisk("/tmp/taskstime-FGroupTaskStarPUAlgorithm.txt"));
#endif
474
    }
475

476 477
    void initCodelet(){
        memset(&p2m_cl, 0, sizeof(p2m_cl));
478
#ifdef STARPU_USE_CPU
479
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
480 481 482
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
483
#endif
484
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
485
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
486 487 488 489
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
490
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
491
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
492 493 494
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
495
#endif
496 497 498 499
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
500
        p2m_cl.name = "p2m_cl";
501

BRAMAS Berenger's avatar
BRAMAS Berenger committed
502
        memset(&m2m_cl, 0, sizeof(m2m_cl));
503
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
504 505 506 507
        if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
            m2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CPU;
        }
508
#endif
509
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
510 511 512 513
        if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
            m2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CUDA;
        }
514
#endif
515
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
516 517 518 519
        if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
            m2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_OPENCL;
        }
520
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
521 522 523 524 525 526 527 528 529
        m2m_cl.nbuffers = 4;
        m2m_cl.dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl.nbuffers*sizeof(starpu_data_access_mode));
        m2m_cl.dyn_modes[0] = STARPU_R;
        m2m_cl.dyn_modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2m_cl.name = "m2m_cl";
        m2m_cl.dyn_modes[2] = STARPU_R;
        m2m_cl.dyn_modes[3] = STARPU_R;

        memset(&l2l_cl, 0, sizeof(l2l_cl));
530
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
531 532 533 534
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CPU;
        }
535
#endif
536
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
537 538 539 540
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CUDA;
        }
541
#endif
542
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
543 544 545
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_OPENCL;
546
        }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
547 548 549 550 551 552 553 554
#endif
        l2l_cl.nbuffers = 4;
        l2l_cl.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl.dyn_modes[0] = STARPU_R;
        l2l_cl.dyn_modes[1] = STARPU_R;
        l2l_cl.name = "l2l_cl";
        l2l_cl.dyn_modes[2] = STARPU_R;
        l2l_cl.dyn_modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
555

BRAMAS Berenger's avatar
BRAMAS Berenger committed
556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582
        memset(&l2l_cl_nocommute, 0, sizeof(l2l_cl_nocommute));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl_nocommute.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl_nocommute.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl_nocommute.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_OPENCL;
        }
#endif
        l2l_cl_nocommute.nbuffers = 4;
        l2l_cl_nocommute.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl_nocommute.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl_nocommute.dyn_modes[0] = STARPU_R;
        l2l_cl_nocommute.dyn_modes[1] = STARPU_R;
        l2l_cl_nocommute.name = "l2l_cl";
        l2l_cl_nocommute.dyn_modes[2] = STARPU_R;
        l2l_cl_nocommute.dyn_modes[3] = STARPU_RW;

583
        memset(&l2p_cl, 0, sizeof(l2p_cl));
584
#ifdef STARPU_USE_CPU
585
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
586 587 588
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
589
#endif
590
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
591
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
592 593 594 595
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
596
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
597
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
598 599 600
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
601
#endif
602
        l2p_cl.nbuffers = 4;
603
        l2p_cl.modes[0] = STARPU_R;
604 605
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
606 607 608
#ifdef STARPU_USE_REDUX
        l2p_cl.modes[3] = STARPU_REDUX;
#else
609
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
610
#endif
611
        l2p_cl.name = "l2p_cl";
612 613

        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
614
#ifdef STARPU_USE_CPU
615
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
616 617 618
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
619
#endif
620
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
621
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
622 623 624 625
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
626
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
627
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
628 629 630
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
631
#endif
632 633
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
634 635 636
#ifdef STARPU_USE_REDUX
        p2p_cl_in.modes[1] = STARPU_REDUX;
#else
637
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
638
#endif
639
        p2p_cl_in.name = "p2p_cl_in";
640
        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
641
#ifdef STARPU_USE_CPU
642
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
643 644 645
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
646
#endif
647
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
648
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
649 650 651 652
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
653
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
654
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
655 656 657
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
658
#endif
659 660
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
661 662 663
#ifdef STARPU_USE_REDUX
        p2p_cl_inout.modes[1] = STARPU_REDUX;
#else
664
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
665
#endif
666
        p2p_cl_inout.modes[2] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
667 668 669
#ifdef STARPU_USE_REDUX
        p2p_cl_inout.modes[3] = STARPU_REDUX;
#else
670
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
671
#endif
672
        p2p_cl_inout.name = "p2p_cl_inout";
673 674

        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
675
#ifdef STARPU_USE_CPU
676
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
677 678 679
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
680
#endif
681
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
682
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
683 684 685 686
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
687
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
688
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
689 690 691
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
692
#endif
693 694
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
695
        m2l_cl_in.modes[1] = STARPU_R;
696
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
697
        m2l_cl_in.name = "m2l_cl_in";
BRAMAS Berenger's avatar
BRAMAS Berenger committed
698

699
        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
700
#ifdef STARPU_USE_CPU
701
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
702 703 704
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
705
#endif
706
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
707
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
708 709 710 711
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
712
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
713
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
714 715 716
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
717
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
718
        m2l_cl_inout.nbuffers = 4;
719
        m2l_cl_inout.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
720 721
        m2l_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.modes[2] = STARPU_R;
722
        m2l_cl_inout.modes[3] = STARPU_R;
723
        m2l_cl_inout.name = "m2l_cl_inout";
BRAMAS Berenger's avatar
BRAMAS Berenger committed
724

BRAMAS Berenger's avatar
BRAMAS Berenger committed
725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767
#ifdef STARPU_USE_REDUX
        memset(&p2p_redux_init, 0, sizeof(p2p_redux_init));
#ifdef STARPU_USE_CPU
        p2p_redux_init.cpu_funcs[0] = FStarPUReduxCpu::InitData<typename ParticleGroupClass::ParticleDataType>;
        p2p_redux_init.where |= STARPU_CPU;
#endif
        p2p_redux_init.nbuffers = 1;
        p2p_redux_init.modes[0] = STARPU_RW;
        p2p_redux_init.name = "p2p_redux_init";

        memset(&p2p_redux_perform, 0, sizeof(p2p_redux_perform));
#ifdef STARPU_USE_CPU
        p2p_redux_perform.cpu_funcs[0] = FStarPUReduxCpu::ReduceData<typename ParticleGroupClass::ParticleDataType>;
        p2p_redux_perform.where |= STARPU_CPU;
#endif
        p2p_redux_perform.nbuffers = 2;
        p2p_redux_perform.modes[0] = STARPU_RW;
        p2p_redux_perform.modes[1] = STARPU_R;
        p2p_redux_perform.name = "p2p_redux_perform";

        memset(&p2p_redux_read, 0, sizeof(p2p_redux_read));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            p2p_redux_read.cpu_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            p2p_redux_read.cuda_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            p2p_redux_read.opencl_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_OPENCL;
        }
#endif
        p2p_redux_read.nbuffers = 1;
        p2p_redux_read.modes[0] = STARPU_R;
        p2p_redux_read.name = "p2p_redux_read";
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
768
    }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
769

770 771 772
    /** dealloc in a starpu way all the defined handles */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
773 774 775 776
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
777
            }
778
            cellHandles[idxLevel].clear();
779 780
        }
        {
781 782 783
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
784
            }
785
            particleHandles.clear();
786 787 788 789
        }
    }

    ////////////////////////////////////////////////////////////////////////////
790 791

    void initCodeletMpi(){
792
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
793
#ifdef STARPU_USE_CPU
794
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
795 796 797
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
798
#endif
799
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
800
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
801 802 803 804
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
805
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
806
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
807 808 809
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
810
#endif
811 812
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
813
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
814
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
815
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";
816

817
        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
818
#ifdef STARPU_USE_CPU
819
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
820 821 822
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
823
#endif
824
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
825
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
826 827 828 829
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
830
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
831
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
832 833 834
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
835
#endif
836 837
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
838
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
839 840
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
841
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
842 843
    }

844 845 846 847 848 849
    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    struct BlockDescriptor{
        MortonIndex firstIndex;
        MortonIndex lastIndex;
        int owner;
        int bufferSize;
850 851 852 853
        size_t bufferSizeSymb;
        size_t bufferSizeUp;
        size_t bufferSizeDown;
        size_t leavesBufferSize;
854 855 856 857 858 859 860 861
    };
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    std::vector<int> nbBlocksPerLevelAll;
    std::vector<int> nbBlocksBeforeMinPerLevel;

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;

862
    struct RemoteHandle{
863 864 865 866
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
867 868
        }

869 870 871 872 873 874
        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
875 876

        int intervalSize;
877 878 879 880 881
    };

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;

882
    void buildRemoteInteractionsAndHandles(){
883 884
        cleanHandleMpi();

885 886
        // We need to have information about all other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
887
        nbBlocksPerLevel[0] = 0;
888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Count and displacement for the MPI all gatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
931 932 933
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();
934 935 936 937 938 939 940 941 942 943 944 945 946

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all other
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
947 948 949 950 951 952 953
        // Prepare remate ptr and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

954 955 956 957 958 959 960 961
        // From now we have the number of blocks for all process
        // we also have the size of the blocks therefor we can
        // create the handles we need
        // We will now detect the relation between our blocks and others
        // During the M2M (which is the same for the L2L)
        // During the M2L and during the P2P
        // I need to insert the task that read my data or that write the data I need.
        // M2L
962
        externalInteractionsAllLevelMpi.clear();
963 964 965 966 967 968 969 970 971 972 973 974 975 976
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What are my morton interval at this level
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
977
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);
978 979 980

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

BRAMAS Berenger's avatar
BRAMAS Berenger committed
981
#pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
982 983 984
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;

985 986 987 988 989 990 991 992 993 994 995 996 997 998
                    for(int idxCell = 0 ; idxCell < currentCells->getNumberOfCellsInBlock() ; ++idxCell){
                        const MortonIndex mindex = currentCells->getCellMortonIndex(idxCell);

                        MortonIndex interactionsIndexes[189];
                        int interactionsPosition[189];
                        const FTreeCoordinate coord(mindex, idxLevel);
                        int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                            // This interactions need a block owned by someoneelse
                            if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                OutOfBlockInteraction property;
                                property.insideIndex = mindex;
                                property.outIndex    = interactionsIndexes[idxInter];
BRAMAS Berenger's avatar
BRAMAS Berenger committed
999
                                property.relativeOutPosition = interactionsPosition[idxInter];
1000
                                property.insideIdxInBlock = idxCell;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1001
                                property.outsideIdxInBlock = -1;
1002
                                outsideInteractions.push_back(property);
1003 1004 1005 1006 1007 1008 1009 1010 1011
                            }
                        }
                    }

                    // Manage outofblock interaction
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1012
                        && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
1013 1014 1015 1016 1017 1018 1019 1020 1021
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1022 1023
                        const MortonIndex blockStartIdxOther = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;
1024

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1025
                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
1026 1027 1028 1029
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1030
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
1031 1032 1033 1034 1035 1036
                            lastOutInteraction += 1;
                        }

                        // Create interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
1037
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1038
#pragma omp critical(CreateM2LRemotes)
1039
                                {
1040
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1041
                                        const size_t nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
1042
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
1043 1044 1045
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1046
                                        const size_t nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
1047
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
1048 1049 1050
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp);
1051 1052
                                    }
                                }
1053 1054
                            }

1055 1056
                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
1057
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
        // P2P
        // We create one big vector per block
        {
            const MortonIndex myFirstIndex = tree->getParticleGroup(0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getParticleGroup(tree->getNbParticleGroup()-1)->getEndingIndex();

1076
            externalInteractionsLeafLevelMpi.clear();
1077 1078 1079 1080 1081 1082 1083
            externalInteractionsLeafLevelMpi.resize(tree->getNbParticleGroup());
            for(int idxGroup = 0 ; idxGroup < tree->getNbParticleGroup() ; ++idxGroup){
                // Create the vector
                ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);

                std::vector<BlockInteractions<ParticleGroupClass>>* externalInteractions = &externalInteractionsLeafLevelMpi[idxGroup];

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1084