FGroupTaskStarpuMpiAlgorithm.hpp 139 KB
Newer Older
1 2 3 4
// Keep in private GIT
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

5 6 7 8 9 10 11 12 13
#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"
#include "../../Utils/FAssert.hpp"
14
#include "../../Utils/FEnv.hpp"
15 16

#include "../../Utils/FMpi.hpp"
17

18 19
#include "FOutOfBlockInteraction.hpp"

20 21 22 23 24 25
#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
26
#include <starpu_mpi.h>
27
#include "../StarPUUtils/FStarPUUtils.hpp"
28
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"
29 30
#include "../StarPUUtils/FStarPUFmmPrioritiesV2.hpp"
#include "../StarPUUtils/FStarPUReduxCpu.hpp"
31

32
#ifdef STARPU_USE_CPU
33
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
34
#endif
35
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
36 37 38 39 40
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
41
#include "../Cuda/FCudaEmptyCellSymb.hpp"
42
#endif
43
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
44 45
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
46
#include "../OpenCl/FEmptyOpenCLCode.hpp"
47
#endif
48

BRAMAS Berenger's avatar
BRAMAS Berenger committed
49 50
#include "../StarPUUtils/FStarPUReduxCpu.hpp"

51
#ifdef SCALFMM_SIMGRID_TASKNAMEPARAMS
Martin Khannouz's avatar
Martin Khannouz committed
52
#include "../StarPUUtils/FStarPUTaskNameParams.hpp"
53
#endif
54

55
template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
BRAMAS Berenger's avatar
BRAMAS Berenger committed
56 57 58 59 60 61 62
          #ifdef SCALFMM_ENABLE_CUDA_KERNEL
          , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                                              FCudaGroupOfParticles<int, 0, 0, int>, FCudaGroupAttachedLeaf<int, 0, 0, int>, FCudaEmptyKernel<int> >
          #endif
          #ifdef SCALFMM_ENABLE_OPENCL_KERNEL
          , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
          #endif
63
          >
64
class FGroupTaskStarPUMpiAlgorithm : public FAbstractAlgorithm {
65
protected:
66
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
67
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
68
    , StarPUCudaWrapperClass
69
#endif
70
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
71
    , StarPUOpenClWrapperClass
72 73
#endif
    > ThisClass;
74

75
    int getTag(const int inLevel, const MortonIndex mindex, const int mode, const int idxBlockMpi = 0) const{
76 77 78
        int shift = 0;
        int height = tree->getHeight();
        while(height) { shift += 1; height >>= 1; }
79 80
		FAssertLF(idxBlockMpi < 32, "Too much block mpi, tag overflow possible");
        return int((((((mindex<<shift) + inLevel) << 5) + mode) << 5) + idxBlockMpi);
81 82
    }

83 84 85 86 87 88 89 90 91
    const FMpi::FComm& comm;

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };

92 93 94 95
    struct CellHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t up;
        starpu_data_handle_t down;
96
        int intervalSize;
97 98 99 100 101
    };

    struct ParticleHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t down;
102
        int intervalSize;
103 104
    };

105 106 107 108
    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The Tree
109
    KernelClass*const originalCpuKernel;
110

111 112
    std::vector<CellHandles>* cellHandles;
    std::vector<ParticleHandles> particleHandles;
113 114

    starpu_codelet p2m_cl;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
115 116
    starpu_codelet m2m_cl;
    starpu_codelet l2l_cl;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
117
    starpu_codelet l2l_cl_nocommute;
118 119 120 121
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;
    starpu_codelet m2l_cl_inout;
122
    starpu_codelet m2l_cl_inout_mpi;
123 124 125

    starpu_codelet p2p_cl_in;
    starpu_codelet p2p_cl_inout;
126
    starpu_codelet p2p_cl_inout_mpi;
127

BRAMAS Berenger's avatar
BRAMAS Berenger committed
128 129 130 131 132 133 134

#ifdef STARPU_USE_REDUX
    starpu_codelet p2p_redux_init;
    starpu_codelet p2p_redux_perform;
    starpu_codelet p2p_redux_read;
#endif

135 136 137
    const bool noCommuteAtLastLevel;
    const bool noCommuteBetweenLevel;

138
#ifdef STARPU_USE_CPU
139
    StarPUCpuWrapperClass cpuWrapper;
140
#endif
141
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
142 143
    StarPUCudaWrapperClass cudaWrapper;
#endif
144
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
145 146
    StarPUOpenClWrapperClass openclWrapper;
#endif
147 148 149

    FStarPUPtrInterface wrappers;
    FStarPUPtrInterface* wrapperptr;
150

151
#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
152
    starpu_arbiter_t arbiterGlobal;
153
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
154 155

#ifdef STARPU_USE_TASK_NAME
156 157 158 159 160 161 162 163 164 165
#ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
    std::vector<std::unique_ptr<char[]>> m2mTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lOuterTaskNames;
    std::vector<std::unique_ptr<char[]>> l2lTaskNames;
    std::unique_ptr<char[]> p2mTaskNames;
    std::unique_ptr<char[]> l2pTaskNames;
    std::unique_ptr<char[]> p2pTaskNames;
    std::unique_ptr<char[]> p2pOuterTaskNames;
#else
Martin Khannouz's avatar
Martin Khannouz committed
166
    FStarPUTaskNameParams taskNames;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
167
#endif
168
#endif
169 170 171
#ifdef SCALFMM_STARPU_USE_PRIO
    typedef FStarPUFmmPrioritiesV2 PrioClass;// FStarPUFmmPriorities
#endif
172
	std::list<char*> taskName;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
173

174
public:
175 176
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
177
          cellHandles(nullptr),
178 179
          noCommuteAtLastLevel(FEnv::GetBool("SCALFMM_NO_COMMUTE_LAST_L2L", true)),
          noCommuteBetweenLevel(FEnv::GetBool("SCALFMM_NO_COMMUTE_M2L_L2L", false)),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
180 181 182 183 184 185 186 187
      #ifdef STARPU_USE_CPU
          cpuWrapper(tree->getHeight()),
      #endif
      #ifdef SCALFMM_ENABLE_CUDA_KERNEL
          cudaWrapper(tree->getHeight()),
      #endif
      #ifdef SCALFMM_ENABLE_OPENCL_KERNEL
          openclWrapper(tree->getHeight()),
Martin Khannouz's avatar
Martin Khannouz committed
188 189
      #endif
	  #ifdef STARPU_USE_TASK_NAME
190 191 192
	  #ifdef SCALFMM_SIMGRID_TASKNAMEPARAMS
         taskNames(inComm.processId(), inComm.processCount()),
      #endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
193 194
      #endif
          wrapperptr(&wrappers){
195 196 197
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

198 199
        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

200 201
        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
202
#ifdef SCALFMM_STARPU_USE_PRIO
203
        PrioClass::Controller().init(&conf, tree->getHeight(), inKernels);
204
#endif
205
        FAssertLF(starpu_init(&conf) == 0);
206
        FAssertLF(starpu_mpi_init ( 0, 0, 0 ) == 0);
207

BRAMAS Berenger's avatar
BRAMAS Berenger committed
208 209
        starpu_malloc_set_align(32);

210 211
        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
212
#ifdef STARPU_USE_CPU
213 214 215 216 217
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
218
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
219
#endif
220
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
221 222 223 224 225 226 227
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
228
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
229 230 231 232 233 234
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
235
#endif
236 237
        starpu_pthread_mutex_destroy(&initMutex);

238 239
        starpu_pause();

240
        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];
241

242
#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
243
        arbiterGlobal = starpu_arbiter_create();
244
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
245 246
        initCodelet();
        initCodeletMpi();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
247
        rebuildInteractions();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
248

BRAMAS Berenger's avatar
BRAMAS Berenger committed
249
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
250
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
251
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
252
#endif
253
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
254 255
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
256
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
257 258
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif
259 260
        FLOG(FLog::Controller << "SCALFMM_NO_COMMUTE_LAST_L2L " << noCommuteAtLastLevel << "\n");
        FLOG(FLog::Controller << "SCALFMM_NO_COMMUTE_M2L_L2L " << noCommuteBetweenLevel << "\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
261 262 263 264 265

        buildTaskNames();
    }

    void buildTaskNames(){
266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
#ifdef STARPU_USE_TASK_NAME
#ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
        const int namesLength = 128;
        m2mTaskNames.resize(tree->getHeight());
        m2lTaskNames.resize(tree->getHeight());
        m2lOuterTaskNames.resize(tree->getHeight());
        l2lTaskNames.resize(tree->getHeight());
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            m2mTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2mTaskNames[idxLevel].get(), namesLength, "M2M-level-%d", idxLevel);
            m2lTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2lTaskNames[idxLevel].get(), namesLength, "M2L-level-%d", idxLevel);
            m2lOuterTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2lOuterTaskNames[idxLevel].get(), namesLength, "M2L-out-level-%d", idxLevel);
            l2lTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(l2lTaskNames[idxLevel].get(), namesLength, "L2L-level-%d", idxLevel);
        }

        p2mTaskNames.reset(new char[namesLength]);
        snprintf(p2mTaskNames.get(), namesLength, "P2M");
        l2pTaskNames.reset(new char[namesLength]);
        snprintf(l2pTaskNames.get(), namesLength, "L2P");
        p2pTaskNames.reset(new char[namesLength]);
        snprintf(p2pTaskNames.get(), namesLength, "P2P");
        p2pOuterTaskNames.reset(new char[namesLength]);
        snprintf(p2pOuterTaskNames.get(), namesLength, "P2P-out");
#endif
#endif
294 295
    }

296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
    void syncData(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].up);
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].down);
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R);
                starpu_data_release(particleHandles[idxHandle].symb);
                starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R);
                starpu_data_release(particleHandles[idxHandle].down);
            }
        }
    }

317
    ~FGroupTaskStarPUMpiAlgorithm(){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
318 319
        starpu_resume();

320
        cleanHandle();
321
        cleanHandleMpi();
322
        delete[] cellHandles;
323

BRAMAS Berenger's avatar
BRAMAS Berenger committed
324 325 326 327 328 329 330 331 332 333
        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
334
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
335 336 337 338 339 340 341
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
342
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
343 344 345 346 347 348 349 350 351
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);

352 353

#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
354
        starpu_arbiter_destroy(arbiterGlobal);
355
#endif
356 357
		for(char* ptr : taskName)
			free(ptr);
358
        starpu_mpi_shutdown();
359 360 361
        starpu_shutdown();
    }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
362
    void rebuildInteractions(){
363
        FAssertLF(getenv("OMP_WAIT_POLICY") == nullptr
BRAMAS Berenger's avatar
BRAMAS Berenger committed
364 365
                || strcmp(getenv("OMP_WAIT_POLICY"), "PASSIVE") == 0
                  || strcmp(getenv("OMP_WAIT_POLICY"), "passive") == 0);
366

BRAMAS Berenger's avatar
BRAMAS Berenger committed
367 368
#pragma omp parallel
#pragma omp single
369 370 371
        buildExternalInteractionVecs();
        buildHandles();

BRAMAS Berenger's avatar
BRAMAS Berenger committed
372 373
#pragma omp parallel
#pragma omp single
374
        buildRemoteInteractionsAndHandles();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
375 376
    }

377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420
#ifdef STARPU_USE_CPU
    void forEachCpuWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, func);
        starpu_pause();
    }

    void forEachCpuWorker(std::function<void(KernelClass*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            func(cpuWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    void forEachCudaWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, func);
        starpu_pause();
    }
    void forEachCudaWorker(std::function<void(void*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            func(cudaWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    void forEachOpenCLWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, func);
        starpu_pause();
    }
    void forEachOpenCLWorker(std::function<void(void*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            func(openclWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif

BRAMAS Berenger's avatar
BRAMAS Berenger committed
421
protected:
422

BRAMAS Berenger's avatar
BRAMAS Berenger committed
423 424 425 426 427 428 429 430 431 432 433
    /**
      * Runs the complete algorithm.
      */
    void executeCore(const unsigned operationsToProceed) override {
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );
        const bool directOnly = (tree->getHeight() <= 2);

#ifdef STARPU_USE_CPU
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.start());
#endif

434 435
        FLOG(FTic timerSoumission);

436
        starpu_resume();
437
        postRecvAllocatedBlocks();
438

439
        if( operationsToProceed & FFmmP2P ) insertParticlesSend();
440

441 442 443
        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();

444
        if(operationsToProceed & FFmmP2M && !directOnly) bottomPass();
445

BRAMAS Berenger's avatar
BRAMAS Berenger committed
446
        if(operationsToProceed & FFmmM2M && !directOnly) upwardPass();
447
        if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend();
448

BRAMAS Berenger's avatar
BRAMAS Berenger committed
449 450 451
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::upperWorkingLevel, FAbstractAlgorithm::lowerWorkingLevel-1 , true, true);
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::lowerWorkingLevel-1, FAbstractAlgorithm::lowerWorkingLevel, false, false);
        if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi();
452

BRAMAS Berenger's avatar
BRAMAS Berenger committed
453
        if(operationsToProceed & FFmmL2L && !directOnly) downardPass();
454

BRAMAS Berenger's avatar
BRAMAS Berenger committed
455
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::lowerWorkingLevel-1, FAbstractAlgorithm::lowerWorkingLevel, true, true);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
456

BRAMAS Berenger's avatar
BRAMAS Berenger committed
457 458 459 460
        if( operationsToProceed & FFmmL2P && !directOnly) mergePass();
#ifdef STARPU_USE_REDUX
        if( operationsToProceed & FFmmL2P && !directOnly) readParticle();
#endif
461

462
        FLOG( FLog::Controller << "\t\t Submitting the tasks took " << timerSoumission.tacAndElapsed() << "s\n" );
463
        starpu_task_wait_for_all();
464 465 466 467 468

        FLOG( FTic timerSync; );
        syncData();
        FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );

469
        starpu_pause();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
470 471 472 473 474

#ifdef STARPU_USE_CPU
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.end());
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.saveToDisk("/tmp/taskstime-FGroupTaskStarPUAlgorithm.txt"));
#endif
475
    }
476

477 478
    void initCodelet(){
        memset(&p2m_cl, 0, sizeof(p2m_cl));
479
#ifdef STARPU_USE_CPU
480
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
481 482 483
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
484
#endif
485
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
486
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
487 488 489 490
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
491
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
492
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
493 494 495
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
496
#endif
497 498 499 500
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
501
        p2m_cl.name = "p2m_cl";
502

BRAMAS Berenger's avatar
BRAMAS Berenger committed
503
        memset(&m2m_cl, 0, sizeof(m2m_cl));
504
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
505 506 507 508
        if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
            m2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CPU;
        }
509
#endif
510
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
511 512 513 514
        if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
            m2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CUDA;
        }
515
#endif
516
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
517 518 519 520
        if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
            m2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_OPENCL;
        }
521
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
522 523 524 525 526 527 528 529 530
        m2m_cl.nbuffers = 4;
        m2m_cl.dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl.nbuffers*sizeof(starpu_data_access_mode));
        m2m_cl.dyn_modes[0] = STARPU_R;
        m2m_cl.dyn_modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2m_cl.name = "m2m_cl";
        m2m_cl.dyn_modes[2] = STARPU_R;
        m2m_cl.dyn_modes[3] = STARPU_R;

        memset(&l2l_cl, 0, sizeof(l2l_cl));
531
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
532 533 534 535
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CPU;
        }
536
#endif
537
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
538 539 540 541
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CUDA;
        }
542
#endif
543
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
544 545 546
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_OPENCL;
547
        }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
548 549 550 551 552 553 554 555
#endif
        l2l_cl.nbuffers = 4;
        l2l_cl.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl.dyn_modes[0] = STARPU_R;
        l2l_cl.dyn_modes[1] = STARPU_R;
        l2l_cl.name = "l2l_cl";
        l2l_cl.dyn_modes[2] = STARPU_R;
        l2l_cl.dyn_modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
556

BRAMAS Berenger's avatar
BRAMAS Berenger committed
557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583
        memset(&l2l_cl_nocommute, 0, sizeof(l2l_cl_nocommute));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl_nocommute.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl_nocommute.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl_nocommute.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_OPENCL;
        }
#endif
        l2l_cl_nocommute.nbuffers = 4;
        l2l_cl_nocommute.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl_nocommute.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl_nocommute.dyn_modes[0] = STARPU_R;
        l2l_cl_nocommute.dyn_modes[1] = STARPU_R;
        l2l_cl_nocommute.name = "l2l_cl";
        l2l_cl_nocommute.dyn_modes[2] = STARPU_R;
        l2l_cl_nocommute.dyn_modes[3] = STARPU_RW;

584
        memset(&l2p_cl, 0, sizeof(l2p_cl));
585
#ifdef STARPU_USE_CPU
586
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
587 588 589
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
590
#endif
591
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
592
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
593 594 595 596
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
597
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
598
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
599 600 601
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
602
#endif
603
        l2p_cl.nbuffers = 4;
604
        l2p_cl.modes[0] = STARPU_R;
605 606
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
607 608 609
#ifdef STARPU_USE_REDUX
        l2p_cl.modes[3] = STARPU_REDUX;
#else
610
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
611
#endif
612
        l2p_cl.name = "l2p_cl";
613 614

        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
615
#ifdef STARPU_USE_CPU
616
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
617 618 619
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
620
#endif
621
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
622
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
623 624 625 626
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
627
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
628
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
629 630 631
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
632
#endif
633 634
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
635 636 637
#ifdef STARPU_USE_REDUX
        p2p_cl_in.modes[1] = STARPU_REDUX;
#else
638
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
639
#endif
640
        p2p_cl_in.name = "p2p_cl_in";
641
        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
642
#ifdef STARPU_USE_CPU
643
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
644 645 646
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
647
#endif
648
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
649
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
650 651 652 653
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
654
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
655
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
656 657 658
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
659
#endif
660 661
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
662 663 664
#ifdef STARPU_USE_REDUX
        p2p_cl_inout.modes[1] = STARPU_REDUX;
#else
665
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
666
#endif
667
        p2p_cl_inout.modes[2] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
668 669 670
#ifdef STARPU_USE_REDUX
        p2p_cl_inout.modes[3] = STARPU_REDUX;
#else
671
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
672
#endif
673
        p2p_cl_inout.name = "p2p_cl_inout";
674 675

        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
676
#ifdef STARPU_USE_CPU
677
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
678 679 680
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
681
#endif
682
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
683
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
684 685 686 687
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
688
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
689
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
690 691 692
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
693
#endif
694 695
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
696
        m2l_cl_in.modes[1] = STARPU_R;
697
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
698
        m2l_cl_in.name = "m2l_cl_in";
BRAMAS Berenger's avatar
BRAMAS Berenger committed
699

700
        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
701
#ifdef STARPU_USE_CPU
702
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
703 704 705
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
706
#endif
707
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
708
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
709 710 711 712
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
713
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
714
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
715 716 717
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
718
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
719
        m2l_cl_inout.nbuffers = 4;
720
        m2l_cl_inout.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
721 722
        m2l_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.modes[2] = STARPU_R;
723
        m2l_cl_inout.modes[3] = STARPU_R;
724
        m2l_cl_inout.name = "m2l_cl_inout";
BRAMAS Berenger's avatar
BRAMAS Berenger committed
725

BRAMAS Berenger's avatar
BRAMAS Berenger committed
726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768
#ifdef STARPU_USE_REDUX
        memset(&p2p_redux_init, 0, sizeof(p2p_redux_init));
#ifdef STARPU_USE_CPU
        p2p_redux_init.cpu_funcs[0] = FStarPUReduxCpu::InitData<typename ParticleGroupClass::ParticleDataType>;
        p2p_redux_init.where |= STARPU_CPU;
#endif
        p2p_redux_init.nbuffers = 1;
        p2p_redux_init.modes[0] = STARPU_RW;
        p2p_redux_init.name = "p2p_redux_init";

        memset(&p2p_redux_perform, 0, sizeof(p2p_redux_perform));
#ifdef STARPU_USE_CPU
        p2p_redux_perform.cpu_funcs[0] = FStarPUReduxCpu::ReduceData<typename ParticleGroupClass::ParticleDataType>;
        p2p_redux_perform.where |= STARPU_CPU;
#endif
        p2p_redux_perform.nbuffers = 2;
        p2p_redux_perform.modes[0] = STARPU_RW;
        p2p_redux_perform.modes[1] = STARPU_R;
        p2p_redux_perform.name = "p2p_redux_perform";

        memset(&p2p_redux_read, 0, sizeof(p2p_redux_read));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            p2p_redux_read.cpu_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            p2p_redux_read.cuda_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            p2p_redux_read.opencl_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_OPENCL;
        }
#endif
        p2p_redux_read.nbuffers = 1;
        p2p_redux_read.modes[0] = STARPU_R;
        p2p_redux_read.name = "p2p_redux_read";
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
769
    }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
770

771 772 773
    /** dealloc in a starpu way all the defined handles */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
774 775 776 777
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
778
            }
779
            cellHandles[idxLevel].clear();
780 781
        }
        {
782 783 784
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
785
            }
786
            particleHandles.clear();
787 788 789 790
        }
    }

    ////////////////////////////////////////////////////////////////////////////
791 792

    void initCodeletMpi(){
793
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
794
#ifdef STARPU_USE_CPU
795
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
796 797 798
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
799
#endif
800
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
801
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
802 803 804 805
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
806
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
807
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
808 809 810
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
811
#endif
812 813
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
814
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
815
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
816
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";
817

818
        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
819
#ifdef STARPU_USE_CPU
820
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
821 822 823
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
824
#endif
825
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
826
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
827 828 829 830
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
831
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
832
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
833 834 835
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
836
#endif
837 838
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
839
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
840 841
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
842
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
843 844
    }

845 846 847 848 849 850
    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    struct BlockDescriptor{
        MortonIndex firstIndex;
        MortonIndex lastIndex;
        int owner;
        int bufferSize;
851 852 853 854
        size_t bufferSizeSymb;
        size_t bufferSizeUp;
        size_t bufferSizeDown;
        size_t leavesBufferSize;
855 856 857 858 859 860 861 862
    };
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    std::vector<int> nbBlocksPerLevelAll;
    std::vector<int> nbBlocksBeforeMinPerLevel;

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;

863
    struct RemoteHandle{
864 865 866 867
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
868 869
        }

870 871 872 873 874 875
        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
876 877

        int intervalSize;
878 879 880 881 882
    };

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;

883
    void buildRemoteInteractionsAndHandles(){
884 885
        cleanHandleMpi();

886 887
        // We need to have information about all other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
888
        nbBlocksPerLevel[0] = 0;
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Count and displacement for the MPI all gatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
932 933 934
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();
935 936 937 938 939 940 941 942 943 944 945 946 947

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all other
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
948 949 950 951 952 953 954
        // Prepare remate ptr and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

955 956 957 958 959 960 961 962
        // From now we have the number of blocks for all process
        // we also have the size of the blocks therefor we can
        // create the handles we need
        // We will now detect the relation between our blocks and others
        // During the M2M (which is the same for the L2L)
        // During the M2L and during the P2P
        // I need to insert the task that read my data or that write the data I need.
        // M2L
963
        externalInteractionsAllLevelMpi.clear();
964 965 966 967 968 969 970 971 972 973 974 975 976 977
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What are my morton interval at this level
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
978
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);
979 980 981

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

BRAMAS Berenger's avatar
BRAMAS Berenger committed
982
#pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
983 984 985
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;

986 987 988 989 990 991 992 993 994 995 996 997 998 999
                    for(int idxCell = 0 ; idxCell < currentCells->getNumberOfCellsInBlock() ; ++idxCell){
                        const MortonIndex mindex = currentCells->getCellMortonIndex(idxCell);

                        MortonIndex interactionsIndexes[189];
                        int interactionsPosition[189];
                        const FTreeCoordinate coord(mindex, idxLevel);
                        int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                            // This interactions need a block owned by someoneelse
                            if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                OutOfBlockInteraction property;
                                property.insideIndex = mindex;
                                property.outIndex    = interactionsIndexes[idxInter];
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1000
                                property.relativeOutPosition = interactionsPosition[idxInter];
1001
                                property.insideIdxInBlock = idxCell;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1002
                                property.outsideIdxInBlock = -1;
1003
                                outsideInteractions.push_back(property);
1004 1005 1006 1007 1008 1009 1010 1011 1012
                            }
                        }
                    }

                    // Manage outofblock interaction
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1013
                        && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
1014 1015 1016 1017 1018 1019 1020 1021 1022
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1023 1024
                        const MortonIndex blockStartIdxOther = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;
1025

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1026
                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
1027 1028 1029 1030
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1031
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
1032 1033 1034 1035 1036 1037
                            lastOutInteraction += 1;
                        }

                        // Create interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
1038
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1039
#pragma omp critical(CreateM2LRemotes)
1040
                                {
1041
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1042
                                        const size_t nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
1043
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
1044
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
1045 1046
                                        starpu_vector_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb, 1);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1047
                                        const size_t nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
1048
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
1049
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
1050 1051
                                        starpu_vector_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp, 1);
1052 1053
                                    }
                                }
1054 1055
                            }

1056 1057
                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
1058
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076
                            interactions->otherBlockId = idxOtherGroup;
                            interactions->interactions.resize(nbInteractionsBetweenBlocks);
                            std::copy(outsideInteractions.begin() + currentOutInteraction,
                                      outsideInteractions.begin() + lastOutInteraction,
                                      interactions->interactions.begin());
                        }

                        currentOutInteraction = lastOutInteraction;
                    }
                }
            }
        }
        // P2P
        // We create one big vector per block
        {
            const MortonIndex myFirstIndex = tree->getParticleGroup(0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getParticleGroup(tree->getNbParticleGroup()-1)->getEndingIndex();

1077
            externalInteractionsLeafLevelMpi.clear();
1078 1079