FGroupTaskStarpuMpiAlgorithm.hpp 130 KB
Newer Older
1 2 3 4
// Keep in private GIT
#ifndef FGROUPTASKSTARPUMPIALGORITHM_HPP
#define FGROUPTASKSTARPUMPIALGORITHM_HPP

5 6 7 8 9 10 11 12 13
#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"
#include "../../Utils/FAssert.hpp"
14
#include "../../Utils/FEnv.hpp"
15 16

#include "../../Utils/FMpi.hpp"
17

18 19
#include "FOutOfBlockInteraction.hpp"

20 21 22 23 24 25
#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>
26
#include <starpu_mpi.h>
27
#include "../StarPUUtils/FStarPUUtils.hpp"
28
#include "../StarPUUtils/FStarPUFmmPriorities.hpp"
29 30
#include "../StarPUUtils/FStarPUFmmPrioritiesV2.hpp"
#include "../StarPUUtils/FStarPUReduxCpu.hpp"
31

32
#ifdef STARPU_USE_CPU
33
#include "../StarPUUtils/FStarPUCpuWrapper.hpp"
34
#endif
35
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
36 37 38 39 40
#include "../StarPUUtils/FStarPUCudaWrapper.hpp"
#include "../Cuda/FCudaEmptyKernel.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaGroupOfParticles.hpp"
#include "../Cuda/FCudaGroupOfCells.hpp"
41
#include "../Cuda/FCudaEmptyCellSymb.hpp"
42
#endif
43
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
44 45
#include "../StarPUUtils/FStarPUOpenClWrapper.hpp"
#include "../OpenCl/FOpenCLDeviceWrapper.hpp"
46
#include "../OpenCl/FEmptyOpenCLCode.hpp"
47
#endif
48

BRAMAS Berenger's avatar
BRAMAS Berenger committed
49 50
#include "../StarPUUtils/FStarPUReduxCpu.hpp"

51

52
template <class OctreeClass, class CellContainerClass, class KernelClass, class ParticleGroupClass, class StarPUCpuWrapperClass
BRAMAS Berenger's avatar
BRAMAS Berenger committed
53 54 55 56 57 58 59
          #ifdef SCALFMM_ENABLE_CUDA_KERNEL
          , class StarPUCudaWrapperClass = FStarPUCudaWrapper<KernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                                              FCudaGroupOfParticles<int, 0, 0, int>, FCudaGroupAttachedLeaf<int, 0, 0, int>, FCudaEmptyKernel<int> >
          #endif
          #ifdef SCALFMM_ENABLE_OPENCL_KERNEL
          , class StarPUOpenClWrapperClass = FStarPUOpenClWrapper<KernelClass, FOpenCLDeviceWrapper<KernelClass>>
          #endif
60
          >
61
class FGroupTaskStarPUMpiAlgorithm : public FAbstractAlgorithm {
62
protected:
63
    typedef FGroupTaskStarPUMpiAlgorithm<OctreeClass, CellContainerClass, KernelClass, ParticleGroupClass, StarPUCpuWrapperClass
64
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
65
    , StarPUCudaWrapperClass
66
#endif
67
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
68
    , StarPUOpenClWrapperClass
69 70
#endif
    > ThisClass;
71

72
    int getTag(const int inLevel, const MortonIndex mindex, const int mode) const{
73 74 75
        int shift = 0;
        int height = tree->getHeight();
        while(height) { shift += 1; height >>= 1; }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
76
        return int((((mindex<<shift) + inLevel) << 5) + mode);
77 78
    }

79 80 81 82 83 84 85 86 87
    const FMpi::FComm& comm;

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };

88 89 90 91
    struct CellHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t up;
        starpu_data_handle_t down;
92
        int intervalSize;
93 94 95 96 97
    };

    struct ParticleHandles{
        starpu_data_handle_t symb;
        starpu_data_handle_t down;
98
        int intervalSize;
99 100
    };

101 102 103 104
    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;

    OctreeClass*const tree;       //< The Tree
105
    KernelClass*const originalCpuKernel;
106

107 108
    std::vector<CellHandles>* cellHandles;
    std::vector<ParticleHandles> particleHandles;
109 110

    starpu_codelet p2m_cl;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
111 112
    starpu_codelet m2m_cl;
    starpu_codelet l2l_cl;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
113
    starpu_codelet l2l_cl_nocommute;
114 115 116 117
    starpu_codelet l2p_cl;

    starpu_codelet m2l_cl_in;
    starpu_codelet m2l_cl_inout;
118
    starpu_codelet m2l_cl_inout_mpi;
119 120 121

    starpu_codelet p2p_cl_in;
    starpu_codelet p2p_cl_inout;
122
    starpu_codelet p2p_cl_inout_mpi;
123

BRAMAS Berenger's avatar
BRAMAS Berenger committed
124 125 126 127 128 129 130

#ifdef STARPU_USE_REDUX
    starpu_codelet p2p_redux_init;
    starpu_codelet p2p_redux_perform;
    starpu_codelet p2p_redux_read;
#endif

131 132 133
    const bool noCommuteAtLastLevel;
    const bool noCommuteBetweenLevel;

134
#ifdef STARPU_USE_CPU
135
    StarPUCpuWrapperClass cpuWrapper;
136
#endif
137
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
138 139
    StarPUCudaWrapperClass cudaWrapper;
#endif
140
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
141 142
    StarPUOpenClWrapperClass openclWrapper;
#endif
143 144 145

    FStarPUPtrInterface wrappers;
    FStarPUPtrInterface* wrapperptr;
146

147
#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
148
    starpu_arbiter_t arbiterGlobal;
149
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
150 151 152 153 154 155 156 157 158 159 160

#ifdef STARPU_USE_TASK_NAME
    std::vector<std::unique_ptr<char[]>> m2mTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lTaskNames;
    std::vector<std::unique_ptr<char[]>> m2lOuterTaskNames;
    std::vector<std::unique_ptr<char[]>> l2lTaskNames;
    std::unique_ptr<char[]> p2mTaskNames;
    std::unique_ptr<char[]> l2pTaskNames;
    std::unique_ptr<char[]> p2pTaskNames;
    std::unique_ptr<char[]> p2pOuterTaskNames;
#endif
161 162 163
#ifdef SCALFMM_STARPU_USE_PRIO
    typedef FStarPUFmmPrioritiesV2 PrioClass;// FStarPUFmmPriorities
#endif
164
	std::list<char*> taskName;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
165

166
public:
167 168
    FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
        :   comm(inComm), tree(inTree), originalCpuKernel(inKernels),
169
          cellHandles(nullptr),
170 171
          noCommuteAtLastLevel(FEnv::GetBool("SCALFMM_NO_COMMUTE_LAST_L2L", true)),
          noCommuteBetweenLevel(FEnv::GetBool("SCALFMM_NO_COMMUTE_M2L_L2L", false)),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
172 173 174 175 176 177 178 179 180 181
      #ifdef STARPU_USE_CPU
          cpuWrapper(tree->getHeight()),
      #endif
      #ifdef SCALFMM_ENABLE_CUDA_KERNEL
          cudaWrapper(tree->getHeight()),
      #endif
      #ifdef SCALFMM_ENABLE_OPENCL_KERNEL
          openclWrapper(tree->getHeight()),
      #endif
          wrapperptr(&wrappers){
182 183 184
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");

185 186
        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

187 188
        struct starpu_conf conf;
        FAssertLF(starpu_conf_init(&conf) == 0);
189
#ifdef SCALFMM_STARPU_USE_PRIO
190
        PrioClass::Controller().init(&conf, tree->getHeight(), inKernels);
191
#endif
192
        FAssertLF(starpu_init(&conf) == 0);
193
        FAssertLF(starpu_mpi_init ( 0, 0, 0 ) == 0);
194

BRAMAS Berenger's avatar
BRAMAS Berenger committed
195 196
        starpu_malloc_set_align(32);

197 198
        starpu_pthread_mutex_t initMutex;
        starpu_pthread_mutex_init(&initMutex, NULL);
199
#ifdef STARPU_USE_CPU
200 201 202 203 204
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cpuWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
205
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
206
#endif
207
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
208 209 210 211 212 213 214
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            cudaWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
215
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
216 217 218 219 220 221
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&initMutex);
            openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
            starpu_pthread_mutex_unlock(&initMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
222
#endif
223 224
        starpu_pthread_mutex_destroy(&initMutex);

225 226
        starpu_pause();

227
        cellHandles   = new std::vector<CellHandles>[tree->getHeight()];
228

229
#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
230
        arbiterGlobal = starpu_arbiter_create();
231
#endif
232

BRAMAS Berenger's avatar
BRAMAS Berenger committed
233 234
        initCodelet();
        initCodeletMpi();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
235
        rebuildInteractions();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
236

BRAMAS Berenger's avatar
BRAMAS Berenger committed
237
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max Worker " << starpu_worker_get_count() << ")\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
238
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
239
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CPU " << starpu_cpu_worker_get_count() << ")\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
240
#endif
241
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
242 243
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max OpenCL " << starpu_opencl_worker_get_count() << ")\n");
#endif
244
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
245 246
        FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n");
#endif
247 248
        FLOG(FLog::Controller << "SCALFMM_NO_COMMUTE_LAST_L2L " << noCommuteAtLastLevel << "\n");
        FLOG(FLog::Controller << "SCALFMM_NO_COMMUTE_M2L_L2L " << noCommuteBetweenLevel << "\n");
BRAMAS Berenger's avatar
BRAMAS Berenger committed
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279

        buildTaskNames();
    }

    void buildTaskNames(){
#ifdef STARPU_USE_TASK_NAME
        const int namesLength = 128;
        m2mTaskNames.resize(tree->getHeight());
        m2lTaskNames.resize(tree->getHeight());
        m2lOuterTaskNames.resize(tree->getHeight());
        l2lTaskNames.resize(tree->getHeight());
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            m2mTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2mTaskNames[idxLevel].get(), namesLength, "M2M-level-%d", idxLevel);
            m2lTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2lTaskNames[idxLevel].get(), namesLength, "M2L-level-%d", idxLevel);
            m2lOuterTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(m2lOuterTaskNames[idxLevel].get(), namesLength, "M2L-out-level-%d", idxLevel);
            l2lTaskNames[idxLevel].reset(new char[namesLength]);
            snprintf(l2lTaskNames[idxLevel].get(), namesLength, "L2L-level-%d", idxLevel);
        }

        p2mTaskNames.reset(new char[namesLength]);
        snprintf(p2mTaskNames.get(), namesLength, "P2M");
        l2pTaskNames.reset(new char[namesLength]);
        snprintf(l2pTaskNames.get(), namesLength, "L2P");
        p2pTaskNames.reset(new char[namesLength]);
        snprintf(p2pTaskNames.get(), namesLength, "P2P");
        p2pOuterTaskNames.reset(new char[namesLength]);
        snprintf(p2pOuterTaskNames.get(), namesLength, "P2P-out");
#endif
280 281
    }

282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
    void syncData(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].up);
                starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R);
                starpu_data_release(cellHandles[idxLevel][idxHandle].down);
            }
        }
        {
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R);
                starpu_data_release(particleHandles[idxHandle].symb);
                starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R);
                starpu_data_release(particleHandles[idxHandle].down);
            }
        }
    }

303
    ~FGroupTaskStarPUMpiAlgorithm(){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
304 305
        starpu_resume();

306
        cleanHandle();
307
        cleanHandleMpi();
308
        delete[] cellHandles;
309

BRAMAS Berenger's avatar
BRAMAS Berenger committed
310 311 312 313 314 315 316 317 318 319
        starpu_pthread_mutex_t releaseMutex;
        starpu_pthread_mutex_init(&releaseMutex, NULL);
#ifdef STARPU_USE_CPU
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cpuWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CPU_IDX, &cpuWrapper);
#endif
320
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
321 322 323 324 325 326 327
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            cudaWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
328
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
329 330 331 332 333 334 335 336 337
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            starpu_pthread_mutex_lock(&releaseMutex);
            openclWrapper.releaseKernel(starpu_worker_get_id());
            starpu_pthread_mutex_unlock(&releaseMutex);
        });
        wrappers.set(FSTARPU_OPENCL_IDX, &openclWrapper);
#endif
        starpu_pthread_mutex_destroy(&releaseMutex);

338 339

#ifdef STARPU_SUPPORT_ARBITER
BRAMAS Berenger's avatar
BRAMAS Berenger committed
340
        starpu_arbiter_destroy(arbiterGlobal);
341
#endif
342 343
		for(char* ptr : taskName)
			free(ptr);
344
        starpu_mpi_shutdown();
345 346 347
        starpu_shutdown();
    }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
348
    void rebuildInteractions(){
349
        FAssertLF(getenv("OMP_WAIT_POLICY") == nullptr
BRAMAS Berenger's avatar
BRAMAS Berenger committed
350 351
                || strcmp(getenv("OMP_WAIT_POLICY"), "PASSIVE") == 0
                  || strcmp(getenv("OMP_WAIT_POLICY"), "passive") == 0);
352

BRAMAS Berenger's avatar
BRAMAS Berenger committed
353 354
#pragma omp parallel
#pragma omp single
355 356 357
        buildExternalInteractionVecs();
        buildHandles();

BRAMAS Berenger's avatar
BRAMAS Berenger committed
358 359
#pragma omp parallel
#pragma omp single
360
        buildRemoteInteractionsAndHandles();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
361 362
    }

363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
#ifdef STARPU_USE_CPU
    void forEachCpuWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, func);
        starpu_pause();
    }

    void forEachCpuWorker(std::function<void(KernelClass*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CPU, [&](){
            func(cpuWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
    void forEachCudaWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, func);
        starpu_pause();
    }
    void forEachCudaWorker(std::function<void(void*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [&](){
            func(cudaWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
    void forEachOpenCLWorker(std::function<void(void)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, func);
        starpu_pause();
    }
    void forEachOpenCLWorker(std::function<void(void*)> func){
        starpu_resume();
        FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
            func(openclWrapper.getKernel(starpu_worker_get_id()));
        });
        starpu_pause();
    }
#endif

BRAMAS Berenger's avatar
BRAMAS Berenger committed
407
protected:
408

BRAMAS Berenger's avatar
BRAMAS Berenger committed
409 410 411 412 413 414 415 416 417 418 419
    /**
      * Runs the complete algorithm.
      */
    void executeCore(const unsigned operationsToProceed) override {
        FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" );
        const bool directOnly = (tree->getHeight() <= 2);

#ifdef STARPU_USE_CPU
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.start());
#endif

420 421
        FLOG(FTic timerSoumission);

422
        starpu_resume();
423
        postRecvAllocatedBlocks();
424

425
        if( operationsToProceed & FFmmP2P ) insertParticlesSend();
426

427 428 429
        if( operationsToProceed & FFmmP2P ) directPass();
        if( operationsToProceed & FFmmP2P ) directPassMpi();

430
        if(operationsToProceed & FFmmP2M && !directOnly) bottomPass();
431

BRAMAS Berenger's avatar
BRAMAS Berenger committed
432
        if(operationsToProceed & FFmmM2M && !directOnly) upwardPass();
433
        if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend();
434

BRAMAS Berenger's avatar
BRAMAS Berenger committed
435 436 437
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::upperWorkingLevel, FAbstractAlgorithm::lowerWorkingLevel-1 , true, true);
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::lowerWorkingLevel-1, FAbstractAlgorithm::lowerWorkingLevel, false, false);
        if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi();
438

BRAMAS Berenger's avatar
BRAMAS Berenger committed
439
        if(operationsToProceed & FFmmL2L && !directOnly) downardPass();
440

BRAMAS Berenger's avatar
BRAMAS Berenger committed
441
        if(operationsToProceed & FFmmM2L && !directOnly) transferPass(FAbstractAlgorithm::lowerWorkingLevel-1, FAbstractAlgorithm::lowerWorkingLevel, true, true);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
442

BRAMAS Berenger's avatar
BRAMAS Berenger committed
443 444 445 446
        if( operationsToProceed & FFmmL2P && !directOnly) mergePass();
#ifdef STARPU_USE_REDUX
        if( operationsToProceed & FFmmL2P && !directOnly) readParticle();
#endif
447

448
        FLOG( FLog::Controller << "\t\t Submitting the tasks took " << timerSoumission.tacAndElapsed() << "s\n" );
449
        starpu_task_wait_for_all();
450 451 452 453 454

        FLOG( FTic timerSync; );
        syncData();
        FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );

455
        starpu_pause();
BRAMAS Berenger's avatar
BRAMAS Berenger committed
456 457 458 459 460

#ifdef STARPU_USE_CPU
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.end());
        FTIME_TASKS(cpuWrapper.taskTimeRecorder.saveToDisk("/tmp/taskstime-FGroupTaskStarPUAlgorithm.txt"));
#endif
461
    }
462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
	char* getTaskNameP2M(char const* const task_type, int idxGroup, int rank) {
		char* name = nullptr;
		asprintf(&name, "%s_%lld_%lld_%d", task_type, tree->getParticleGroup(idxGroup)->getStartingIndex(), tree->getParticleGroup(idxGroup)->getEndingIndex(), rank);
		taskName.push_front(name);
		return name;
	}
	char* getTaskNameM2M(char const* const task_type, int idxLevel, int idxGroup, int idxLevel2, int idxGroup2, int rank) {
		char* name = nullptr;
		MortonIndex start, end, start2, end2;
		start = tree->getCellGroup(idxLevel, idxGroup)->getStartingIndex();
		end = tree->getCellGroup(idxLevel, idxGroup)->getEndingIndex();
		start2 = tree->getCellGroup(idxLevel2, idxGroup2)->getStartingIndex();
		end2 = tree->getCellGroup(idxLevel2, idxGroup2)->getEndingIndex();
		asprintf(&name, "%s_%d_%lld_%lld_%lld_%lld_%d", task_type, idxLevel, start, end, start2, end2, rank);
		taskName.push_front(name);
		return name;
	}
	char* getTaskNameM2MUsingInfo(char const* const task_type, int idxLevel, int idxGroup, int idxLevel2, int idxGroup2, int rank) {
		char* name = nullptr;
		MortonIndex start, end, start2, end2;
		start = tree->getCellGroup(idxLevel, idxGroup)->getStartingIndex();
		end = tree->getCellGroup(idxLevel, idxGroup)->getEndingIndex();
		start2 = processesBlockInfos[idxLevel2][idxGroup2].firstIndex;
		end2 = processesBlockInfos[idxLevel2][idxGroup2].lastIndex;
		asprintf(&name, "%s_%d_%lld_%lld_%lld_%lld_%d", task_type, idxLevel, start, end, start2, end2, rank);
		taskName.push_front(name);
		return name;
	}
	char* getTaskNameM2MUsingInfoRevert(char const* const task_type, int idxLevel, int idxGroup, int idxLevel2, int idxGroup2, int rank) {
		char* name = nullptr;
		MortonIndex start, end, start2, end2;
		start = processesBlockInfos[idxLevel][idxGroup].firstIndex;
		end = processesBlockInfos[idxLevel][idxGroup].lastIndex;
		start2 = tree->getCellGroup(idxLevel2, idxGroup2)->getStartingIndex();
		end2 = tree->getCellGroup(idxLevel2, idxGroup2)->getEndingIndex();
		asprintf(&name, "%s_%d_%lld_%lld_%lld_%lld_%d", task_type, idxLevel, start, end, start2, end2, rank);
		taskName.push_front(name);
		return name;
	}
	char* getTaskNameP2P(char const* const task_type, int idxGroup, int idxGroup2, int rank) {
		char* name = nullptr;
		MortonIndex start, end, start2, end2;
		start = tree->getParticleGroup(idxGroup)->getStartingIndex();
		end = tree->getParticleGroup(idxGroup)->getEndingIndex();
		start2 = tree->getParticleGroup(idxGroup2)->getStartingIndex();
		end2 = tree->getParticleGroup(idxGroup2)->getEndingIndex();
		asprintf(&name, "%s_%lld_%lld_%lld_%lld_%d", task_type, start, end, start2, end2, rank);
		taskName.push_front(name);
		return name;
	}
	char* getTaskNameP2PUsingInfo(char const* const task_type, int idxGroup, int idxGroup2, int rank) {
		char* name = nullptr;
		MortonIndex start, end, start2, end2;
		start = tree->getParticleGroup(idxGroup)->getStartingIndex();
		end = tree->getParticleGroup(idxGroup)->getEndingIndex();
		start2 = processesBlockInfos[tree->getHeight()-1][idxGroup2].firstIndex;
		end2 = processesBlockInfos[tree->getHeight()-1][idxGroup2].lastIndex;
		asprintf(&name, "%s_%lld_%lld_%lld_%lld_%d", task_type, start, end, start2, end2, rank);
		taskName.push_front(name);
		return name;
	}
523

524

525 526
    void initCodelet(){
        memset(&p2m_cl, 0, sizeof(p2m_cl));
527
#ifdef STARPU_USE_CPU
528
        if(originalCpuKernel->supportP2M(FSTARPU_CPU_IDX)){
529 530 531
            p2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CPU;
        }
532
#endif
533
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
534
        if(originalCpuKernel->supportP2M(FSTARPU_CUDA_IDX)){
535 536 537 538
            p2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_CUDA;
        }
#endif
539
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
540
        if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
541 542 543
            p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
            p2m_cl.where |= STARPU_OPENCL;
        }
544
#endif
545 546 547 548
        p2m_cl.nbuffers = 3;
        p2m_cl.modes[0] = STARPU_R;
        p2m_cl.modes[1] = STARPU_RW;
        p2m_cl.modes[2] = STARPU_R;
549
        p2m_cl.name = "p2m_cl";
550

BRAMAS Berenger's avatar
BRAMAS Berenger committed
551
        memset(&m2m_cl, 0, sizeof(m2m_cl));
552
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
553 554 555 556
        if(originalCpuKernel->supportM2M(FSTARPU_CPU_IDX)){
            m2m_cl.cpu_funcs[0] = StarPUCpuWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CPU;
        }
557
#endif
558
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
559 560 561 562
        if(originalCpuKernel->supportM2M(FSTARPU_CUDA_IDX)){
            m2m_cl.cuda_funcs[0] = StarPUCudaWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_CUDA;
        }
563
#endif
564
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
565 566 567 568
        if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
            m2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
            m2m_cl.where |= STARPU_OPENCL;
        }
569
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
570 571 572 573 574 575 576 577 578
        m2m_cl.nbuffers = 4;
        m2m_cl.dyn_modes = (starpu_data_access_mode*)malloc(m2m_cl.nbuffers*sizeof(starpu_data_access_mode));
        m2m_cl.dyn_modes[0] = STARPU_R;
        m2m_cl.dyn_modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2m_cl.name = "m2m_cl";
        m2m_cl.dyn_modes[2] = STARPU_R;
        m2m_cl.dyn_modes[3] = STARPU_R;

        memset(&l2l_cl, 0, sizeof(l2l_cl));
579
#ifdef STARPU_USE_CPU
BRAMAS Berenger's avatar
BRAMAS Berenger committed
580 581 582 583
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CPU;
        }
584
#endif
585
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
586 587 588 589
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_CUDA;
        }
590
#endif
591
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
BRAMAS Berenger's avatar
BRAMAS Berenger committed
592 593 594
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl.where |= STARPU_OPENCL;
595
        }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
596 597 598 599 600 601 602 603
#endif
        l2l_cl.nbuffers = 4;
        l2l_cl.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl.dyn_modes[0] = STARPU_R;
        l2l_cl.dyn_modes[1] = STARPU_R;
        l2l_cl.name = "l2l_cl";
        l2l_cl.dyn_modes[2] = STARPU_R;
        l2l_cl.dyn_modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
604

BRAMAS Berenger's avatar
BRAMAS Berenger committed
605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631
        memset(&l2l_cl_nocommute, 0, sizeof(l2l_cl_nocommute));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2L(FSTARPU_CPU_IDX)){
            l2l_cl_nocommute.cpu_funcs[0] = StarPUCpuWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_CUDA_IDX)){
            l2l_cl_nocommute.cuda_funcs[0] = StarPUCudaWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
            l2l_cl_nocommute.opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
            l2l_cl_nocommute.where |= STARPU_OPENCL;
        }
#endif
        l2l_cl_nocommute.nbuffers = 4;
        l2l_cl_nocommute.dyn_modes = (starpu_data_access_mode*)malloc(l2l_cl_nocommute.nbuffers*sizeof(starpu_data_access_mode));
        l2l_cl_nocommute.dyn_modes[0] = STARPU_R;
        l2l_cl_nocommute.dyn_modes[1] = STARPU_R;
        l2l_cl_nocommute.name = "l2l_cl";
        l2l_cl_nocommute.dyn_modes[2] = STARPU_R;
        l2l_cl_nocommute.dyn_modes[3] = STARPU_RW;

632
        memset(&l2p_cl, 0, sizeof(l2p_cl));
633
#ifdef STARPU_USE_CPU
634
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
635 636 637
            l2p_cl.cpu_funcs[0] = StarPUCpuWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CPU;
        }
638
#endif
639
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
640
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
641 642 643 644
            l2p_cl.cuda_funcs[0] = StarPUCudaWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_CUDA;
        }
#endif
645
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
646
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
647 648 649
            l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
            l2p_cl.where |= STARPU_OPENCL;
        }
650
#endif
651
        l2p_cl.nbuffers = 4;
652
        l2p_cl.modes[0] = STARPU_R;
653 654
        l2p_cl.modes[1] = STARPU_R;
        l2p_cl.modes[2] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
655 656 657
#ifdef STARPU_USE_REDUX
        l2p_cl.modes[3] = STARPU_REDUX;
#else
658
        l2p_cl.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
659
#endif
660
        l2p_cl.name = "l2p_cl";
661 662

        memset(&p2p_cl_in, 0, sizeof(p2p_cl_in));
663
#ifdef STARPU_USE_CPU
664
        if(originalCpuKernel->supportP2P(FSTARPU_CPU_IDX)){
665 666 667
            p2p_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CPU;
        }
668
#endif
669
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
670
        if(originalCpuKernel->supportP2P(FSTARPU_CUDA_IDX)){
671 672 673 674
            p2p_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_CUDA;
        }
#endif
675
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
676
        if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
677 678 679
            p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
            p2p_cl_in.where |= STARPU_OPENCL;
        }
680
#endif
681 682
        p2p_cl_in.nbuffers = 2;
        p2p_cl_in.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
683 684 685
#ifdef STARPU_USE_REDUX
        p2p_cl_in.modes[1] = STARPU_REDUX;
#else
686
        p2p_cl_in.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
687
#endif
688
        p2p_cl_in.name = "p2p_cl_in";
689
        memset(&p2p_cl_inout, 0, sizeof(p2p_cl_inout));
690
#ifdef STARPU_USE_CPU
691
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CPU_IDX)){
692 693 694
            p2p_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CPU;
        }
695
#endif
696
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
697
        if(originalCpuKernel->supportP2PExtern(FSTARPU_CUDA_IDX)){
698 699 700 701
            p2p_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_CUDA;
        }
#endif
702
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
703
        if(originalCpuKernel->supportP2PExtern(FSTARPU_OPENCL_IDX)){
704 705 706
            p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
            p2p_cl_inout.where |= STARPU_OPENCL;
        }
707
#endif
708 709
        p2p_cl_inout.nbuffers = 4;
        p2p_cl_inout.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
710 711 712
#ifdef STARPU_USE_REDUX
        p2p_cl_inout.modes[1] = STARPU_REDUX;
#else
713
        p2p_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
714
#endif
715
        p2p_cl_inout.modes[2] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
716 717 718
#ifdef STARPU_USE_REDUX
        p2p_cl_inout.modes[3] = STARPU_REDUX;
#else
719
        p2p_cl_inout.modes[3] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
720
#endif
721
        p2p_cl_inout.name = "p2p_cl_inout";
722 723

        memset(&m2l_cl_in, 0, sizeof(m2l_cl_in));
724
#ifdef STARPU_USE_CPU
725
        if(originalCpuKernel->supportM2L(FSTARPU_CPU_IDX)){
726 727 728
            m2l_cl_in.cpu_funcs[0] = StarPUCpuWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CPU;
        }
729
#endif
730
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
731
        if(originalCpuKernel->supportM2L(FSTARPU_CUDA_IDX)){
732 733 734 735
            m2l_cl_in.cuda_funcs[0] = StarPUCudaWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_CUDA;
        }
#endif
736
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
737
        if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
738 739 740
            m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
            m2l_cl_in.where |= STARPU_OPENCL;
        }
741
#endif
742 743
        m2l_cl_in.nbuffers = 3;
        m2l_cl_in.modes[0] = STARPU_R;
744
        m2l_cl_in.modes[1] = STARPU_R;
745
        m2l_cl_in.modes[2] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
746
        m2l_cl_in.name = "m2l_cl_in";
BRAMAS Berenger's avatar
BRAMAS Berenger committed
747

748
        memset(&m2l_cl_inout, 0, sizeof(m2l_cl_inout));
749
#ifdef STARPU_USE_CPU
750
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CPU_IDX)){
751 752 753
            m2l_cl_inout.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CPU;
        }
754
#endif
755
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
756
        if(originalCpuKernel->supportM2LExtern(FSTARPU_CUDA_IDX)){
757 758 759 760
            m2l_cl_inout.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_CUDA;
        }
#endif
761
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
762
        if(originalCpuKernel->supportM2LExtern(FSTARPU_OPENCL_IDX)){
763 764 765
            m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
            m2l_cl_inout.where |= STARPU_OPENCL;
        }
766
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
767
        m2l_cl_inout.nbuffers = 4;
768
        m2l_cl_inout.modes[0] = STARPU_R;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
769 770
        m2l_cl_inout.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
        m2l_cl_inout.modes[2] = STARPU_R;
771
        m2l_cl_inout.modes[3] = STARPU_R;
772
        m2l_cl_inout.name = "m2l_cl_inout";
BRAMAS Berenger's avatar
BRAMAS Berenger committed
773

BRAMAS Berenger's avatar
BRAMAS Berenger committed
774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816
#ifdef STARPU_USE_REDUX
        memset(&p2p_redux_init, 0, sizeof(p2p_redux_init));
#ifdef STARPU_USE_CPU
        p2p_redux_init.cpu_funcs[0] = FStarPUReduxCpu::InitData<typename ParticleGroupClass::ParticleDataType>;
        p2p_redux_init.where |= STARPU_CPU;
#endif
        p2p_redux_init.nbuffers = 1;
        p2p_redux_init.modes[0] = STARPU_RW;
        p2p_redux_init.name = "p2p_redux_init";

        memset(&p2p_redux_perform, 0, sizeof(p2p_redux_perform));
#ifdef STARPU_USE_CPU
        p2p_redux_perform.cpu_funcs[0] = FStarPUReduxCpu::ReduceData<typename ParticleGroupClass::ParticleDataType>;
        p2p_redux_perform.where |= STARPU_CPU;
#endif
        p2p_redux_perform.nbuffers = 2;
        p2p_redux_perform.modes[0] = STARPU_RW;
        p2p_redux_perform.modes[1] = STARPU_R;
        p2p_redux_perform.name = "p2p_redux_perform";

        memset(&p2p_redux_read, 0, sizeof(p2p_redux_read));
#ifdef STARPU_USE_CPU
        if(originalCpuKernel->supportL2P(FSTARPU_CPU_IDX)){
            p2p_redux_read.cpu_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_CPU;
        }
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_CUDA_IDX)){
            p2p_redux_read.cuda_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_CUDA;
        }
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
        if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
            p2p_redux_read.opencl_funcs[0] = FStarPUReduxCpu::EmptyCodelet<typename ParticleGroupClass::ParticleDataType>;
            p2p_redux_read.where |= STARPU_OPENCL;
        }
#endif
        p2p_redux_read.nbuffers = 1;
        p2p_redux_read.modes[0] = STARPU_R;
        p2p_redux_read.name = "p2p_redux_read";
#endif
BRAMAS Berenger's avatar
BRAMAS Berenger committed
817
    }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
818

819 820 821
    /** dealloc in a starpu way all the defined handles */
    void cleanHandle(){
        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
822 823 824 825
            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].symb);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].up);
                starpu_data_unregister(cellHandles[idxLevel][idxHandle].down);
826
            }
827
            cellHandles[idxLevel].clear();
828 829
        }
        {
830 831 832
            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
                starpu_data_unregister(particleHandles[idxHandle].symb);
                starpu_data_unregister(particleHandles[idxHandle].down);
833
            }
834
            particleHandles.clear();
835 836 837 838
        }
    }

    ////////////////////////////////////////////////////////////////////////////
839 840

    void initCodeletMpi(){
841
        memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
842
#ifdef STARPU_USE_CPU
843
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CPU_IDX)){
844 845 846
            p2p_cl_inout_mpi.where |= STARPU_CPU;
            p2p_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::directInoutPassCallbackMpi;
        }
847
#endif
848
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
849
        if(originalCpuKernel->supportP2PMpi(FSTARPU_CUDA_IDX)){
850 851 852 853
            p2p_cl_inout_mpi.where |= STARPU_CUDA;
            p2p_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::directInoutPassCallbackMpi;
        }
#endif
854
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
855
        if(originalCpuKernel->supportP2PMpi(FSTARPU_OPENCL_IDX)){
856 857 858
            p2p_cl_inout_mpi.where |= STARPU_OPENCL;
            p2p_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallbackMpi;
        }
859
#endif
860 861
        p2p_cl_inout_mpi.nbuffers = 3;
        p2p_cl_inout_mpi.modes[0] = STARPU_R;
862
        p2p_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
863
        p2p_cl_inout_mpi.modes[2] = STARPU_R;
864
        p2p_cl_inout_mpi.name = "p2p_cl_inout_mpi";
865

866
        memset(&m2l_cl_inout_mpi, 0, sizeof(m2l_cl_inout_mpi));
867
#ifdef STARPU_USE_CPU
868
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CPU_IDX)){
869 870 871
            m2l_cl_inout_mpi.where |= STARPU_CPU;
            m2l_cl_inout_mpi.cpu_funcs[0] = StarPUCpuWrapperClass::transferInoutPassCallbackMpi;
        }
872
#endif
873
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
874
        if(originalCpuKernel->supportM2LMpi(FSTARPU_CUDA_IDX)){
875 876 877 878
            m2l_cl_inout_mpi.where |= STARPU_CUDA;
            m2l_cl_inout_mpi.cuda_funcs[0] = StarPUCudaWrapperClass::transferInoutPassCallbackMpi;
        }
#endif
879
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
880
        if(originalCpuKernel->supportM2LMpi(FSTARPU_OPENCL_IDX)){
881 882 883
            m2l_cl_inout_mpi.where |= STARPU_OPENCL;
            m2l_cl_inout_mpi.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallbackMpi;
        }
884
#endif
885 886
        m2l_cl_inout_mpi.nbuffers = 4;
        m2l_cl_inout_mpi.modes[0] = STARPU_R;
887
        m2l_cl_inout_mpi.modes[1] = starpu_data_access_mode(STARPU_RW|STARPU_COMMUTE_IF_SUPPORTED);
888 889
        m2l_cl_inout_mpi.modes[2] = STARPU_R;
        m2l_cl_inout_mpi.modes[3] = STARPU_R;
890
        m2l_cl_inout_mpi.name = "m2l_cl_inout_mpi";
891 892
    }

893 894 895 896 897 898
    std::vector<std::pair<MortonIndex,MortonIndex>> processesIntervalPerLevels;
    struct BlockDescriptor{
        MortonIndex firstIndex;
        MortonIndex lastIndex;
        int owner;
        int bufferSize;
899 900 901 902
        size_t bufferSizeSymb;
        size_t bufferSizeUp;
        size_t bufferSizeDown;
        size_t leavesBufferSize;
903 904 905 906 907 908 909 910
    };
    std::vector<std::vector<BlockDescriptor>> processesBlockInfos;
    std::vector<int> nbBlocksPerLevelAll;
    std::vector<int> nbBlocksBeforeMinPerLevel;

    std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevelMpi;
    std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevelMpi;

911
    struct RemoteHandle{
912 913 914 915
        RemoteHandle() : ptrSymb(nullptr), ptrUp(nullptr), ptrDown(nullptr){
            memset(&handleSymb, 0, sizeof(handleSymb));
            memset(&handleUp, 0, sizeof(handleUp));
            memset(&handleDown, 0, sizeof(handleDown));
916 917
        }

918 919 920 921 922 923
        unsigned char * ptrSymb;
        starpu_data_handle_t handleSymb;
        unsigned char * ptrUp;
        starpu_data_handle_t handleUp;
        unsigned char * ptrDown;
        starpu_data_handle_t handleDown;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
924 925

        int intervalSize;
926 927 928 929 930
    };

    std::vector<std::vector<RemoteHandle>> remoteCellGroups;
    std::vector<RemoteHandle> remoteParticleGroupss;

931
    void buildRemoteInteractionsAndHandles(){
932 933
        cleanHandleMpi();

934 935
        // We need to have information about all other blocks
        std::unique_ptr<int[]> nbBlocksPerLevel(new int[tree->getHeight()]);
936
        nbBlocksPerLevel[0] = 0;
937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksPerLevel[idxLevel] = tree->getNbCellGroupAtLevel(idxLevel);
        }
        // Exchange the number of blocks per proc
        nbBlocksPerLevelAll.resize(tree->getHeight() * comm.processCount());
        FMpi::Assert(MPI_Allgather(nbBlocksPerLevel.get(), tree->getHeight(), MPI_INT,
                                   nbBlocksPerLevelAll.data(), tree->getHeight(), MPI_INT,
                                   comm.getComm()), __LINE__);
        // Compute the number of blocks before mine
        nbBlocksBeforeMinPerLevel.resize(tree->getHeight());
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            nbBlocksBeforeMinPerLevel[idxLevel] = 0;
            for(int idxProc = 0 ; idxProc < comm.processId() ; ++idxProc){
                nbBlocksBeforeMinPerLevel[idxLevel] += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
            }
        }
        // Prepare the block infos
        processesBlockInfos.resize(tree->getHeight());
        std::unique_ptr<int[]> recvBlocksCount(new int[comm.processCount()]);
        std::unique_ptr<int[]> recvBlockDispl(new int[comm.processCount()]);
        // Exchange the block info per level
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            // Count the total number of blocks
            int nbBlocksInLevel = 0;
            recvBlockDispl[0] = 0;
            for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){
                nbBlocksInLevel += nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel];
                // Count and displacement for the MPI all gatherv
                recvBlocksCount[idxProc] = nbBlocksPerLevelAll[idxProc*tree->getHeight() + idxLevel] * int(sizeof(BlockDescriptor));
                if(idxProc) recvBlockDispl[idxProc] = recvBlockDispl[idxProc-1] + recvBlocksCount[idxProc-1];
            }
            processesBlockInfos[idxLevel].resize(nbBlocksInLevel);
            // Fill my blocks
            std::vector<BlockDescriptor> myBlocksAtLevel;
            myBlocksAtLevel.resize(nbBlocksPerLevel[idxLevel]);
            FAssertLF(tree->getNbCellGroupAtLevel(idxLevel) == int(myBlocksAtLevel.size()));
            FAssertLF(nbBlocksPerLevel[idxLevel] == nbBlocksPerLevelAll[comm.processId()*tree->getHeight() + idxLevel]);

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
                CellContainerClass*const currentCells = tree->getCellGroup(idxLevel, idxGroup);
                myBlocksAtLevel[idxGroup].firstIndex = currentCells->getStartingIndex();
                myBlocksAtLevel[idxGroup].lastIndex  = currentCells->getEndingIndex();
                myBlocksAtLevel[idxGroup].owner = comm.processId();
980 981 982
                myBlocksAtLevel[idxGroup].bufferSizeSymb = currentCells->getBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeUp   = currentCells->getMultipoleBufferSizeInByte();
                myBlocksAtLevel[idxGroup].bufferSizeDown = currentCells->getLocalBufferSizeInByte();
983 984 985 986 987 988 989 990 991 992 993 994 995

                if(idxLevel == tree->getHeight() - 1){
                    myBlocksAtLevel[idxGroup].leavesBufferSize = tree->getParticleGroup(idxGroup)->getBufferSizeInByte();
                }
                else{
                    myBlocksAtLevel[idxGroup].leavesBufferSize = 0;
                }
            }
            // Exchange with all other
            FMpi::Assert(MPI_Allgatherv(myBlocksAtLevel.data(), int(myBlocksAtLevel.size()*sizeof(BlockDescriptor)), MPI_BYTE,
                                        processesBlockInfos[idxLevel].data(), recvBlocksCount.get(), recvBlockDispl.get(), MPI_BYTE,
                                        comm.getComm()), __LINE__);
        }
996 997 998 999 1000 1001 1002
        // Prepare remate ptr and handles
        remoteCellGroups.resize( tree->getHeight() );
        for(int idxLevel = 1 ; idxLevel < tree->getHeight() ; ++idxLevel){
            remoteCellGroups[idxLevel].resize( processesBlockInfos[idxLevel].size());
        }
        remoteParticleGroupss.resize(processesBlockInfos[tree->getHeight()-1].size());

1003 1004 1005 1006 1007 1008 1009 1010
        // From now we have the number of blocks for all process
        // we also have the size of the blocks therefor we can
        // create the handles we need
        // We will now detect the relation between our blocks and others
        // During the M2M (which is the same for the L2L)
        // During the M2L and during the P2P
        // I need to insert the task that read my data or that write the data I need.
        // M2L
1011
        externalInteractionsAllLevelMpi.clear();
1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
        externalInteractionsAllLevelMpi.resize(tree->getHeight());
        for(int idxLevel = tree->getHeight()-1 ; idxLevel >= 2 ; --idxLevel){
            // From this level there are no more blocks
            if(tree->getNbCellGroupAtLevel(idxLevel) == 0){
                // We stop here
                break;
            }
            // What are my morton interval at this level
            const MortonIndex myFirstIndex = tree->getCellGroup(idxLevel, 0)->getStartingIndex();
            const MortonIndex myLastIndex = tree->getCellGroup(idxLevel, tree->getNbCellGroupAtLevel(idxLevel)-1)->getEndingIndex();

            externalInteractionsAllLevelMpi[idxLevel].resize(tree->getNbCellGroupAtLevel(idxLevel));

            for(int idxGroup = 0 ; idxGroup < tree->getNbCellGroupAtLevel(idxLevel) ; ++idxGroup){
1026
                CellContainerClass* currentCells = tree->getCellGroup(idxLevel, idxGroup);
1027 1028 1029

                std::vector<BlockInteractions<CellContainerClass>>* externalInteractions = &externalInteractionsAllLevelMpi[idxLevel][idxGroup];

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1030
#pragma omp task default(none) firstprivate(idxGroup, currentCells, idxLevel, externalInteractions)
1031 1032 1033
                {
                    std::vector<OutOfBlockInteraction> outsideInteractions;

1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
                    for(int idxCell = 0 ; idxCell < currentCells->getNumberOfCellsInBlock() ; ++idxCell){
                        const MortonIndex mindex = currentCells->getCellMortonIndex(idxCell);

                        MortonIndex interactionsIndexes[189];
                        int interactionsPosition[189];
                        const FTreeCoordinate coord(mindex, idxLevel);
                        int counter = coord.getInteractionNeighbors(idxLevel,interactionsIndexes,interactionsPosition);

                        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
                            // This interactions need a block owned by someoneelse
                            if(interactionsIndexes[idxInter] < myFirstIndex || myLastIndex <= interactionsIndexes[idxInter]){
                                OutOfBlockInteraction property;
                                property.insideIndex = mindex;
                                property.outIndex    = interactionsIndexes[idxInter];
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1048
                                property.relativeOutPosition = interactionsPosition[idxInter];
1049
                                property.insideIdxInBlock = idxCell;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1050
                                property.outsideIdxInBlock = -1;
1051
                                outsideInteractions.push_back(property);
1052 1053 1054 1055 1056 1057 1058 1059 1060
                            }
                        }
                    }

                    // Manage outofblock interaction
                    FQuickSort<OutOfBlockInteraction, int>::QsSequential(outsideInteractions.data(),int(outsideInteractions.size()));

                    int currentOutInteraction = 0;
                    for(int idxOtherGroup = 0 ; idxOtherGroup < int(processesBlockInfos[idxLevel].size())
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1061
                        && currentOutInteraction < int(outsideInteractions.size()) ; ++idxOtherGroup){
1062 1063 1064 1065 1066 1067 1068 1069 1070
                        // Skip my blocks
                        if(idxOtherGroup == nbBlocksBeforeMinPerLevel[idxLevel]){
                            idxOtherGroup += tree->getNbCellGroupAtLevel(idxLevel);
                            if(idxOtherGroup == int(processesBlockInfos[idxLevel].size())){
                                break;
                            }
                            FAssertLF(idxOtherGroup < int(processesBlockInfos[idxLevel].size()));
                        }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1071 1072
                        const MortonIndex blockStartIdxOther = processesBlockInfos[idxLevel][idxOtherGroup].firstIndex;
                        const MortonIndex blockEndIdxOther   = processesBlockInfos[idxLevel][idxOtherGroup].lastIndex;
1073

BRAMAS Berenger's avatar
BRAMAS Berenger committed
1074
                        while(currentOutInteraction < int(outsideInteractions.size()) && outsideInteractions[currentOutInteraction].outIndex < blockStartIdxOther){
1075 1076 1077 1078
                            currentOutInteraction += 1;
                        }

                        int lastOutInteraction = currentOutInteraction;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1079
                        while(lastOutInteraction < int(outsideInteractions.size()) && outsideInteractions[lastOutInteraction].outIndex < blockEndIdxOther){
1080 1081 1082 1083 1084 1085
                            lastOutInteraction += 1;
                        }

                        // Create interactions
                        const int nbInteractionsBetweenBlocks = (lastOutInteraction-currentOutInteraction);
                        if(nbInteractionsBetweenBlocks){
1086
                            if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1087
#pragma omp critical(CreateM2LRemotes)
1088
                                {
1089
                                    if(remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb == nullptr){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1090
                                        const size_t nbBytesInBlockSymb = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeSymb;
1091
                                        unsigned char* memoryBlockSymb = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockSymb);
1092 1093 1094
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb = memoryBlockSymb;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleSymb, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrSymb, nbBytesInBlockSymb);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1095
                                        const size_t nbBytesInBlockUp = processesBlockInfos[idxLevel][idxOtherGroup].bufferSizeUp;
1096
                                        unsigned char* memoryBlockUp = (unsigned char*)FAlignedMemory::AllocateBytes<32>(nbBytesInBlockUp);
1097 1098 1099
                                        remoteCellGroups[idxLevel][idxOtherGroup].ptrUp = memoryBlockUp;
                                        starpu_variable_data_register(&remoteCellGroups[idxLevel][idxOtherGroup].handleUp, 0,
                                                                      (uintptr_t)remoteCellGroups[idxLevel][idxOtherGroup].ptrUp, nbBytesInBlockUp);
1100 1101
                                    }
                                }
1102 1103
                            }

1104 1105
                            externalInteractions->emplace_back();
                            BlockInteractions<CellContainerClass>* interactions = &externalInteractions->back();
1106
                            //interactions->otherBlock = remoteCellGroups[idxLevel][idxOtherGroup].ptr;
BRAMAS Berenger's avatar