From 5f74edf6829a86c96980afdf83997a6985fc9c05 Mon Sep 17 00:00:00 2001
From: Berenger Bramas <bbramas@mpcdf.mpg.de>
Date: Fri, 9 Sep 2016 15:42:11 +0200
Subject: [PATCH] Add extract for the M2L implicit mpi (not tested)

---
 Src/GroupTree/Core/FGroupOfCells.hpp          |  80 ++++
 .../FGroupTaskStarpuImplicitAlgorithm.hpp     | 414 +++++++++++++++---
 2 files changed, 423 insertions(+), 71 deletions(-)

diff --git a/Src/GroupTree/Core/FGroupOfCells.hpp b/Src/GroupTree/Core/FGroupOfCells.hpp
index f162fe123..865842bdd 100644
--- a/Src/GroupTree/Core/FGroupOfCells.hpp
+++ b/Src/GroupTree/Core/FGroupOfCells.hpp
@@ -401,6 +401,86 @@ public:
                                         ));
         }
     }
+
+    /** Extract for implicit MPI */
+
+
+    size_t extractGetSizeSymbUp(const std::vector<int>& cellsToExtract) const {
+        return cellsToExtract.size() * sizeof(SymboleCellClass) * sizeof(PoleCellClass);
+    }
+
+
+    void extractDataUp(const std::vector<int>& cellsToExtract,
+                     unsigned char* outputBuffer, const size_t outputBufferSize) const {
+        size_t idxValue = 0;
+        for(size_t idxEx = 0 ; idxEx < cellsToExtract.size() ; ++idxEx){
+            const int idCell = cellsToExtract[idxEx];
+            memcpy(&outputBuffer[idxValue],
+                   &blockCells[idCell],
+                   sizeof(SymboleCellClass));
+            idxValue += sizeof(SymboleCellClass);
+            memcpy(&outputBuffer[idxValue],
+                   &cellMultipoles[idCell],
+                   sizeof(PoleCellClass));
+            idxValue += sizeof(PoleCellClass);
+        }
+        FAssertLF(idxValue == outputBufferSize);
+    }
+
+    void restoreDataUp(const std::vector<int>& cellsToExtract,
+                     const unsigned char* intputBuffer, const size_t inputBufferSize){
+        size_t idxValue = 0;
+        for(size_t idxEx = 0 ; idxEx < cellsToExtract.size() ; ++idxEx){
+            const int idCell = cellsToExtract[idxEx];
+            memcpy(&blockCells[idCell],
+                   &intputBuffer[idxValue],
+                   sizeof(SymboleCellClass));
+            idxValue += sizeof(SymboleCellClass);
+            memcpy(&cellMultipoles[idCell],
+                   &intputBuffer[idxValue],
+                   sizeof(PoleCellClass));
+            idxValue += sizeof(PoleCellClass);
+        }
+        FAssertLF(idxValue == inputBufferSize);
+    }
+
+    size_t extractGetSizeSymbDown(const std::vector<int>& cellsToExtract) const {
+        return cellsToExtract.size() * sizeof(SymboleCellClass) * sizeof(LocalCellClass);
+    }
+
+    void extractDataDown(const std::vector<int>& cellsToExtract,
+                     unsigned char* outputBuffer, const size_t outputBufferSize) const {
+        size_t idxValue = 0;
+        for(size_t idxEx = 0 ; idxEx < cellsToExtract.size() ; ++idxEx){
+            const int idCell = cellsToExtract[idxEx];
+            memcpy(&outputBuffer[idxValue],
+                   &blockCells[idCell],
+                   sizeof(SymboleCellClass));
+            idxValue += sizeof(SymboleCellClass);
+            memcpy(&outputBuffer[idxValue],
+                   &cellLocals[idCell],
+                   sizeof(PoleCellClass));
+            idxValue += sizeof(PoleCellClass);
+        }
+        FAssertLF(idxValue == outputBufferSize);
+    }
+
+    void restoreDataDown(const std::vector<int>& cellsToExtract,
+                     const unsigned char* intputBuffer, const size_t inputBufferSize){
+        size_t idxValue = 0;
+        for(size_t idxEx = 0 ; idxEx < cellsToExtract.size() ; ++idxEx){
+            const int idCell = cellsToExtract[idxEx];
+            memcpy(&blockCells[idCell],
+                   &intputBuffer[idxValue],
+                   sizeof(SymboleCellClass));
+            idxValue += sizeof(SymboleCellClass);
+            memcpy(&cellLocals[idCell],
+                   &intputBuffer[idxValue],
+                   sizeof(PoleCellClass));
+            idxValue += sizeof(PoleCellClass);
+        }
+        FAssertLF(idxValue == inputBufferSize);
+    }
 };
 
 
diff --git a/Src/GroupTree/Core/FGroupTaskStarpuImplicitAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuImplicitAlgorithm.hpp
index 54a24a7df..7f9abe414 100644
--- a/Src/GroupTree/Core/FGroupTaskStarpuImplicitAlgorithm.hpp
+++ b/Src/GroupTree/Core/FGroupTaskStarpuImplicitAlgorithm.hpp
@@ -182,6 +182,29 @@ protected:
     starpu_codelet p2p_extract;
     starpu_codelet p2p_insert;
 
+    struct CellExtractedHandles{
+        starpu_data_handle_t all;
+        size_t size;
+        std::unique_ptr<unsigned char[]> data;
+        std::vector<int> cellsToExtract;
+    };
+
+    std::list<CellExtractedHandles> extractedCellBuffer;
+
+    struct DuplicatedCellHandle{
+        starpu_data_handle_t symb;
+        size_t sizeSymb;
+        unsigned char* dataSymb; // Never delete it, we reuse already allocate memory here
+        starpu_data_handle_t other;
+        size_t sizeOther;
+        unsigned char* dataOther; // Never delete it, we reuse already allocate memory here
+    };
+
+    std::list<DuplicatedCellHandle> duplicatedCellBuffer;
+
+    starpu_codelet cell_extract_up;
+    starpu_codelet cell_insert_up;
+
 public:
     FGroupTaskStarPUImplicitAlgorithm(OctreeClass*const inTree, KernelClass* inKernels, std::vector<MortonIndex>& distributedMortonIndex)
         : tree(inTree), originalCpuKernel(inKernels),
@@ -804,6 +827,7 @@ protected:
 #endif
 
         memset(&p2p_extract, 0, sizeof(p2p_extract));
+        p2p_extract.nbuffers = 2;
         p2p_extract.modes[0] = STARPU_R;
         p2p_extract.modes[1] = STARPU_RW;
         p2p_extract.name = "p2p_extract";
@@ -811,11 +835,29 @@ protected:
         p2p_extract.where |= STARPU_CPU;
 
         memset(&p2p_insert, 0, sizeof(p2p_insert));
+        p2p_insert.nbuffers = 2;
         p2p_insert.modes[0] = STARPU_R;
         p2p_insert.modes[1] = STARPU_RW;
         p2p_insert.name = "p2p_insert";
         p2p_insert.cpu_funcs[0] = ThisClass::InsertP2P;
         p2p_insert.where |= STARPU_CPU;
+
+        memset(&cell_extract_up, 0, sizeof(cell_extract_up));
+        cell_extract_up.nbuffers = 3;
+        cell_extract_up.modes[0] = STARPU_R;
+        cell_extract_up.modes[1] = STARPU_R;
+        cell_extract_up.modes[2] = STARPU_RW;
+        cell_extract_up.name = "cell_extract_up";
+        cell_extract_up.cpu_funcs[0] = ThisClass::ExtractCellUp;
+        cell_extract_up.where |= STARPU_CPU;
+
+        memset(&cell_insert_up, 0, sizeof(cell_insert_up));
+        cell_extract_up.nbuffers = 3;
+        cell_insert_up.modes[0] = STARPU_R;
+        cell_insert_up.modes[1] = STARPU_RW;
+        cell_insert_up.name = "cell_insert_up";
+        cell_insert_up.cpu_funcs[0] = ThisClass::InsertCellUp;
+        cell_insert_up.where |= STARPU_CPU;
     }
 
     static void InsertP2P(void *buffers[], void *cl_arg){
@@ -848,6 +890,34 @@ protected:
         containers.extractData(interactionBufferPtr->leavesToExtract, inBuffer1, size1);
     }
 
+    static void InsertCellUp(void *buffers[], void *cl_arg){
+        CellContainerClass currentCells((unsigned char*)STARPU_VECTOR_GET_PTR(buffers[1]),
+                                        STARPU_VECTOR_GET_NX(buffers[1]),
+                                        nullptr,
+                                        (unsigned char*)STARPU_VECTOR_GET_PTR(buffers[2]));
+        unsigned char* inBuffer = (unsigned char*)STARPU_VECTOR_GET_PTR(buffers[0]);
+        size_t size = STARPU_VECTOR_GET_NX(buffers[0]);
+
+        CellExtractedHandles* interactionBufferPtr;
+        starpu_codelet_unpack_args(cl_arg, &interactionBufferPtr);
+
+        currentCells.restoreDataUp(interactionBufferPtr->cellsToExtract, inBuffer, size);
+    }
+
+    static void ExtractCellUp(void *buffers[], void *cl_arg){
+        CellContainerClass currentCells((unsigned char*)STARPU_VECTOR_GET_PTR(buffers[0]),
+                                        STARPU_VECTOR_GET_NX(buffers[0]),
+                                        nullptr,
+                                        (unsigned char*)STARPU_VECTOR_GET_PTR(buffers[1]));
+        unsigned char* inBuffer = (unsigned char*)STARPU_VECTOR_GET_PTR(buffers[2]);
+        size_t size = STARPU_VECTOR_GET_NX(buffers[2]);
+
+        CellExtractedHandles* interactionBufferPtr;
+        starpu_codelet_unpack_args(cl_arg, &interactionBufferPtr);
+
+        currentCells.extractDataUp(interactionBufferPtr->cellsToExtract, inBuffer, size);
+    }
+
     void initCodeletMpi(){
         memset(&p2p_cl_inout_mpi, 0, sizeof(p2p_cl_inout_mpi));
 #ifdef STARPU_USE_CPU
@@ -1382,77 +1452,279 @@ protected:
                         const int interactionid = externalInteractionsAllLevel[idxLevel][idxGroup][idxInteraction].otherBlockId;
                         const std::vector<OutOfBlockInteraction>* outsideInteractions = &externalInteractionsAllLevel[idxLevel][idxGroup][idxInteraction].interactions;
 
-                        int mode = 1;
-                        starpu_mpi_insert_task(MPI_COMM_WORLD,
-                                               &m2l_cl_inout,
-                                               STARPU_VALUE, &wrapperptr, sizeof(wrapperptr),
-                                               STARPU_VALUE, &idxLevel, sizeof(idxLevel),
-                                               STARPU_VALUE, &outsideInteractions, sizeof(outsideInteractions),
-                                               STARPU_VALUE, &cellHandles[idxLevel][idxGroup].intervalSize, sizeof(int),
-                                               STARPU_VALUE, &mode, sizeof(int),
-                       #ifdef SCALFMM_STARPU_USE_PRIO
-                                               STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
-                       #endif
-                                               STARPU_R, cellHandles[idxLevel][idxGroup].symb,
-                                               (STARPU_RW | STARPU_COMMUTE_IF_SUPPORTED), cellHandles[idxLevel][idxGroup].down,
-                                               STARPU_R, cellHandles[idxLevel][interactionid].symb,
-                                               STARPU_R, cellHandles[idxLevel][interactionid].up,
-                       #ifdef STARPU_USE_TASK_NAME
-                       #ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
-                                               STARPU_NAME, m2lOuterTaskNames[idxLevel].get(),
-                       #else
-                                               //"M2L_out-l_nb_i_nb_i_s
-                                               STARPU_NAME, taskNames->print("M2L_out", "%d, %d, %lld, %d, %lld, %d, %lld, %lld, %lld, %lld, %d\n",
-                                                                             idxLevel,
-                                                                             tree->getCellGroup(idxLevel,idxGroup)->getNumberOfCellsInBlock(),
-                                                                             tree->getCellGroup(idxLevel,idxGroup)->getSizeOfInterval(),
-                                                                             tree->getCellGroup(idxLevel,interactionid)->getNumberOfCellsInBlock(),
-                                                                             tree->getCellGroup(idxLevel,interactionid)->getSizeOfInterval(),
-                                                                             outsideInteractions->size(),
-                                                                             tree->getCellGroup(idxLevel, idxGroup)->getStartingIndex(),
-                                                                             tree->getCellGroup(idxLevel, idxGroup)->getEndingIndex(),
-                                                                             tree->getCellGroup(idxLevel, interactionid)->getStartingIndex(),
-                                                                             tree->getCellGroup(idxLevel, interactionid)->getEndingIndex(),
-                                                                             starpu_mpi_data_get_rank(cellHandles[idxLevel][idxGroup].down)),
-                       #endif
-                       #endif
-                                               0);
-
-                        mode = 2;
-                        starpu_mpi_insert_task(MPI_COMM_WORLD,
-                                               &m2l_cl_inout,
-                                               STARPU_VALUE, &wrapperptr, sizeof(wrapperptr),
-                                               STARPU_VALUE, &idxLevel, sizeof(idxLevel),
-                                               STARPU_VALUE, &outsideInteractions, sizeof(outsideInteractions),
-                                               STARPU_VALUE, &cellHandles[idxLevel][idxGroup].intervalSize, sizeof(int),
-                                               STARPU_VALUE, &mode, sizeof(int),
-                       #ifdef SCALFMM_STARPU_USE_PRIO
-                                               STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
-                       #endif
-                                               STARPU_R, cellHandles[idxLevel][interactionid].symb,
-                                               (STARPU_RW | STARPU_COMMUTE_IF_SUPPORTED), cellHandles[idxLevel][interactionid].down,
-                                               STARPU_R, cellHandles[idxLevel][idxGroup].symb,
-                                               STARPU_R, cellHandles[idxLevel][idxGroup].up,
-                       #ifdef STARPU_USE_TASK_NAME
-                       #ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
-                                               STARPU_NAME, m2lOuterTaskNames[idxLevel].get(),
-                       #else
-                                               //"M2L_out-l_nb_i_nb_i_s"
-                                               STARPU_NAME, taskNames->print("M2L_out", "%d, %d, %lld, %d, %lld, %d, %lld, %lld, %lld, %lld, %d\n",
-                                                                             idxLevel,
-                                                                             tree->getCellGroup(idxLevel,idxGroup)->getNumberOfCellsInBlock(),
-                                                                             tree->getCellGroup(idxLevel,idxGroup)->getSizeOfInterval(),
-                                                                             tree->getCellGroup(idxLevel,interactionid)->getNumberOfCellsInBlock(),
-                                                                             tree->getCellGroup(idxLevel,interactionid)->getSizeOfInterval(),
-                                                                             outsideInteractions->size(),
-                                                                             tree->getCellGroup(idxLevel, interactionid)->getStartingIndex(),
-                                                                             tree->getCellGroup(idxLevel, interactionid)->getEndingIndex(),
-                                                                             tree->getCellGroup(idxLevel, idxGroup)->getStartingIndex(),
-                                                                             tree->getCellGroup(idxLevel, idxGroup)->getEndingIndex(),
-                                                                             starpu_mpi_data_get_rank(cellHandles[idxLevel][interactionid].down)),
-                       #endif
-                       #endif
-                                               0);
+                        // On the same node -- do as usual
+                        if(starpu_mpi_data_get_rank(cellHandles[idxLevel][idxGroup].symb) == starpu_mpi_data_get_rank(cellHandles[idxLevel][interactionid].symb)){
+                            int mode = 1;
+                            starpu_mpi_insert_task(MPI_COMM_WORLD,
+                                                   &m2l_cl_inout,
+                                                   STARPU_VALUE, &wrapperptr, sizeof(wrapperptr),
+                                                   STARPU_VALUE, &idxLevel, sizeof(idxLevel),
+                                                   STARPU_VALUE, &outsideInteractions, sizeof(outsideInteractions),
+                                                   STARPU_VALUE, &cellHandles[idxLevel][idxGroup].intervalSize, sizeof(int),
+                                                   STARPU_VALUE, &mode, sizeof(int),
+                           #ifdef SCALFMM_STARPU_USE_PRIO
+                                                   STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
+                           #endif
+                                                   STARPU_R, cellHandles[idxLevel][idxGroup].symb,
+                                                   (STARPU_RW | STARPU_COMMUTE_IF_SUPPORTED), cellHandles[idxLevel][idxGroup].down,
+                                                   STARPU_R, cellHandles[idxLevel][interactionid].symb,
+                                                   STARPU_R, cellHandles[idxLevel][interactionid].up,
+                           #ifdef STARPU_USE_TASK_NAME
+                           #ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
+                                                   STARPU_NAME, m2lOuterTaskNames[idxLevel].get(),
+                           #else
+                                                   //"M2L_out-l_nb_i_nb_i_s
+                                                   STARPU_NAME, taskNames->print("M2L_out", "%d, %d, %lld, %d, %lld, %d, %lld, %lld, %lld, %lld, %d\n",
+                                                                                 idxLevel,
+                                                                                 tree->getCellGroup(idxLevel,idxGroup)->getNumberOfCellsInBlock(),
+                                                                                 tree->getCellGroup(idxLevel,idxGroup)->getSizeOfInterval(),
+                                                                                 tree->getCellGroup(idxLevel,interactionid)->getNumberOfCellsInBlock(),
+                                                                                 tree->getCellGroup(idxLevel,interactionid)->getSizeOfInterval(),
+                                                                                 outsideInteractions->size(),
+                                                                                 tree->getCellGroup(idxLevel, idxGroup)->getStartingIndex(),
+                                                                                 tree->getCellGroup(idxLevel, idxGroup)->getEndingIndex(),
+                                                                                 tree->getCellGroup(idxLevel, interactionid)->getStartingIndex(),
+                                                                                 tree->getCellGroup(idxLevel, interactionid)->getEndingIndex(),
+                                                                                 starpu_mpi_data_get_rank(cellHandles[idxLevel][idxGroup].down)),
+                           #endif
+                           #endif
+                                                   0);
+
+                            mode = 2;
+                            starpu_mpi_insert_task(MPI_COMM_WORLD,
+                                                   &m2l_cl_inout,
+                                                   STARPU_VALUE, &wrapperptr, sizeof(wrapperptr),
+                                                   STARPU_VALUE, &idxLevel, sizeof(idxLevel),
+                                                   STARPU_VALUE, &outsideInteractions, sizeof(outsideInteractions),
+                                                   STARPU_VALUE, &cellHandles[idxLevel][idxGroup].intervalSize, sizeof(int),
+                                                   STARPU_VALUE, &mode, sizeof(int),
+                           #ifdef SCALFMM_STARPU_USE_PRIO
+                                                   STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
+                           #endif
+                                                   STARPU_R, cellHandles[idxLevel][interactionid].symb,
+                                                   (STARPU_RW | STARPU_COMMUTE_IF_SUPPORTED), cellHandles[idxLevel][interactionid].down,
+                                                   STARPU_R, cellHandles[idxLevel][idxGroup].symb,
+                                                   STARPU_R, cellHandles[idxLevel][idxGroup].up,
+                           #ifdef STARPU_USE_TASK_NAME
+                           #ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
+                                                   STARPU_NAME, m2lOuterTaskNames[idxLevel].get(),
+                           #else
+                                                   //"M2L_out-l_nb_i_nb_i_s"
+                                                   STARPU_NAME, taskNames->print("M2L_out", "%d, %d, %lld, %d, %lld, %d, %lld, %lld, %lld, %lld, %d\n",
+                                                                                 idxLevel,
+                                                                                 tree->getCellGroup(idxLevel,idxGroup)->getNumberOfCellsInBlock(),
+                                                                                 tree->getCellGroup(idxLevel,idxGroup)->getSizeOfInterval(),
+                                                                                 tree->getCellGroup(idxLevel,interactionid)->getNumberOfCellsInBlock(),
+                                                                                 tree->getCellGroup(idxLevel,interactionid)->getSizeOfInterval(),
+                                                                                 outsideInteractions->size(),
+                                                                                 tree->getCellGroup(idxLevel, interactionid)->getStartingIndex(),
+                                                                                 tree->getCellGroup(idxLevel, interactionid)->getEndingIndex(),
+                                                                                 tree->getCellGroup(idxLevel, idxGroup)->getStartingIndex(),
+                                                                                 tree->getCellGroup(idxLevel, idxGroup)->getEndingIndex(),
+                                                                                 starpu_mpi_data_get_rank(cellHandles[idxLevel][interactionid].down)),
+                           #endif
+                           #endif
+                                                   0);
+                        }
+                        else{
+                                {
+                                    // Extract data from second group for the first one
+                                    // That is copy B to B'
+                                    extractedCellBuffer.emplace_back();
+                                    CellExtractedHandles& interactionBuffer = extractedCellBuffer.back();
+                                    interactionBuffer.cellsToExtract.reserve(outsideInteractions->size());
+                                    for(size_t idx = 0 ;
+                                        idx < outsideInteractions->size() ; ++idx){
+                                        interactionBuffer.cellsToExtract.push_back((*outsideInteractions)[idx].outsideIdxInBlock);
+                                    }
+                                    interactionBuffer.size = tree->getCellGroup(idxLevel,interactionid)->extractGetSizeSymbUp(interactionBuffer.cellsToExtract);
+                                    // I allocate only if I will use it to extract
+                                    if(starpu_mpi_data_get_rank(cellHandles[idxLevel][interactionid].symb)){
+                                        interactionBuffer.data.reset(new unsigned char[interactionBuffer.size]);
+                                    }
+                                    else{
+                                        interactionBuffer.data.reset(nullptr);
+                                    }
+                                    starpu_variable_data_register(&interactionBuffer.all, starpu_mpi_data_get_rank(cellHandles[idxLevel][interactionid].symb),
+                                                                  (uintptr_t)interactionBuffer.data.get(), interactionBuffer.size);
+
+                                    CellExtractedHandles* interactionBufferPtr = &interactionBuffer;
+                                    starpu_mpi_insert_task(MPI_COMM_WORLD,
+                                                           &cell_extract_up,
+                                                           STARPU_VALUE, &interactionBufferPtr, sizeof(CellExtractedHandles*),
+                                   #ifdef SCALFMM_STARPU_USE_PRIO
+                                                           STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
+                                   #endif
+                                                           STARPU_R, cellHandles[idxLevel][interactionid].symb,
+                                                           STARPU_R, cellHandles[idxLevel][interactionid].up,
+                                                           STARPU_RW, interactionBuffer.all);
+
+                                    // Move to a new memory block that is on the same node as A
+                                    // B' to B'''
+                                    duplicatedCellBuffer.emplace_back();
+                                    DuplicatedCellHandle& duplicateB = duplicatedCellBuffer.back();
+                                    duplicateB.sizeSymb = tree->getCellGroup(idxLevel,interactionid)->getBufferSizeInByte();
+                                    duplicateB.sizeOther = tree->getCellGroup(idxLevel,interactionid)->getMultipoleBufferSizeInByte();
+                                    if(starpu_mpi_data_get_rank(particleHandles[idxGroup].down) == mpi_rank){
+                                        // Reuse block but just to perform the send
+                                        duplicateB.dataSymb = const_cast<unsigned char*>(tree->getCellGroup(idxLevel,interactionid)->getRawBuffer());
+                                        duplicateB.dataOther = reinterpret_cast<unsigned char*>(tree->getCellGroup(idxLevel,interactionid)->getRawMultipoleBuffer());
+                                    }
+                                    else{
+                                        duplicateB.dataSymb = nullptr;
+                                        duplicateB.dataOther = nullptr;
+                                    }
+                                    starpu_variable_data_register(&duplicateB.symb, starpu_mpi_data_get_rank(particleHandles[idxGroup].down),
+                                                                  (uintptr_t)duplicateB.dataSymb, duplicateB.sizeSymb);
+                                    starpu_variable_data_register(&duplicateB.symb, starpu_mpi_data_get_rank(particleHandles[idxGroup].down),
+                                                                  (uintptr_t)duplicateB.dataOther, duplicateB.sizeOther);
+
+                                    starpu_mpi_insert_task(MPI_COMM_WORLD,
+                                                           &cell_insert_up,
+                                                           STARPU_VALUE, &interactionBufferPtr, sizeof(CellExtractedHandles*),
+                                   #ifdef SCALFMM_STARPU_USE_PRIO
+                                                           STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
+                                   #endif
+                                                           STARPU_R, interactionBuffer.all,
+                                                           STARPU_RW, duplicateB.symb,
+                                                           STARPU_RW, duplicateB.other);
+
+
+                                int mode = 1;
+                                starpu_mpi_insert_task(MPI_COMM_WORLD,
+                                                       &m2l_cl_inout,
+                                                       STARPU_VALUE, &wrapperptr, sizeof(wrapperptr),
+                                                       STARPU_VALUE, &idxLevel, sizeof(idxLevel),
+                                                       STARPU_VALUE, &outsideInteractions, sizeof(outsideInteractions),
+                                                       STARPU_VALUE, &cellHandles[idxLevel][idxGroup].intervalSize, sizeof(int),
+                                                       STARPU_VALUE, &mode, sizeof(int),
+                               #ifdef SCALFMM_STARPU_USE_PRIO
+                                                       STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
+                               #endif
+                                                       STARPU_R, cellHandles[idxLevel][idxGroup].symb,
+                                                       (STARPU_RW | STARPU_COMMUTE_IF_SUPPORTED), cellHandles[idxLevel][idxGroup].down,
+                                                       STARPU_R, duplicateB.symb,
+                                                       STARPU_R, duplicateB.other,
+                               #ifdef STARPU_USE_TASK_NAME
+                               #ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
+                                                       STARPU_NAME, m2lOuterTaskNames[idxLevel].get(),
+                               #else
+                                                       //"M2L_out-l_nb_i_nb_i_s
+                                                       STARPU_NAME, taskNames->print("M2L_out", "%d, %d, %lld, %d, %lld, %d, %lld, %lld, %lld, %lld, %d\n",
+                                                                                     idxLevel,
+                                                                                     tree->getCellGroup(idxLevel,idxGroup)->getNumberOfCellsInBlock(),
+                                                                                     tree->getCellGroup(idxLevel,idxGroup)->getSizeOfInterval(),
+                                                                                     tree->getCellGroup(idxLevel,interactionid)->getNumberOfCellsInBlock(),
+                                                                                     tree->getCellGroup(idxLevel,interactionid)->getSizeOfInterval(),
+                                                                                     outsideInteractions->size(),
+                                                                                     tree->getCellGroup(idxLevel, idxGroup)->getStartingIndex(),
+                                                                                     tree->getCellGroup(idxLevel, idxGroup)->getEndingIndex(),
+                                                                                     tree->getCellGroup(idxLevel, interactionid)->getStartingIndex(),
+                                                                                     tree->getCellGroup(idxLevel, interactionid)->getEndingIndex(),
+                                                                                     starpu_mpi_data_get_rank(cellHandles[idxLevel][idxGroup].down)),
+                               #endif
+                               #endif
+                                                       0);
+                            }
+                            {
+                                // Extract data from second group for the first one
+                                // That is copy A to A'
+                                extractedCellBuffer.emplace_back();
+                                CellExtractedHandles& interactionBuffer = extractedCellBuffer.back();
+                                interactionBuffer.cellsToExtract.reserve(outsideInteractions->size());
+                                for(size_t idx = 0 ;
+                                    idx < outsideInteractions->size() ; ++idx){
+                                    interactionBuffer.cellsToExtract.push_back((*outsideInteractions)[idx].insideIdxInBlock);
+                                }
+                                interactionBuffer.size = tree->getCellGroup(idxLevel,idxGroup)->extractGetSizeSymbUp(interactionBuffer.cellsToExtract);
+                                // I allocate only if I will use it to extract
+                                if(starpu_mpi_data_get_rank(cellHandles[idxLevel][idxGroup].symb)){
+                                    interactionBuffer.data.reset(new unsigned char[interactionBuffer.size]);
+                                }
+                                else{
+                                    interactionBuffer.data.reset(nullptr);
+                                }
+                                starpu_variable_data_register(&interactionBuffer.all, starpu_mpi_data_get_rank(cellHandles[idxLevel][idxGroup].symb),
+                                                              (uintptr_t)interactionBuffer.data.get(), interactionBuffer.size);
+
+                                CellExtractedHandles* interactionBufferPtr = &interactionBuffer;
+                                starpu_mpi_insert_task(MPI_COMM_WORLD,
+                                                       &cell_extract_up,
+                                                       STARPU_VALUE, &interactionBufferPtr, sizeof(CellExtractedHandles*),
+                               #ifdef SCALFMM_STARPU_USE_PRIO
+                                                       STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
+                               #endif
+                                                       STARPU_R, cellHandles[idxLevel][idxGroup].symb,
+                                                       STARPU_R, cellHandles[idxLevel][idxGroup].up,
+                                                       STARPU_RW, interactionBuffer.all);
+
+                                // Move to a new memory block that is on the same node as A
+                                // B' to B'''
+                                duplicatedCellBuffer.emplace_back();
+                                DuplicatedCellHandle& duplicateB = duplicatedCellBuffer.back();
+                                duplicateB.sizeSymb = tree->getCellGroup(idxLevel,idxGroup)->getBufferSizeInByte();
+                                duplicateB.sizeOther = tree->getCellGroup(idxLevel,idxGroup)->getMultipoleBufferSizeInByte();
+                                if(starpu_mpi_data_get_rank(particleHandles[interactionid].down) == mpi_rank){
+                                    // Reuse block but just to perform the send
+                                    duplicateB.dataSymb = const_cast<unsigned char*>(tree->getCellGroup(idxLevel,idxGroup)->getRawBuffer());
+                                    duplicateB.dataOther = reinterpret_cast<unsigned char*>(tree->getCellGroup(idxLevel,idxGroup)->getRawMultipoleBuffer());
+                                }
+                                else{
+                                    duplicateB.dataSymb = nullptr;
+                                    duplicateB.dataOther = nullptr;
+                                }
+                                starpu_variable_data_register(&duplicateB.symb, starpu_mpi_data_get_rank(particleHandles[interactionid].down),
+                                                              (uintptr_t)duplicateB.dataSymb, duplicateB.sizeSymb);
+                                starpu_variable_data_register(&duplicateB.symb, starpu_mpi_data_get_rank(particleHandles[interactionid].down),
+                                                              (uintptr_t)duplicateB.dataOther, duplicateB.sizeOther);
+
+                                starpu_mpi_insert_task(MPI_COMM_WORLD,
+                                                       &cell_insert_up,
+                                                       STARPU_VALUE, &interactionBufferPtr, sizeof(CellExtractedHandles*),
+                               #ifdef SCALFMM_STARPU_USE_PRIO
+                                                       STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
+                               #endif
+                                                       STARPU_R, interactionBuffer.all,
+                                                       STARPU_RW, duplicateB.symb,
+                                                       STARPU_RW, duplicateB.other);
+
+                                int mode = 2;
+                                starpu_mpi_insert_task(MPI_COMM_WORLD,
+                                                       &m2l_cl_inout,
+                                                       STARPU_VALUE, &wrapperptr, sizeof(wrapperptr),
+                                                       STARPU_VALUE, &idxLevel, sizeof(idxLevel),
+                                                       STARPU_VALUE, &outsideInteractions, sizeof(outsideInteractions),
+                                                       STARPU_VALUE, &cellHandles[idxLevel][idxGroup].intervalSize, sizeof(int),
+                                                       STARPU_VALUE, &mode, sizeof(int),
+                               #ifdef SCALFMM_STARPU_USE_PRIO
+                                                       STARPU_PRIORITY, PrioClass::Controller().getInsertionPosM2LExtern(idxLevel),
+                               #endif
+                                                       STARPU_R, cellHandles[idxLevel][interactionid].symb,
+                                                       (STARPU_RW | STARPU_COMMUTE_IF_SUPPORTED), cellHandles[idxLevel][interactionid].down,
+                                                       STARPU_RW, duplicateB.symb,
+                                                       STARPU_RW, duplicateB.other,
+                               #ifdef STARPU_USE_TASK_NAME
+                               #ifndef SCALFMM_SIMGRID_TASKNAMEPARAMS
+                                                       STARPU_NAME, m2lOuterTaskNames[idxLevel].get(),
+                               #else
+                                                       //"M2L_out-l_nb_i_nb_i_s"
+                                                       STARPU_NAME, taskNames->print("M2L_out", "%d, %d, %lld, %d, %lld, %d, %lld, %lld, %lld, %lld, %d\n",
+                                                                                     idxLevel,
+                                                                                     tree->getCellGroup(idxLevel,idxGroup)->getNumberOfCellsInBlock(),
+                                                                                     tree->getCellGroup(idxLevel,idxGroup)->getSizeOfInterval(),
+                                                                                     tree->getCellGroup(idxLevel,interactionid)->getNumberOfCellsInBlock(),
+                                                                                     tree->getCellGroup(idxLevel,interactionid)->getSizeOfInterval(),
+                                                                                     outsideInteractions->size(),
+                                                                                     tree->getCellGroup(idxLevel, interactionid)->getStartingIndex(),
+                                                                                     tree->getCellGroup(idxLevel, interactionid)->getEndingIndex(),
+                                                                                     tree->getCellGroup(idxLevel, idxGroup)->getStartingIndex(),
+                                                                                     tree->getCellGroup(idxLevel, idxGroup)->getEndingIndex(),
+                                                                                     starpu_mpi_data_get_rank(cellHandles[idxLevel][interactionid].down)),
+                               #endif
+                               #endif
+                                                       0);
+                            }
+                        }
                     }
                 }
                 FLOG( timerOutBlock.tac() );
-- 
GitLab