From 790b64401aac25c2f5dc8e64ea8c7a5b7bd231b0 Mon Sep 17 00:00:00 2001
From: Olivier Coulaud <Olivier.Coulaud@inria.fr>
Date: Tue, 5 Jun 2018 17:06:10 +0200
Subject: [PATCH] Bug with BPC is fixed

---
 Examples/MPIInterpolationFMM.hpp             |  23 +-
 Examples/sharedMemoryInterpolationFMM.hpp    |   5 +-
 NEWS.txt                                     |   2 +-
 README.md                                    |  11 +-
 Src/Core/FFmmAlgorithmPeriodic.hpp           |  54 ++-
 Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp | 446 ++++++++++++-------
 6 files changed, 338 insertions(+), 203 deletions(-)

diff --git a/Examples/MPIInterpolationFMM.hpp b/Examples/MPIInterpolationFMM.hpp
index 27dffbf3a..e120c085f 100644
--- a/Examples/MPIInterpolationFMM.hpp
+++ b/Examples/MPIInterpolationFMM.hpp
@@ -32,7 +32,6 @@
 
 #include "Utils/FParameters.hpp"
 #include "Utils/FParameterNames.hpp"
-
 //
 // Order of the Interpolation approximation
 static constexpr unsigned ORDER = 6 ;
@@ -115,6 +114,7 @@ int main(int argc, char* argv[])
 
   // Initialize values for MPI
   FMpi app(argc,argv);
+  const bool masterIO = ( app.global().processId() == 0 );
   //
   // Initialize timer
   FTic time;
@@ -137,7 +137,7 @@ int main(int argc, char* argv[])
   FSize localParticlesNumber = 0 ;
 
   // -----------------------------------------------------
-  if(app.global().processId() == 0){
+  if(masterIO){
       std::cout << "Loading & Inserting " << loader.getNumberOfParticles()
                 << " particles ..." << std::endl
                 <<" Box: "<< std::endl
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
 
   MPI_Reduce(&timeUsed,&minTime,1,MPI_DOUBLE,MPI_MIN,0,app.global().getComm());
   MPI_Reduce(&timeUsed,&maxTime,1,MPI_DOUBLE,MPI_MAX,0,app.global().getComm());
-  if(app.global().processId() == 0){
+  if(masterIO){
       std::cout << "readinsert-time-min:" << minTime
                 << " readinsert-time-max:" << maxTime
                 << std::endl;
@@ -225,8 +225,9 @@ int main(int argc, char* argv[])
   FAbstractAlgorithm * algorithm  = nullptr;
   FAlgorithmTimers   * timer      = nullptr;
   { // -----------------------------------------------------
-    std::cout << "\n"<<interpolationType<<" FMM Proc (ORDER="<< ORDER << ") ... " << std::endl;
-
+    if(masterIO) {
+        std::cout << "\n"<<interpolationType<<" FMM Proc (ORDER="<< ORDER << ") ... " << std::endl;
+    }
     time.tic();
 
     // Kernels to use (pointer because of the limited size of the stack)
@@ -239,8 +240,10 @@ int main(int argc, char* argv[])
     //
     // periodic FMM algorithm
     FmmClassProcPER algoPer(app.global(),&tree, aboveTree);
+
     KernelClass kernelsPer(algoPer.extendedTreeHeight(), algoPer.extendedBoxWidth(),
                            algoPer.extendedBoxCenter(),&MatrixKernel);
+
     algoPer.setKernel(&kernelsPer);
     ///////////////////////////////////////////////////////////////////////////////////////////////////
     if(! periodicCondition) {// Non periodic case
@@ -252,12 +255,12 @@ int main(int argc, char* argv[])
         timer      = &algoPer ;
       }
     //
-    // FMM exectution  FFmmFarField FFmmNearField
+    // FMM exectution  FFmmFarField FFmmNearField  FFmmP2M|FFmmM2M|FFmmM2L|FFmmL2L
     algorithm->execute();
     //
     algorithm->getMortonLeafDistribution(mortonLeafDistribution);
 
-    // if(app.global().processId() == 0)
+   if(masterIO)
     {
       std::cout << app.global().processId()  <<"  Morton distribution "<< std::endl ;
       for(auto v : mortonLeafDistribution)
@@ -268,10 +271,10 @@ int main(int argc, char* argv[])
     app.global().barrier();
     time.tac();
     timeUsed = time.elapsed();
-    std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << " s)." << std::endl;
     MPI_Reduce(&timeUsed,&minTime,1,MPI_DOUBLE,MPI_MIN,0,app.global().getComm());
     MPI_Reduce(&timeUsed,&maxTime,1,MPI_DOUBLE,MPI_MAX,0,app.global().getComm());
-    if(app.global().processId() == 0){
+    if(masterIO){
+        std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << " s)." << std::endl;
         std::cout << "exec-time-min:" << minTime
                   << " exec-time-max:" << maxTime
                   << std::endl;
@@ -331,7 +334,7 @@ int main(int argc, char* argv[])
     });
     FReal gloEnergy          = app.global().reduceSum(energy);
     FReal TotalPhysicalValue = app.global().reduceSum(locTotalPhysicalValue);
-    if(0 == app.global().processId()){
+    if(masterIO){
         std::cout <<std::endl<<"aboveRoot: " << aboveTree << " Energy: "<< gloEnergy <<"  TotalPhysicalValue: " << TotalPhysicalValue<< std::endl; 
         std::cout <<std::endl <<" &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& "<<std::endl<<std::endl;
       }
diff --git a/Examples/sharedMemoryInterpolationFMM.hpp b/Examples/sharedMemoryInterpolationFMM.hpp
index d597994b0..35eac6677 100644
--- a/Examples/sharedMemoryInterpolationFMM.hpp
+++ b/Examples/sharedMemoryInterpolationFMM.hpp
@@ -198,6 +198,7 @@ int main(int argc, char* argv[])
         timer      = &algoPer ;
 
       }
+
     //
     algorithm->execute();   // Here the call of the FMM algorithm
     //
@@ -250,10 +251,6 @@ int main(int argc, char* argv[])
 		      << " Pos "<<posX[idxPart]<<" "<<posY[idxPart]<<" "<<posZ[idxPart]
 		      << "   Forces: " << forcesX[idxPart] << " " << forcesY[idxPart] << " "<< forcesZ[idxPart] <<std::endl;
 	  }
-//	  if(numLeaf==288){
-//	   std::cout << count << "  idxPart " << idxPart <<  potentials[idxPart]<<"  "<<
-//			physicalValues[idxPart] << " energy " << energy << std::endl;
-//	    }
 	  energy += potentials[idxPart]*physicalValues[idxPart] ;
 	  TotalPhysicalValue += physicalValues[idxPart]  ;
 	//  ++count ;
diff --git a/NEWS.txt b/NEWS.txt
index 719d4c8ca..c634a175a 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -19,7 +19,7 @@ version 1.1
 - Improvement in CmakeLists  - FUSE is working again -  
 - Compile with Intel and CLang (with a lot of warnings)
 - Now morse_cmake is a git submodule 
--
+- Fix several bugs in pseudo-periodic algorithms (sequential and parallel)
 
 
 1.5
diff --git a/README.md b/README.md
index 743afd1e8..654ca2af1 100644
--- a/README.md
+++ b/README.md
@@ -35,8 +35,17 @@ To obtain ScalFMM (develop branch) and its git submodules do
 git clone --recursive git@gitlab.inria.fr:solverstack/ScalFMM.git -b develop
 ```
 
-``` bash
+or
+
+```bash
+git clone git@gitlab.inria.fr:solverstack/ScalFMM.git
+cd ScalFMM
+git submodule init
+git submodule update
+
+``` 
 # Move to the build folder
+``` bash
 cd scalfmm/Build
 # Use cmake, with relevant options
 cmake .. # -DSCALFMM_USE_MPI=ON
diff --git a/Src/Core/FFmmAlgorithmPeriodic.hpp b/Src/Core/FFmmAlgorithmPeriodic.hpp
index 792036996..0aaaba787 100644
--- a/Src/Core/FFmmAlgorithmPeriodic.hpp
+++ b/Src/Core/FFmmAlgorithmPeriodic.hpp
@@ -198,12 +198,14 @@ protected:
 
         if(operationsToProceed & FFmmP2M) bottomPass();
 
-        if(operationsToProceed & FFmmM2M) upwardPass();
+        if(operationsToProceed & FFmmM2M) {
+            upwardPass();
+            // before downward pass we have to perform the periodicity
+            this->processPeriodicLevels();
+          }
 
         if(operationsToProceed & FFmmM2L){
             transferPass();
-            // before downward pass we have to perform the periodicity
-            processPeriodicLevels();
         }
 
         if(operationsToProceed & FFmmL2L) downardPass();
@@ -561,10 +563,10 @@ protected:
             // upperCells[offsetRealTree-1] is root cell
             CellClass*const upperCells = new CellClass[offsetRealTree];
             for(int i = 0; i < offsetRealTree; ++i) {
-                upperCells[i].setLevel(i+1); // TODO: check that the level is correct (see other todos under)
+                upperCells[i].setLevel(i+1);
             }
 
-            {
+            {  // Build Expansion at level 0
                 typename OctreeClass::Iterator octreeIterator(tree);
                 octreeIterator.gotoLeft();
 
@@ -584,7 +586,7 @@ protected:
                 std::array<const symbolic_data_t*, 8> level_1_symbolics;
                 std::transform(children, children+8, level_1_symbolics.begin(),
                                [](CellClass* c) {return c;});
-                // TODO check that parent symbolic has the right level
+                //
                 kernels->M2M(
                     real_tree_root_multipole,
                     real_tree_root_symbolic,
@@ -598,6 +600,7 @@ protected:
                     // Copy virtual cell at given level into virtual children
                     FMemUtils::setall(children,&upperCells[idxLevel],8);
 
+
                     multipole_t* const virtual_parent_multipole
                         = &((&upperCells[idxLevel-1])->getMultipoleData());
                     const symbolic_data_t* const virtual_parent_symbolic
@@ -618,7 +621,9 @@ protected:
                         );
                 }
             }
-
+            //
+            //        M2L PASS
+            //
             CellClass*const downerCells = new CellClass[offsetRealTree];
             for(int i = 0; i < offsetRealTree; ++i) {
                 downerCells[i].setLevel(i+1); // TODO: check that the level is correct (see other todos under)
@@ -627,8 +632,8 @@ protected:
             {
                 const int idxUpperLevel = 2;
 
-                const CellClass* neighbors[342];
-                int neighborPositions[342];
+                const CellClass* neighbors[342]{};
+                int neighborPositions[342]{};
                 int counter = 0;
 
                 // The cells are all the same, duplicate the central cell and
@@ -637,7 +642,7 @@ protected:
                     for(int idxY = -3 ; idxY <= 2 ; ++idxY){
                         for(int idxZ = -3 ; idxZ <= 2 ; ++idxZ){
                             if( FMath::Abs(idxX) > 1 || FMath::Abs(idxY) > 1 || FMath::Abs(idxZ) > 1){
-                                neighbors[counter] = &upperCells[idxUpperLevel-1];
+                                neighbors[counter]         = &upperCells[idxUpperLevel-1];
                                 neighborPositions[counter] = neighIndex(idxX,idxY,idxZ);
                                 ++counter;
                             }
@@ -649,13 +654,13 @@ protected:
                     = &(downerCells[idxUpperLevel-1].getLocalExpansionData());
                 const symbolic_data_t* const target_symbolic
                     = &downerCells[idxUpperLevel-1];
-                std::array<const multipole_t*, 342> neighbor_multipoles;
-                std::array<const symbolic_data_t*, 342> neighbor_symbolics;
+
+                std::array<const multipole_t*, 342>     neighbor_multipoles{};
+                std::array<const symbolic_data_t*, 342> neighbor_symbolics{};
                 std::transform(neighbors, neighbors+counter, neighbor_multipoles.begin(),
                                [](const CellClass* c) {return &(c->getMultipoleData());});
                 std::transform(neighbors, neighbors+counter, neighbor_symbolics.begin(),
                                [](const CellClass* c) {return c;});
-
                 // compute M2L
                 kernels->M2L(
                     target_local_exp,
@@ -665,8 +670,11 @@ protected:
                     neighborPositions,
                     counter);
             }
+
+
             // note: the triple loop bounds are not the same than for the previous piece of code
             // which handles the topmost virtual cell
+
             for(int idxUpperLevel = 3 ; idxUpperLevel <= offsetRealTree ; ++idxUpperLevel){
                 const CellClass* neighbors[342];
                 int neighborPositions[342];
@@ -675,7 +683,7 @@ protected:
                     for(int idxY = -2 ; idxY <= 3 ; ++idxY){
                         for(int idxZ = -2 ; idxZ <= 3 ; ++idxZ){
                             if( FMath::Abs(idxX) > 1 || FMath::Abs(idxY) > 1 || FMath::Abs(idxZ) > 1){
-                                neighbors[counter] = &upperCells[idxUpperLevel-1];
+                                neighbors[counter]         = &upperCells[idxUpperLevel-1];
                                 neighborPositions[counter] = neighIndex(idxX,idxY,idxZ);
                                 ++counter;
                             }
@@ -687,6 +695,7 @@ protected:
                     = &(downerCells[idxUpperLevel-1].getLocalExpansionData());
                 const symbolic_data_t* const target_symbolic
                     = &downerCells[idxUpperLevel-1];
+
                 std::array<const multipole_t*, 342> neighbor_multipoles;
                 std::array<const symbolic_data_t*, 342> neighbor_symbolics;
                 std::transform(neighbors, neighbors+counter, neighbor_multipoles.begin(),
@@ -706,10 +715,11 @@ protected:
                     neighborPositions,
                     counter);
             }
-
-
             // Run the L2L for all but the lowest virtual levels
+            //  2 is the highest level
+            //  offsetRealTree-1 is the root level
             {
+
                 std::array<local_expansion_t*, 8> virtual_child_local_exps = {};
                 std::array<symbolic_data_t*, 8> virtual_child_symbolics = {};
                 for(int idxLevel = 2 ; idxLevel < offsetRealTree-1  ; ++idxLevel){
@@ -732,16 +742,17 @@ protected:
                 }
             }
             // Run the L2L for the lowest virtual level
-            {
-                std::array<local_expansion_t*, 8> virtual_child_local_exps = {};
-                std::array<symbolic_data_t*, 8> virtual_child_symbolics = {};
+            if(offsetRealTree >2){
+                std::array<local_expansion_t*, 8> virtual_child_local_exps  {};
+                std::array<symbolic_data_t*, 8> virtual_child_symbolics  {};
                 const int idxLevel = offsetRealTree-1;
                 local_expansion_t* const virtual_parent_local_exp
                     = &(downerCells[idxLevel-1].getLocalExpansionData());
                 const symbolic_data_t* const virtual_parent_symbolic
                     = &downerCells[idxLevel-1];
+
                 virtual_child_local_exps[7] = &(downerCells[idxLevel].getLocalExpansionData());
-                virtual_child_symbolics[7] = &downerCells[idxLevel];
+                virtual_child_symbolics[7]  = &downerCells[idxLevel];
                 kernels->L2L(
                     virtual_parent_local_exp,
                     virtual_parent_symbolic,
@@ -750,7 +761,7 @@ protected:
                     );
             }
 
-            // Run L2L from the lowest vitual level to the highest real tree level
+            // Run L2L from the lowest virtual level to the highest real tree level
             {
                 typename OctreeClass::Iterator octreeIterator(tree);
                 octreeIterator.gotoLeft();
@@ -777,7 +788,6 @@ protected:
                     child_symbolics.data()
                     );
             }
-
             delete[] upperCells;
             delete[] downerCells;
         }
diff --git a/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp b/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp
index 361037b44..cd39d40d5 100644
--- a/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp
+++ b/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp
@@ -60,27 +60,32 @@
  */
 template<class FReal, class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
 class FFmmAlgorithmThreadProcPeriodic : public FAbstractAlgorithm, public FAlgorithmTimers {
-  OctreeClass* const tree;                 ///< The octree to work on
-  KernelClass** kernels;                   ///< The kernels
 
-  const FMpi::FComm& comm;                 ///< MPI communicator
+  using multipole_t = typename CellClass::multipole_t;
+  using local_expansion_t = typename CellClass::local_expansion_t;
+  using symbolic_data_t = CellClass;
 
-  CellClass rootCellFromProc;     ///< root of tree needed by the periodicity
-  const int nbLevelsAboveRoot;    ///< The nb of level the user ask to go above the tree (>= -1)
-  const int offsetRealTree;       ///< nbLevelsAboveRoot GetFakeLevel
 
+  OctreeClass* const tree;                 ///< The octree to work on
+  KernelClass** kernels;                   ///< The kernels
+
+  const int OctreeHeight;              ///< Tree height
+  const int nbLevelsAboveRoot;           ///< The nb of level the user ask to go above the tree (>= -1)
+  const int offsetRealTree;              ///< nbLevelsAboveRoot+2 GetFakeLevel
+  const int leafLevelSeparationCriteria;
+  CellClass rootCellFromProc;           ///< root cell of tree needed by the periodicity
 
   typename OctreeClass::Iterator* iterArray;  ///<  Used to store pointers to cells/leafs to work with
   typename OctreeClass::Iterator* iterArrayComm;   ///< Used to store pointers to cells/leafs to send/rcv
 
+  const FMpi::FComm& comm;                 ///< MPI communicator
+
   int numberOfLeafs;           ///< To store the size at the previous level
   const int MaxThreads;        ///< Max number of thread allowed by openmp
   const int nbProcess;         ///< Process count
   const int idProcess;         ///< Current process id
-  const int OctreeHeight;      ///< Tree height
 
   const int userChunkSize;
-  const int leafLevelSeparationCriteria;
 
   struct Interval{
     MortonIndex leftIndex;
@@ -94,10 +99,6 @@ class FFmmAlgorithmThreadProcPeriodic : public FAbstractAlgorithm, public FAlgor
 
 public:
 
-  using multipole_t = typename CellClass::multipole_t;
-  using local_expansion_t = typename CellClass::local_expansion_t;
-  using symbolic_data_t = CellClass;
-
 
   /// Get the Morton index Distribution at the leaf level
   ///
@@ -186,16 +187,16 @@ public:
                                   const int inLeafLevelSeperationCriteria = 1)
     : tree(inTree) ,
       kernels(nullptr),
-      comm(inComm),
+      OctreeHeight(tree->getHeight()),
       nbLevelsAboveRoot(inUpperLevel),
       offsetRealTree(inUpperLevel + 2),
+      leafLevelSeparationCriteria(inLeafLevelSeperationCriteria),
+      comm(inComm),
       numberOfLeafs(0),
       MaxThreads(FEnv::GetValue("SCALFMM_ALGO_NUM_THREADS",omp_get_max_threads())),
       nbProcess(inComm.processCount()),
       idProcess(inComm.processId()),
-      OctreeHeight(tree->getHeight()),
       userChunkSize(inUserChunkSize),
-      leafLevelSeparationCriteria(inLeafLevelSeperationCriteria),
       intervals(new Interval[inComm.processCount()]),
       workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) {
 
@@ -439,7 +440,9 @@ protected:
     Timers[L2LTimer].tac();
 
     Timers[NearTimer].tic();
-    if((operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P)) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));
+    if((operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P)) {
+        directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));
+      }
     Timers[NearTimer].tac();
 
     tree->forEachCell([this](CellClass* node){
@@ -504,7 +507,9 @@ protected:
   // Upward
   /////////////////////////////////////////////////////////////////////////////
 
-  /** M2M */
+  /** M2M
+ Build multipole expansion from height-1 to 1
+*/
   void upwardPass(){
     FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
     FLOG(FTic counterTime);
@@ -782,102 +787,151 @@ protected:
     //////////////////////////////////////////////////////////////////
     //Periodicity
     //////////////////////////////////////////////////////////////////
+    ///
+    ///  Build the mutipole and the local expansion ine the root cell
+    ///
 
     octreeIterator = typename OctreeClass::Iterator(tree);
 
     if( idProcess == 0){
+        // Build request to receive the size of the data
         int iterRequestsSize = 0;
-
+//        bool receiveData = false ;
         for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
             if( procHasWorkAtLevel(1,idxProc) ){
+   //             receiveData = true ;
                 MPI_Irecv(&recvBufferSize[iterRequestsSize], 1, FMpi::GetType(recvBufferSize[iterRequestsSize]), idxProc,
                           FMpi::TagFmmM2MSize, comm.getComm(), &requests[iterRequestsSize]);
                 iterRequestsSize += 1;
                 FAssertLF(iterRequestsSize <= 7);
               }
           }
-        MPI_Waitall( iterRequestsSize, requests, MPI_STATUSES_IGNORE);
+        //if(receiveData)
+         // {
+            // Build the rootCell Level if we don't have all data
+            MPI_Waitall( iterRequestsSize, requests, MPI_STATUSES_IGNORE);
+            // Build request to receive the data
+
+            int iterRequests = 0;
+
+            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
+                if( procHasWorkAtLevel(1,idxProc) ){
+                    recvBuffer[iterRequests].cleanAndResize(recvBufferSize[iterRequests]);
+                    FAssertLF(recvBufferSize[iterRequests] < std::numeric_limits<int>::max());
+                    MPI_Irecv(recvBuffer[iterRequests].data(), int(recvBufferSize[iterRequests]), MPI_BYTE, idxProc,
+                              FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests]);
+                    iterRequests += 1;
+                    FAssertLF(iterRequests <= 7);
+                  }
+              }
 
-        int iterRequests = 0;
+            MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE);
 
-        for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
-            if( procHasWorkAtLevel(1,idxProc) ){
-                recvBuffer[iterRequests].cleanAndResize(recvBufferSize[iterRequests]);
-                FAssertLF(recvBufferSize[iterRequests] < std::numeric_limits<int>::max());
-                MPI_Irecv(recvBuffer[iterRequests].data(), int(recvBufferSize[iterRequests]), MPI_BYTE, idxProc,
-                          FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests]);
-                iterRequests += 1;
-                FAssertLF(iterRequests <= 7);
-              }
-          }
+            CellClass* currentChild[8];
+            memcpy(currentChild, octreeIterator.getCurrentBox(), 8 * sizeof(CellClass*));
 
-        MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE);
+            // retreive data and merge my child and the child from others
+            int counterProc = 0;
+            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc){
+                if( procHasWorkAtLevel(1,idxProc) ){
+                    recvBuffer[counterProc].seek(0);
+                    unsigned state = unsigned(recvBuffer[counterProc].getValue<unsigned char>());
 
-        CellClass* currentChild[8];
-        memcpy(currentChild, octreeIterator.getCurrentBox(), 8 * sizeof(CellClass*));
+                    int position = 0;
+                    int positionToInsert = 0;
+                    while( state && position < 8){
+                        while(!(state & 0x1)){
+                            state >>= 1;
+                            ++position;
+                          }
+                        FAssertLF(positionToInsert < 7);
+                        FAssertLF(position < 8);
+                        FAssertLF(!currentChild[position], "Already has a cell here");
 
-        // retreive data and merge my child and the child from others
-        int counterProc = 0;
-        for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc){
-            if( procHasWorkAtLevel(1,idxProc) ){
-                recvBuffer[counterProc].seek(0);
-                unsigned state = unsigned(recvBuffer[counterProc].getValue<unsigned char>());
+                        recvBufferCells[positionToInsert].deserializeUp(recvBuffer[counterProc]);
+
+                        currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert];
 
-                int position = 0;
-                int positionToInsert = 0;
-                while( state && position < 8){
-                    while(!(state & 0x1)){
                         state >>= 1;
-                        ++position;
+                        position += 1;
+                        positionToInsert += 1;
                       }
-                    FAssertLF(positionToInsert < 7);
-                    FAssertLF(position < 8);
-                    FAssertLF(!currentChild[position], "Already has a cell here");
-
-                    recvBufferCells[positionToInsert].deserializeUp(recvBuffer[counterProc]);
 
-                    currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert];
-
-                    state >>= 1;
-                    position += 1;
-                    positionToInsert += 1;
+                    FAssertLF(recvBuffer[counterProc].tell() == recvBufferSize[counterProc]);
+                    counterProc += 1;
                   }
-
-                FAssertLF(recvBuffer[counterProc].tell() == recvBufferSize[counterProc]);
-                counterProc += 1;
               }
-          }
-
-
-        multipole_t* parent_multipole
-            = &(rootCellFromProc.getMultipoleData());
-        const symbolic_data_t* parent_symbolic
-            = &(rootCellFromProc);
-
-        std::array<const multipole_t*, 8> child_multipoles {};
-        std::transform(currentChild, currentChild + 8,
-                       std::begin(child_multipoles),
-                       [](const CellClass* c) {
-            return (c == nullptr)
-                ? nullptr
-                : &(c->getMultipoleData());
-          });
-        std::array<const symbolic_data_t*, 8> child_symbolics {};
-
-
-
+      //  }
+
+
+#ifdef USEQALGO
+            // set the reecived
+//            std::copy(currentChild, currentChild+8,
+//                      std::begin(child_symbolics));
+            typename OctreeClass::Iterator octreeIterator(tree);
+            octreeIterator.gotoLeft();
+
+//            multipole_t* const real_tree_root_multipole
+//                = &((&upperCells[offsetRealTree-1])->getMultipoleData());
+//            const symbolic_data_t* const real_tree_root_symbolic
+//                = &(upperCells[offsetRealTree-1]);
+
+            CellClass** children = octreeIterator.getCurrentBox();
+
+            std::transform(currentChild, currentChild + 8,
+                           std::begin(children),
+                           [](const CellClass* c) {
+                return (c == nullptr)
+                    ? nullptr
+                    : &(c->getMultipoleData());
+              });
+//            std::copy(currentChild, currentChild+8,
+//                      std::begin(child_symbolics));
+            //            std::copy(currentChild, currentChild+8,
+//                      std::begin(children));
+            this->processPeriodicLevelsSEQ();
+
+#else
+            // Build expansion at the rootCellFromProc
+                  multipole_t*           parent_multipole = &(rootCellFromProc.getMultipoleData());
+                  const symbolic_data_t* parent_symbolic  = &(rootCellFromProc);
+                  //
+                  std::array<const multipole_t*, 8> child_multipoles {};
+                  std::transform(currentChild, currentChild + 8,
+                                 std::begin(child_multipoles),
+                                 [](const CellClass* c) {
+                      return (c == nullptr)
+                          ? nullptr
+                          : &(c->getMultipoleData());
+                    });
+            std::array<const symbolic_data_t*, 8> child_symbolics {};
+            std::copy(currentChild, currentChild+8,
+                      std::begin(child_symbolics));
+
+            ///
+            /// // (*kernels[0]).M2M( &rootCellFromProc , currentChild, offsetRealTree);
+            ///
+            (*kernels[0]).M2M(
+                  parent_multipole,
+                  parent_symbolic,
+                  child_multipoles.data(),
+                  child_symbolics.data()
+                  );
 
-        // (*kernels[0]).M2M( &rootCellFromProc , currentChild, offsetRealTree);
-        (*kernels[0]).M2M(
-              parent_multipole,
-              parent_symbolic,
-              child_multipoles.data(),
-              child_symbolics.data()
-              );
+            ///  Now we have the multipole on the root level
 
-        processPeriodicLevels();
+            ///
+            ///  Treat at root level the periodicity
+            ///
+        ///
+        ///  Treat at root level the periodicity
+        ///
+        this->processPeriodicLevels();
+#endif
       }
     else {
+        //       OTHER PROCESS  (NOT 0)
+        //
         if( hasWorkAtLevel(1) ){
             const int firstChild = getWorkingInterval(1, idProcess).leftIndex & 7;
             const int lastChild  = getWorkingInterval(1, idProcess).rightIndex & 7;
@@ -899,6 +953,7 @@ protected:
             MPI_Send(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, 0, FMpi::TagFmmM2M, comm.getComm());
           }
       }
+    ///////////////////////////////////////////////////////////////////////////
   }
 
   /////////////////////////////////////////////////////////////////////////////
@@ -1392,13 +1447,17 @@ protected:
         FAssertLF(recvBuffer.getSize() == sizeOfSerialization);
         recvBuffer.seek(0);
       }
-
+    // all processors apply L2L(rootCellFromProc) and store the local expansion in child_local_expansions
+    // That is an alias of the local expansion n the octree at level 1
+    //
     {
       const local_expansion_t* parent_local_expansion
           = &(rootCellFromProc.getLocalExpansionData());
       const symbolic_data_t* parent_symbolic
           = &(rootCellFromProc);
+      std::cout <<    "  >> L2L   F rootCellFromProc  -> Tree Level1" <<std::endl;
 
+      // In fact child_local_expansions[i]  is an alias of &(children[i]->getLocalExpansionData())
       CellClass ** children = octreeIterator.getCurrentBox();
       std::array<local_expansion_t*, 8> child_local_expansions {};
       std::transform(children, children + 8,
@@ -1409,8 +1468,9 @@ protected:
             : &(c->getLocalExpansionData());
       });
       std::array<const symbolic_data_t*, 8> child_symbolics {};
-      std::copy(children, children + 8, std::begin(child_symbolics));
-
+  //    std::copy(children, children + 8, std::begin(child_symbolics));
+      std::transform(children, children+8, child_symbolics.begin(),
+                     [](CellClass* c) {return c;});
 
       kernels[0]->L2L(
             parent_local_expansion,
@@ -1970,18 +2030,18 @@ protected:
 			      for(int idxNeig = 0 ; idxNeig < counter ; ++idxNeig){
 				  if( !offsets[idxNeig].equals(0,0,0) ){  // We need to move the cell
 				      //
-                                      // Put periodic cell into another cell (copy data)
-                                      periodicNeighbors[periodicNeighborsCounter] = new ContainerClass(*(neighbors[idxNeig])) ;
+				      // Put periodic cell into another cell (copy data)
+				      periodicNeighbors[periodicNeighborsCounter] = new ContainerClass(*(neighbors[idxNeig])) ;
 				      // newPos = pos + ofsset
-                                      FReal*const positionsX = periodicNeighbors[periodicNeighborsCounter]->getPositions()[0];
-                                      FReal*const positionsY = periodicNeighbors[periodicNeighborsCounter]->getPositions()[1];
-                                      FReal*const positionsZ = periodicNeighbors[periodicNeighborsCounter]->getPositions()[2];
+				      FReal*const positionsX = periodicNeighbors[periodicNeighborsCounter]->getPositions()[0];
+				      FReal*const positionsY = periodicNeighbors[periodicNeighborsCounter]->getPositions()[1];
+				      FReal*const positionsZ = periodicNeighbors[periodicNeighborsCounter]->getPositions()[2];
 				      FReal  xoffset= boxWidth * FReal(offsets[idxNeig].getX()) ;
 				      FReal  yoffset= boxWidth * FReal(offsets[idxNeig].getY()) ;
 				      FReal  zoffset= boxWidth * FReal(offsets[idxNeig].getZ()) ;
-                                      for(FSize idxPart = 0; idxPart < periodicNeighbors[periodicNeighborsCounter]->getNbParticles() ; ++idxPart){
-                             //             std::cout << " Xo " << positionsX[idxPart]  << " " << positionsY[idxPart]  << " " << positionsZ[idxPart] <<std::endl;
-                               //           std::cout << " Offset " << xoffset  << " " << yoffset  << " " << zoffset <<std::endl;
+				      for(FSize idxPart = 0; idxPart < periodicNeighbors[periodicNeighborsCounter]->getNbParticles() ; ++idxPart){
+			     //             std::cout << " Xo " << positionsX[idxPart]  << " " << positionsY[idxPart]  << " " << positionsZ[idxPart] <<std::endl;
+			       //           std::cout << " Offset " << xoffset  << " " << yoffset  << " " << zoffset <<std::endl;
 
 					  positionsX[idxPart] += xoffset;
 					  positionsY[idxPart] += yoffset;
@@ -1990,37 +2050,37 @@ protected:
 					}
 
      //                                 periodicOffsets[periodicNeighborsCounter]           = offsets[idxNeig];
-				      periodicNeighborPositions[periodicNeighborsCounter] = neighborPositions[idxNeig];
-				      ++periodicNeighborsCounter;
-				    }
-				  else{  // set non periodic cells in neighbors and in neighborPositions
-				      // nonPeriodicCounter versus idxNeig-periodicNeighborsCounter
-				//      std::cout << nonPeriodicCounter <<"  "<< idxNeig-periodicNeighborsCounter <<std::endl;
-				      neighbors[idxNeig-periodicNeighborsCounter]         = neighbors[idxNeig];
-				      neighborPositions[idxNeig-periodicNeighborsCounter] = neighborPositions[idxNeig];
-				      ++nonPeriodicCounter ;
-				    }
-				}
-			      //
-			      //  Treat periodic interaction without mutual interaction
-			      //
-			      myThreadkernels->P2PRemote(currentIter.coord,currentIter.targets,
-							 currentIter.sources, periodicNeighbors,
-							 periodicNeighborPositions, periodicNeighborsCounter);
-
-			      for(int idxNeig = 0 ; idxNeig < periodicNeighborsCounter ; ++idxNeig){
+                                      periodicNeighborPositions[periodicNeighborsCounter] = neighborPositions[idxNeig];
+                                      ++periodicNeighborsCounter;
+                                    }
+                                  else{  // set non periodic cells in neighbors and in neighborPositions
+                                      // nonPeriodicCounter versus idxNeig-periodicNeighborsCounter
+                                //      std::cout << nonPeriodicCounter <<"  "<< idxNeig-periodicNeighborsCounter <<std::endl;
+                                      neighbors[idxNeig-periodicNeighborsCounter]         = neighbors[idxNeig];
+                                      neighborPositions[idxNeig-periodicNeighborsCounter] = neighborPositions[idxNeig];
+                                      ++nonPeriodicCounter ;
+                                    }
+                                }
+                              //
+                              //  Treat periodic interaction without mutual interaction
+                              //
+                              myThreadkernels->P2PRemote(currentIter.coord,currentIter.targets,
+                                                         currentIter.sources, periodicNeighbors,
+                                                         periodicNeighborPositions, periodicNeighborsCounter);
+
+                              for(int idxNeig = 0 ; idxNeig < periodicNeighborsCounter ; ++idxNeig){
                                 delete periodicNeighbors[idxNeig] ;
                               }
-			    }
-			  //
-			  // Now treat P2P interaction with non periodic cells
-			  //    Like in non periodic algorithm !!
-			  //
-			  myThreadkernels->P2P( currentIter.coord, currentIter.targets,
-						currentIter.sources, neighbors,
-						neighborPositions, counter-periodicNeighborsCounter);
-			}
-		    }
+                            }
+                          //
+                          // Now treat P2P interaction with non periodic cells
+                          //    Like in non periodic algorithm !!
+                          //
+                          myThreadkernels->P2P( currentIter.coord, currentIter.targets,
+                                                currentIter.sources, neighbors,
+                                                neighborPositions, counter-periodicNeighborsCounter);
+                        }
+                    }
 
 		}
 	      }
@@ -2390,14 +2450,29 @@ public:
  * Then the M2L by taking into account the periodicity directions
  * Then the border by using the precomputed M2M
  * Finally the L2L
+ * The algorithm above the root can be spited in 4 steps:
+ * begin{itemize}
+ * \item Upward pass, for level $0$ to $-d$ we compute the $2^{DIM}$ $M2M$ ($8$ per level in $3D$).
+ * \item Transfer pass, for level $0$ to $-d$ we compute the $6^{DIM}-3^{DIM}$ $M2L$ ($189$ per level in $3D$).
+ * For level $0$ to $-d+1$ we put the cell at position $(1,1,1)$ relatively to its parent.
+ * For the last $M2L$ at level $-d$ we put the cell at position $(0,0,0)$ relatively to its parent.
+ * \item Border pass, at level $-d-1$ we compute the border and apply it using $M2L$.
+ * \item Downward pass, from level $-d-1$ to $0$ we perform one $L2L$.
+ * There is only one sub-cell per $L2L$ and the position has to be the same as the one used during the Transfer pass.
+ * \end{itemize}
+ *
  */
+
   void processPeriodicLevels(){
     FLOG( FLog::Controller.write("\tStart Periodic Pass\n").write(FLog::Flush); );
     FLOG(FTic counterTime);
+    // the root level of the octree in the virtual array of cells
+    const int rootLevel = offsetRealTree-1 ;
+    rootCellFromProc.setLevel(rootLevel+1);
 
     if( nbLevelsAboveRoot != -1 ){
         // we will use offsetRealTree-1 cells but for simplicity allocate offsetRealTree
-        // upperCells[offsetRealTree-1] is root cell
+        // upperCells[offsetRealTree-1] is the root cell
 
         std::unique_ptr<multipole_t[]> virtual_multipoles
             (new multipole_t[offsetRealTree] {});
@@ -2409,14 +2484,23 @@ public:
         for(int i = 0; i < offsetRealTree; ++i) {
             virtual_symbolics[i].setLevel(i+1);
           }
-
-        {
-          const int idxLevel = offsetRealTree-1;
-          multipole_t* parent_multipole
-              = &(virtual_multipoles[idxLevel-1]);
+        /////////////////////////////////////////////////////////////////
+        ///  Build all multipoles in the fake tree
+        ///  M2M operator
+        if(rootLevel > 1){
+          //fill at rootLevel-1
+          // Proceed as follows because I don(t know how to set
+          // virtual_symbolics[rootLevel]) = rootCellFromProc
+          // virtual_multipoles[rootLevel] = rootCellFromProc.getMultipoleData()
+          //
+
+            multipole_t *  parent_multipole
+              = &(virtual_multipoles[rootLevel-1]);
           symbolic_data_t* parent_symbolic
-              = &(virtual_symbolics[idxLevel-1]);
+              = &(virtual_symbolics[rootLevel-1]);
 
+
+          // Fill children with root expansion
           std::array<const multipole_t*, 8> child_multipoles {};
           std::fill(std::begin(child_multipoles), std::end(child_multipoles),
                     &(rootCellFromProc.getMultipoleData()));
@@ -2430,11 +2514,12 @@ public:
                 child_multipoles.data(),
                 child_symbolics.data()
                 );
+
         }
         {
-          std::array<const multipole_t*, 8> child_multipoles {};
-          std::array<const symbolic_data_t*, 8> child_symbolics {};
-          for(int idxLevel = offsetRealTree-1-1 ; idxLevel > 1  ; --idxLevel){
+          std::array<const multipole_t*, 8>     child_multipoles {};
+          std::array<const symbolic_data_t*, 8> child_symbolics  {};
+          for(int idxLevel = rootLevel -1 ; idxLevel > 1  ; --idxLevel){
               multipole_t* parent_multipole
                   = &(virtual_multipoles[idxLevel-1]);
               symbolic_data_t* parent_symbolic
@@ -2451,27 +2536,38 @@ public:
                     );
             }
         }
+        /////////////////////////////////////////////////////////////////
+        ///  M2L
+        ///
+        ///  For the highest level the cell is set at position (0,0,0) relatively to its parent
+        ///  but for all other level the location is (1,1,1) (i.e index 7 in the height cells)
+        ///
+        ///
 
-        {
-          const int idxUpperLevel = 2;
+        if(rootLevel > 1){
+          // Build a virtual environment for the M2L at the topmost level
 
+          const int idxUpperLevel = 2;
           local_expansion_t* target_local_expansion
               = &(virtual_local_expansions[idxUpperLevel-1]);
           const symbolic_data_t* target_symbolic
               = &(virtual_symbolics[idxUpperLevel-1]);
 
-          std::array<const multipole_t*, 342> source_multipoles {};
+          std::array<const multipole_t*, 342>     source_multipoles {};
           std::array<const symbolic_data_t*, 342> source_symbolics  {};
 
-          int neighborPositions[342];
+          int neighborPositions[342]{};
           int counter = 0;
+
+          // The cells are all the same, duplicate the central cell and
+          // create fake neighbour positions
           for(int idxX = -3 ; idxX <= 2 ; ++idxX){
               for(int idxY = -3 ; idxY <= 2 ; ++idxY){
                   for(int idxZ = -3 ; idxZ <= 2 ; ++idxZ){
                       if( FMath::Abs(idxX) > 1 || FMath::Abs(idxY) > 1 || FMath::Abs(idxZ) > 1) {
                           source_multipoles[counter] = &virtual_multipoles[idxUpperLevel-1];
-                          source_symbolics[counter] = &virtual_symbolics[idxUpperLevel-1];
-                          neighborPositions[counter] = neighIndex(idxX,idxY,idxZ);
+                          source_symbolics[counter]  = &virtual_symbolics[idxUpperLevel-1];
+                          neighborPositions[counter] = this->neighIndex(idxX,idxY,idxZ);
                           ++counter;
                         }
                     }
@@ -2487,14 +2583,17 @@ public:
                 counter
                 );
         }
+        // note: the triple loop bounds are not the same than for the previous piece of code
+        // which handles the topmost virtual cell
 
         for(int idxUpperLevel = 3 ; idxUpperLevel <= offsetRealTree-1 ; ++idxUpperLevel) {
+
             local_expansion_t* target_local_expansion
                 = &(virtual_local_expansions[idxUpperLevel-1]);
             const symbolic_data_t* target_symbolic
                 = &(virtual_symbolics[idxUpperLevel-1]);
 
-            std::array<const multipole_t*, 342> source_multipoles {};
+            std::array<const multipole_t*, 342>     source_multipoles {};
             std::array<const symbolic_data_t*, 342> source_symbolics  {};
 
             int neighborPositions[342];
@@ -2504,7 +2603,7 @@ public:
                     for(int idxZ = -2 ; idxZ <= 3 ; ++idxZ){
                         if( FMath::Abs(idxX) > 1 || FMath::Abs(idxY) > 1 || FMath::Abs(idxZ) > 1) {
                             source_multipoles[counter] = &virtual_multipoles[idxUpperLevel-1];
-                            source_symbolics[counter] = &virtual_symbolics[idxUpperLevel-1];
+                            source_symbolics[counter]  = &virtual_symbolics[idxUpperLevel-1];
                             neighborPositions[counter] = neighIndex(idxX,idxY,idxZ);
                             ++counter;
                           }
@@ -2522,22 +2621,30 @@ public:
                   );
           }
         {
+          // M2L at the root level
+          int startIdx = -2, endIdx = 3 ;
+          if(rootLevel == 1) { // rootCell is the highest level
+              startIdx = -3, endIdx = 2 ;
+            }
+
           local_expansion_t* target_local_expansion
               = &(rootCellFromProc.getLocalExpansionData());
           const symbolic_data_t* target_symbolic
               = &(rootCellFromProc);
+//          std::cout <<    "  >> M2L  on rootCellFromProc    "<< " level: " << target_symbolic->getLevel()
+//                       << startIdx << " "  << endIdx <<  std::endl;
 
-          std::array<const multipole_t*, 342> source_multipoles {};
+          std::array<const multipole_t*, 342>     source_multipoles {};
           std::array<const symbolic_data_t*, 342> source_symbolics  {};
 
           int neighborPositions[342];
           int counter = 0;
-          for(int idxX = -2 ; idxX <= 3 ; ++idxX) {
-              for(int idxY = -2 ; idxY <= 3 ; ++idxY) {
-                  for(int idxZ = -2 ; idxZ <= 3 ; ++idxZ) {
+          for(int idxX = startIdx ; idxX <= endIdx ; ++idxX){
+              for(int idxY = startIdx ; idxY <= endIdx ; ++idxY){
+                  for(int idxZ = startIdx ; idxZ <= endIdx ; ++idxZ){
                       if( FMath::Abs(idxX) > 1 || FMath::Abs(idxY) > 1 || FMath::Abs(idxZ) > 1) {
                           source_multipoles[counter] = &(rootCellFromProc.getMultipoleData());
-                          source_symbolics[counter] = &rootCellFromProc;
+                          source_symbolics[counter]  = &rootCellFromProc;
                           neighborPositions[counter] = neighIndex(idxX,idxY,idxZ);
                           ++counter;
                         }
@@ -2554,20 +2661,28 @@ public:
                 counter
                 );
 
-
         }
-
+        /////////////////////////////////////////////////////////////////
+        ///
+        ///    L2L operator
+        /// two steps
+        ///    1) Run the L2L for all levels except for the lowest virtual level
+        ///          We only compute expansion for cell 0=(0,0,0)
+        ///    2) Run L2L on the true root level of tree (rootlevel in the full octree
+        ///          We only compute expansion for cell 7=(1,1,1)
+       ///
         {
           std::array<local_expansion_t*, 8> child_local_expansions {};
           std::array<const symbolic_data_t*, 8> child_symbolics {};
-          for(int idxLevel = 2 ; idxLevel <= offsetRealTree-1-1  ; ++idxLevel){
+
+          for(int idxLevel = 2 ; idxLevel <= rootLevel-1  ; ++idxLevel){
               const local_expansion_t* parent_local_expansion
                   = &(virtual_local_expansions[idxLevel-1]);
               const symbolic_data_t* parent_symbolic
                   = &(virtual_symbolics[idxLevel-1]);
 
               child_local_expansions[0] = &(virtual_local_expansions[idxLevel]);
-              child_symbolics[0] = &(virtual_symbolics[idxLevel]);
+              child_symbolics[0]        = &(virtual_symbolics[idxLevel]);
               kernels[0]->L2L(
                     parent_local_expansion,
                     parent_symbolic,
@@ -2576,35 +2691,36 @@ public:
                     );
             }
         }
-        {
-          const int idxLevel = offsetRealTree-1;
+        if(offsetRealTree >2){    // L2L from rootLevel+1 to level rootLevel
+       //   const int idxLevel = offsetRealTree-1;
 
           const local_expansion_t* parent_local_expansion
-              = &(virtual_local_expansions[idxLevel-1]);
+              = &(virtual_local_expansions[rootLevel-1]);
           const symbolic_data_t* parent_symbolic
-              = &(virtual_symbolics[idxLevel-1]);
-
-          std::array<local_expansion_t*, 8> child_local_expansions
-          { &(rootCellFromProc.getLocalExpansionData()) };
-          std::array<const symbolic_data_t*, 8> child_symbolics
-          { &(rootCellFromProc) };
-
+              = &(virtual_symbolics[rootLevel-1]);
+         // only the 7 element of the array is initialized the ofther are
+         //      set to the default value (nullptr)
+          std::array<local_expansion_t*,     8> child_local_expansions{};
+          std::array<const symbolic_data_t*, 8> child_symbolics{} ;
+          //
+          child_local_expansions[7] = &(rootCellFromProc.getLocalExpansionData()) ;
+          child_symbolics[7]        = &(rootCellFromProc) ;
           kernels[0]->L2L(
                 parent_local_expansion,
                 parent_symbolic,
                 child_local_expansions.data(),
                 child_symbolics.data()
                 );
-        }
 
-        // L2L from 0 to level 1
+        }
+        // Run L2L from the lowest vitual level to the highest real tree level
+        // Done in the classical downardPass()
 
       }
 
     FLOG( FLog::Controller << "\tFinished (@Periodic = "  << counterTime.tacAndElapsed() << " s)\n" );
   }
 
-
 };
 
 
-- 
GitLab