diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f52e37e51df5bd787e34c8b293a20c1d54bbc2f..7901be546b6e142d248c8030ef7ae923069cafd0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,10 +9,10 @@ endif(insource)
 # Options
 OPTION( SCALFMM_USE_CBLAS "Set to ON to build ScaFMM with BLAS" ON )
 OPTION( SCALFMM_USE_MPI "Set to ON to build ScaFMM with MPI" ON )
-OPTION( BUILD_TESTS "Set to ON to build fonctionnalities Tests" ON )
-OPTION( BUILD_UTESTS "Set to ON to build UTests" ON )
+OPTION( SCALFMM_BUILD_TESTS "Set to ON to build functionality tests" ON )
+OPTION( SCALFMM_BUILD_UTESTS "Set to ON to build UTests" ON )
 
-# MPI option
+# The MPI compiler wrapper has to be selected before project() is called
 if( SCALFMM_USE_MPI )
   SET(CMAKE_CXX_COMPILER mpicxx)
 endif()
@@ -42,14 +42,14 @@ CONFIGURE_FILE( ${CMAKE_SOURCE_DIR}/Src/ScalFmmConfig.h.cmake
 add_subdirectory(Src)
 
 # Build - Tests
-MESSAGE( STATUS "BUILD_TESTS = ${BUILD_TESTS}" )
-if( BUILD_TESTS )
+MESSAGE( STATUS "SCALFMM_BUILD_TESTS = ${SCALFMM_BUILD_TESTS}" )
+if( SCALFMM_BUILD_TESTS )
 	add_subdirectory(Tests)
 endif()
 
 # Build - UTests
-MESSAGE( STATUS "BUILD_UTESTS = ${BUILD_UTESTS}" )
-if( BUILD_UTESTS )
+MESSAGE( STATUS "SCALFMM_BUILD_UTESTS = ${SCALFMM_BUILD_UTESTS}" )
+if( SCALFMM_BUILD_UTESTS )
 	add_subdirectory(UTests)
 endif()
 
diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt
index c258069d83bf9c968dd5865466f4d9605b56e856..f930e4b78c532b6d07a917f09a167e40514f5683 100644
--- a/Src/CMakeLists.txt
+++ b/Src/CMakeLists.txt
@@ -20,6 +20,6 @@ add_library(
 
 # Adding the entire project dir as an include dir
 INCLUDE_DIRECTORIES(
-     ${CMAKE_BINARY_DIR}/Sources 
+     ${CMAKE_BINARY_DIR}/Src 
 )
 
diff --git a/Src/Core/FFmmAlgorithm.hpp b/Src/Core/FFmmAlgorithm.hpp
index 8010b0e45941ad28a35b7fc369223ed5146ebabf..ee0aacfc34ff7e4ada1d08a662c57d222cebb299 100644
--- a/Src/Core/FFmmAlgorithm.hpp
+++ b/Src/Core/FFmmAlgorithm.hpp
@@ -33,9 +33,6 @@ class FFmmAlgorithm : protected FAssertable{
     Octree* const tree;                                                     //< The octree to work on
     KernelClass<ParticleClass, CellClass, OctreeHeight>* const kernels;    //< The kernels
 
-    FDEBUG(FTic counterTime);                                              //< In case of debug: to count the elapsed time
-    FDEBUG(FTic computationCounter);                                   //< In case of debug: to  count computation time
-
 public:	
     /** The constructor need the octree and the kernels used for computation
       * @param inTree the octree to work on
@@ -62,23 +59,26 @@ public:
     void execute(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
 
-        kernels->init();
-
         bottomPass();
         upwardPass();
 
         downardPass();
 
         directPass();
+
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // P2M
+    /////////////////////////////////////////////////////////////////////////////
+
     /** P2M */
     void bottomPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Bottom Pass\n").write(FDebug::Flush) );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
+        FDEBUG(FTic counterTime);
+        FDEBUG(FTic computationCounter);
 
         FOctreeIterator octreeIterator(tree);
 
@@ -90,21 +90,23 @@ public:
             FDEBUG(computationCounter.tic());
             kernels->P2M( octreeIterator.getCurrentCell() , octreeIterator.getCurrentListSrc());
             FDEBUG(computationCounter.tac());
-            FDEBUG(totalComputation += computationCounter.elapsed());
         } while(octreeIterator.moveRight());
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Upward
+    /////////////////////////////////////////////////////////////////////////////
+
     /** M2M */
     void upwardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Upward Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
+        FDEBUG(FTic counterTime);
+        FDEBUG(FTic computationCounter);
 
         // Start from leal level - 1
         FOctreeIterator octreeIterator(tree);
@@ -122,27 +124,31 @@ public:
                 FDEBUG(computationCounter.tic());
                 kernels->M2M( octreeIterator.getCurrentCell() , octreeIterator.getCurrentChild(), idxLevel);
                 FDEBUG(computationCounter.tac());
-                FDEBUG(totalComputation += computationCounter.elapsed());
             } while(octreeIterator.moveRight());
 
             avoidGotoLeftIterator.moveUp();
             octreeIterator = avoidGotoLeftIterator;// equal octreeIterator.moveUp(); octreeIterator.gotoLeft();
         }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
+
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Downward
+    /////////////////////////////////////////////////////////////////////////////
+
     /** M2L L2L */
     void downardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
 
         { // first M2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+
             FOctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
 
@@ -153,25 +159,24 @@ public:
             for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                 // for each cells
                 do{
-                    FDEBUG(computationCounter.tic());
                     const int counter = tree->getDistantNeighbors(neighbors, octreeIterator.getCurrentGlobalIndex(),idxLevel);
+                    FDEBUG(computationCounter.tic());
                     if(counter) kernels->M2L( octreeIterator.getCurrentCell() , neighbors, counter, idxLevel);
                     FDEBUG(computationCounter.tac());
-                    FDEBUG(totalComputation += computationCounter.elapsed());
                 } while(octreeIterator.moveRight());
 
                 avoidGotoLeftIterator.moveDown();
                 octreeIterator = avoidGotoLeftIterator;
             }
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         }
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
-
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( totalComputation = 0 );
-        { // second L2L
+
+        { // second L2L            
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter );
+
             FOctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
 
@@ -185,26 +190,29 @@ public:
                     FDEBUG(computationCounter.tic());
                     kernels->L2L( octreeIterator.getCurrentCell() , octreeIterator.getCurrentChild(), idxLevel);
                     FDEBUG(computationCounter.tac());
-                    FDEBUG(totalComputation += computationCounter.elapsed());
                 } while(octreeIterator.moveRight());
 
                 avoidGotoLeftIterator.moveDown();
                 octreeIterator = avoidGotoLeftIterator;
             }
+
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Direct
+    /////////////////////////////////////////////////////////////////////////////
+
     /** P2P */
     void directPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Direct Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
+        FDEBUG(FTic counterTime);
+        FDEBUG(FTic computationCounter);
 
         const int heightMinusOne = OctreeHeight - 1;
 
@@ -216,16 +224,17 @@ public:
         do{
             FDEBUG(computationCounter.tic());
             kernels->L2P(octreeIterator.getCurrentCell(), octreeIterator.getCurrentListTargets());
+            FDEBUG(computationCounter.tac());
             // need the current particles and neighbors particles
             const int counter = tree->getLeafsNeighbors(neighbors, octreeIterator.getCurrentGlobalIndex(),heightMinusOne);
+            FDEBUG(computationCounter.tic());
             kernels->P2P( octreeIterator.getCurrentListTargets(), octreeIterator.getCurrentListSrc() , neighbors, counter);
             FDEBUG(computationCounter.tac());
-            FDEBUG(totalComputation += computationCounter.elapsed());
         } while(octreeIterator.moveRight());
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
+
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
diff --git a/Src/Core/FFmmAlgorithmThread.hpp b/Src/Core/FFmmAlgorithmThread.hpp
index b1b14fb07c1bc4c2da521d0e07cfc0cc4d7e0387..7826d5c7d1956918303f4fef34c920a57167aafa 100644
--- a/Src/Core/FFmmAlgorithmThread.hpp
+++ b/Src/Core/FFmmAlgorithmThread.hpp
@@ -40,15 +40,14 @@ class FFmmAlgorithmThread : protected FAssertable{
     typedef KernelClass<ParticleClass, CellClass, OctreeHeight> Kernel;
 
     Octree* const tree;                  //< The octree to work on
-    Kernel* kernels[FThreadNumbers];          //< The kernels
-
-    FDEBUG(FTic counterTime);                //< In case of debug: to count the elapsed time
-    FDEBUG(FTic computationCounter);     //< In case of debug: to  count computation time
+    Kernel** kernels;                    //< The kernels
 
     OctreeIterator* iterArray;
 
     static const int SizeShape = 3*3*3;
-    int shapeLeaf[SizeShape];
+    int shapeLeaf[SizeShape];    
+
+    const int MaxThreads;
 
 public:
     /** The constructor need the octree and the kernels used for computation
@@ -57,12 +56,12 @@ public:
       * An assert is launched if one of the arguments is null
       */
     FFmmAlgorithmThread(Octree* const inTree, Kernel* const inKernels)
-                      : tree(inTree) , iterArray(0) {
+                      : tree(inTree) , kernels(0), iterArray(0), MaxThreads(omp_get_max_threads()) {
 
         assert(tree, "tree cannot be null", __LINE__, __FILE__);
-        assert(kernels, "kernels cannot be null", __LINE__, __FILE__);
 
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        this->kernels = new Kernel*[MaxThreads];
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             this->kernels[idxThread] = new KernelClass<ParticleClass, CellClass, OctreeHeight>(*inKernels);
         }
 
@@ -71,9 +70,10 @@ public:
 
     /** Default destructor */
     virtual ~FFmmAlgorithmThread(){
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             delete this->kernels[idxThread];
         }
+        delete [] this->kernels;
     }
 
     /**
@@ -103,10 +103,6 @@ public:
         iterArray = new OctreeIterator[leafs];
         assert(iterArray, "iterArray bad alloc", __LINE__, __FILE__);
 
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
-            this->kernels[idxThread]->init();
-        }
-
         bottomPass();
         upwardPass();
 
@@ -120,11 +116,15 @@ public:
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // P2M
+    /////////////////////////////////////////////////////////////////////////////
+
     /** P2M */
     void bottomPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Bottom Pass\n").write(FDebug::Flush) );
-        FDEBUG( counterTime.tic() );
+        FDEBUG(FTic counterTime);
 
         OctreeIterator octreeIterator(tree);
         int leafs = 0;
@@ -135,8 +135,8 @@ public:
             ++leafs;
         } while(octreeIterator.moveRight());
 
-        FDEBUG(computationCounter.tic());
-        #pragma omp parallel num_threads(FThreadNumbers)
+        FDEBUG(FTic computationCounter);
+        #pragma omp parallel num_threads(MaxThreads) // never more threads than kernels[] entries
         {
             Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
             #pragma omp for
@@ -146,20 +146,23 @@ public:
                 myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
             }
         }
-        FDEBUG(computationCounter.tac());
+        FDEBUG(computationCounter.tac() );
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
         FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Upward
+    /////////////////////////////////////////////////////////////////////////////
+
     /** M2M */
     void upwardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Upward Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
+        FDEBUG(FTic counterTime);
+        FDEBUG(FTic computationCounter);
 
         // Start from leal level - 1
         OctreeIterator octreeIterator(tree);
@@ -179,7 +182,7 @@ public:
             octreeIterator = avoidGotoLeftIterator;// equal octreeIterator.moveUp(); octreeIterator.gotoLeft();
 
             FDEBUG(computationCounter.tic());
-            #pragma omp parallel num_threads(FThreadNumbers)
+            #pragma omp parallel num_threads(MaxThreads) // never more threads than kernels[] entries
             {
                 Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                 #pragma omp for
@@ -190,23 +193,27 @@ public:
                 }
             }
             FDEBUG(computationCounter.tac());
-            FDEBUG(totalComputation += computationCounter.elapsed());
         }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
+
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Downward
+    /////////////////////////////////////////////////////////////////////////////
+
     /** M2L L2L */
     void downardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
 
         { // first M2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+
             OctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
             OctreeIterator avoidGotoLeftIterator(octreeIterator);
@@ -223,7 +230,7 @@ public:
                 octreeIterator = avoidGotoLeftIterator;
 
                 FDEBUG(computationCounter.tic());
-                #pragma omp parallel num_threads(FThreadNumbers)
+                #pragma omp parallel num_threads(MaxThreads) // never more threads than kernels[] entries
                 {
                     Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                     CellClass* neighbors[208];
@@ -234,17 +241,17 @@ public:
                     }
                 }
                 FDEBUG(computationCounter.tac());
-                FDEBUG(totalComputation += computationCounter.elapsed());
             }
+
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         }
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
 
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( totalComputation = 0 );
         { // second L2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+
             OctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
 
@@ -263,7 +270,7 @@ public:
                 octreeIterator = avoidGotoLeftIterator;
 
                 FDEBUG(computationCounter.tic());
-                #pragma omp parallel num_threads(FThreadNumbers)
+                #pragma omp parallel num_threads(MaxThreads) // never more threads than kernels[] entries
                 {
                     Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                     #pragma omp for
@@ -272,21 +279,25 @@ public:
                     }
                 }
                 FDEBUG(computationCounter.tac());
-                FDEBUG(totalComputation += computationCounter.elapsed());
             }
+
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Direct
+    /////////////////////////////////////////////////////////////////////////////
+
     /** P2P */
     void directPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Direct Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
+        FDEBUG(FTic counterTime);
+        FDEBUG(FTic computationCounter);
 
         OctreeIterator* shapeArray[SizeShape];
         int countShape[SizeShape];
@@ -317,7 +328,7 @@ public:
         FDEBUG(computationCounter.tic());
         for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
             const int leafAtThisShape = this->shapeLeaf[idxShape];
-            #pragma omp parallel num_threads(FThreadNumbers)
+            #pragma omp parallel num_threads(MaxThreads) // never more threads than kernels[] entries
             {
                 Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                 // There is a maximum of 26 neighbors
@@ -338,9 +349,8 @@ public:
             delete [] shapeArray[idxShape];
         }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
diff --git a/Src/Core/FFmmAlgorithmThreadProc.hpp b/Src/Core/FFmmAlgorithmThreadProc.hpp
index f53c638c50bceab6df36e31ae02041d7778e491a..4b88452e0e6fe52ae2bf58cfa85c64d4f5d339f3 100644
--- a/Src/Core/FFmmAlgorithmThreadProc.hpp
+++ b/Src/Core/FFmmAlgorithmThreadProc.hpp
@@ -44,20 +44,17 @@ typedef FSingleApplication ApplicationImplementation;
 * schedule(runtime)
 */
 template<template< class ParticleClass, class CellClass, int OctreeHeight> class KernelClass,
-        class ParticleClass, class CellClass,
-        template<class ParticleClass> class LeafClass,
-        int OctreeHeight, int SubtreeHeight>
-class FFmmAlgorithmThreadProc : protected FAssertable, protected ApplicationImplementation{
+class ParticleClass, class CellClass,
+template<class ParticleClass> class LeafClass,
+int OctreeHeight, int SubtreeHeight>
+        class FFmmAlgorithmThreadProc : protected FAssertable, protected ApplicationImplementation{
     // To reduce the size of variable type based on foctree in this file
     typedef FOctree<ParticleClass, CellClass, LeafClass, OctreeHeight, SubtreeHeight> Octree;
     typedef typename FOctree<ParticleClass, CellClass,LeafClass, OctreeHeight, SubtreeHeight>::Iterator OctreeIterator;
     typedef KernelClass<ParticleClass, CellClass, OctreeHeight> Kernel;
 
     Octree* const tree;                  //< The octree to work on
-    Kernel* kernels[FThreadNumbers];          //< The kernels
-
-    FDEBUG(FTic counterTime);                //< In case of debug: to count the elapsed time
-    FDEBUG(FTic computationCounter);     //< In case of debug: to  count computation time
+    Kernel** kernels;                    //< The kernels
 
     OctreeIterator* iterArray;
     OctreeIterator* previousIterArray;
@@ -69,6 +66,8 @@ class FFmmAlgorithmThreadProc : protected FAssertable, protected ApplicationImpl
     int leftOffsets[OctreeHeight];
     int rightOffsets[OctreeHeight];
 
+    const int MaxThreads;
+
     void run(){}
 
     void swapArray(){
@@ -84,13 +83,14 @@ public:
       * An assert is launched if one of the arguments is null
       */
     FFmmAlgorithmThreadProc(Octree* const inTree, Kernel* const inKernels, const int inArgc, char ** const inArgv )
-                      : ApplicationImplementation(inArgc,inArgv), tree(inTree) , iterArray(0),
-                        previousIterArray(0), previousLeft(0),previousRight(0), previousSize(0) {
+        : ApplicationImplementation(inArgc,inArgv), tree(inTree) , kernels(0), iterArray(0),
+        previousIterArray(0), previousLeft(0),previousRight(0), previousSize(0),
+        MaxThreads(omp_get_max_threads()) {
 
         assert(tree, "tree cannot be null", __LINE__, __FILE__);
-        assert(kernels, "kernels cannot be null", __LINE__, __FILE__);
 
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        this->kernels = new Kernel*[MaxThreads];
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             this->kernels[idxThread] = new KernelClass<ParticleClass, CellClass, OctreeHeight>(*inKernels);
         }
 
@@ -99,9 +99,10 @@ public:
 
     /** Default destructor */
     virtual ~FFmmAlgorithmThreadProc(){
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             delete this->kernels[idxThread];
         }
+        delete [] this->kernels;
     }
 
     /**
@@ -125,11 +126,6 @@ public:
         previousIterArray = new OctreeIterator[leafs];
         assert(previousIterArray, "previousIterArray bad alloc", __LINE__, __FILE__);
 
-        // init kernels
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
-            this->kernels[idxThread]->init();
-        }
-
         // init offsets
         for(int idxOff = 0 ; idxOff < OctreeHeight ; ++idxOff){
             leftOffsets[idxOff] = 0;
@@ -154,6 +150,10 @@ public:
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Utils functions
+    /////////////////////////////////////////////////////////////////////////////
+
     int getLeft(const int idProc, const int inSize, const int nbOfProc) const {
         const float step = (float(inSize) / nbOfProc);
         return int(FMath::Ceil(step * idProc));
@@ -171,11 +171,15 @@ public:
         return int(position/step);
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // P2M
+    /////////////////////////////////////////////////////////////////////////////
+
     /** P2M */
     void bottomPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Bottom Pass\n").write(FDebug::Flush) );
-        FDEBUG( counterTime.tic() );
+        FDEBUG(FTic counterTime);
 
         OctreeIterator octreeIterator(tree);
         const int nbProcess = processCount();
@@ -196,11 +200,11 @@ public:
         this->previousRight = endIdx - 1;
         this->previousSize = leafs;
 
-        FDEBUG(computationCounter.tic());
-        #pragma omp parallel num_threads(FThreadNumbers)
+        FDEBUG(FTic computationCounter);
+#pragma omp parallel num_threads(MaxThreads) // never more threads than kernels[] entries
         {
             Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
-            #pragma omp for
+#pragma omp for
             for(int idxLeafs = startIdx ; idxLeafs < endIdx ; ++idxLeafs){
                 // We need the current cell that represent the leaf
                 // and the list of particles
@@ -214,18 +218,24 @@ public:
         this->previousRight = endIdx - 1;
         this->previousSize = leafs;
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
+
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
         FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Upward
+    /////////////////////////////////////////////////////////////////////////////
+
     /** M2M */
     void upwardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Upward Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
+        FDEBUG(FTic counterTime);
+        FDEBUG(FTic computationCounter);
+        FDEBUG(FTic sendCounter);
+        FDEBUG(FTic receiveCounter);
 
         // Start from leal level - 1
         OctreeIterator octreeIterator(tree);
@@ -238,7 +248,6 @@ public:
 
         // for each levels
         for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
-            //print();
 
             int leafs = 0;
             // for each cells
@@ -252,37 +261,32 @@ public:
             const int startIdx = getLeft(idPorcess,leafs,nbProcess);
             const int endIdx = getRight(idPorcess,leafs,nbProcess);
 
-            //std::cout << idPorcess << ">>--startIdx " << (startIdx) << " endIdx " << (endIdx) << std::endl;
-            //std::cout << idPorcess << ">>--previousLeft " << (previousLeft) << " previousRight " << (previousRight) << std::endl;
-
-            //std::cout << "level " << idxLevel << " start " << startIdx << " end " << endIdx << std::endl;
-
             if(startIdx < leafs){
+                FDEBUG(sendCounter.tic());
                 int leftOffset = 0;
                 {
                     const MortonIndex MostLeftChild = iterArray[startIdx].getCurrentGlobalIndex() << 3;
                     const MortonIndex leftChildIter = previousIterArray[previousLeft].getCurrentGlobalIndex();
-                    //std::cout << idPorcess << ">>--MostLeftChild " << (MostLeftChild) << " leftChildIter " << (leftChildIter) << std::endl;
+
                     if(leftChildIter < MostLeftChild){
                         int parentOffset = startIdx - 1;
                         MortonIndex parentIndex = iterArray[parentOffset].getCurrentGlobalIndex();
                         MortonIndex childIndex = 0;
                         while( (childIndex = previousIterArray[previousLeft+leftOffset].getCurrentGlobalIndex()) < MostLeftChild){
                             childIndex >>= 3;
-                            //std::cout << "before loop" << std::endl;
+
                             while(childIndex != parentIndex){
                                 if(childIndex < parentIndex) --parentOffset;
                                 else ++parentOffset;
-                                //std::cout << "parentOffset " << parentOffset << " parentIndex " << parentIndex << " childIndex " << childIndex << std::endl;
+
                                 parentIndex = iterArray[parentOffset].getCurrentGlobalIndex();
-                                //std::cout << "parentOffset " << parentOffset << " parentIndex " << parentIndex << " childIndex " << childIndex << std::endl;
+
                             }
-                            //std::cout << "before send" << std::endl;
+
                             const int idxReceiver = getProc(parentOffset,leafs,nbProcess);
                             sendData(idxReceiver,sizeof(CellClass),previousIterArray[this->previousLeft+leftOffset].getCurrentCell(),previousLeft+leftOffset);
-                            //std::cout << idPorcess << "\t>>-- sends left to " << (idxReceiver) << " index " << (previousLeft+leftOffset) << std::endl;
+
                             ++leftOffset;
-                            //std::cout << "before end big loop" << std::endl;
                         }
                     }
                     else if(this->previousLeft > 0 && leftChildIter > MostLeftChild){
@@ -291,13 +295,12 @@ public:
                         }
                     }
                 }
-                //std::cout << idPorcess << ">>--leftOffset " << (leftOffset) << std::endl;
 
                 int rightOffset = 0;
                 {
                     const MortonIndex MostRightChild = (iterArray[endIdx-1].getCurrentGlobalIndex() << 3) | 7;
                     const MortonIndex rightChildIter = previousIterArray[previousRight].getCurrentGlobalIndex();
-                    //std::cout << idPorcess << ">>--MostRightChild " << (MostRightChild) << " rightChildIter " << (rightChildIter) << std::endl;
+
                     if(this->previousRight < this->previousSize - 1 && rightChildIter < MostRightChild){
                         while( previousIterArray[previousRight-rightOffset+1].getCurrentGlobalIndex() <= MostRightChild){
                             --rightOffset;
@@ -316,60 +319,55 @@ public:
                             }
                             const int idxReceiver = getProc(parentOffset,leafs,nbProcess);
                             sendData(idxReceiver,sizeof(CellClass),previousIterArray[this->previousRight-rightOffset].getCurrentCell(),previousRight-rightOffset);
-                            //std::cout << idPorcess << "\t>>-- sends right to " << (idxReceiver) << " index " << (previousRight+rightOffset) << std::endl;
+
                             ++rightOffset;
                         }
                     }
                 }
-                //std::cout << idPorcess << ">>--rightOffset " << (rightOffset) << std::endl;
+                FDEBUG(sendCounter.tac());
 
                 leftOffsets[idxLevel+1] = leftOffset;
                 rightOffsets[idxLevel+1] = rightOffset;
 
-                #pragma omp parallel num_threads(FThreadNumbers)
+#pragma omp parallel
                 {
                     // received computed data
-                    #pragma omp single
+#pragma omp single
                     {
+                        FDEBUG(receiveCounter.tic());
                         int needToReceive = FMath::Max(0,-rightOffset) + FMath::Max(0,-leftOffset);
                         CellClass tempCell;
                         int source = 0, tag = 0, filled = 0;
 
-                        //std::cout << idPorcess <<  ">>--Will receive " << needToReceive << std::endl;
-
                         while(needToReceive){
                             receiveData(sizeof(CellClass),&tempCell,&source,&tag,&filled);
                             if(filled){
                                 *previousIterArray[tag].getCurrentCell() = tempCell;
                             }
                             --needToReceive;
-                            //std::cout << idPorcess <<  ">>receive tag " << (tag) << " tempCell.up " << tempCell.getDataUp()  << " source " << source << std::endl;
                         }
-                        //std::cout << idPorcess <<  ">>--All receive--" << std::endl;
+                        FDEBUG(receiveCounter.tac());
                     }
 
-                    #pragma omp single nowait
+#pragma omp single nowait
                     {
                         FDEBUG(computationCounter.tic());
                     }
 
                     Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
-                    #pragma omp for
+#pragma omp for
                     for(int idxLeafs = startIdx ; idxLeafs < endIdx ; ++idxLeafs){
                         myThreadkernels->M2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentChild(), idxLevel);
                     }
 
-                    #pragma omp single nowait
+#pragma omp single nowait
                     {
                         FDEBUG(computationCounter.tac());
-                        FDEBUG(totalComputation += computationCounter.elapsed());
                     }
 
                 }
             }
             else {
-                //std::cout << "I am out startIdx " << startIdx << " endIdx " << endIdx << std::endl;
-
                 int parentOffset = leafs - 1;
                 MortonIndex parentIndex = iterArray[parentOffset].getCurrentGlobalIndex();
 
@@ -381,11 +379,9 @@ public:
                     }
                     const int idxReceiver = getProc(parentOffset,leafs,nbProcess);
                     sendData(idxReceiver,sizeof(CellClass),previousIterArray[idxLeafs].getCurrentCell(),idxLeafs);
-                    //std::cout << idPorcess << "\t>>-- sends all to " << (idxReceiver) << " index " << idxLeafs << std::endl;
                 }
 
                 leftOffsets[idxLevel+1] = (previousRight-previousLeft) + 1;
-                //std::cout << "left off set at " << idxLevel+1 << " = " << leftOffsets[idxLevel+1] << std::endl;
             }
 
             swapArray();
@@ -396,22 +392,28 @@ public:
             processBarrier();
         }
 
-        //print();
-
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
+        FDEBUG( FDebug::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
+        FDEBUG( FDebug::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Downward
+    /////////////////////////////////////////////////////////////////////////////
+
     /** M2L L2L */
     void downardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
 
         { // first M2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+            FDEBUG(FTic sendCounter);
+            FDEBUG(FTic receiveCounter);
+
             OctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
             OctreeIterator avoidGotoLeftIterator(octreeIterator);
@@ -450,17 +452,18 @@ public:
                     alreadySent[idxProc] = new FBoolArray(leafs);
                 }
 
-                //std::cout << "There are " << leafs << " leafs" << std::endl;
-
-                //print();
-
-                #pragma omp parallel num_threads(FThreadNumbers)
+#pragma omp parallel
                 {
                     CellClass* neighbors[208];
                     MortonIndex neighborsIndexes[208];
                     Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
 
-                    #pragma omp for
+#pragma omp single nowait
+                    {
+                        FDEBUG(sendCounter.tic());
+                    }
+
+#pragma omp for
                     for(int idxLeafs = startIdx ; idxLeafs < endIdx ; ++idxLeafs){
                         const int neighborsCounter = tree->getDistantNeighborsWithIndex(neighbors, neighborsIndexes, iterArray[idxLeafs].getCurrentGlobalIndex(),idxLevel);
                         bool needData = false;
@@ -495,17 +498,15 @@ public:
 
                                 // Find receiver and send him the cell
                                 const int idxReceiver = getProc(cellPositionInArray,leafs,nbProcess);
-                                #pragma omp critical(CheckToSend)
+#pragma omp critical(CheckToSend)
                                 {
                                     if(!alreadySent[idxReceiver]->get(idxLeafs)){
-                                        //std::cout << idPorcess << ">>--idxLeafs " << (idxLeafs) << " idxReceiver " << (idxReceiver)
-                                        //        << " cellPositionInArray " << (cellPositionInArray)  << " indexCell " << indexCell<< std::endl;
                                         sendData(idxReceiver,sizeof(CellClass),iterArray[idxLeafs].getCurrentCell(),idxLeafs);
                                         alreadySent[idxReceiver]->set(idxLeafs,true);
                                         needData = true;
                                     }
                                 }
-                                #pragma omp critical(CheckToReceive)
+#pragma omp critical(CheckToReceive)
                                 {
                                     if(!alreadySent[idPorcess]->get(cellPositionInArray)){
                                         ++needToReceive;
@@ -516,7 +517,6 @@ public:
                             }
                         }
                         if(needData){
-                            //std::cout << idPorcess <<  ">>this cell need data " << idxLeafs << " index " << iterArray[idxLeafs].getCurrentGlobalIndex() << " neighborsCounter " << neighborsCounter << std::endl;
                             const int currentCell = idxLeafs - startIdx;
                             unfinishedCells[currentCell] = new LimitCell();
                             unfinishedCells[currentCell]->counter = neighborsCounter;
@@ -524,15 +524,19 @@ public:
                             alreadySent[idPorcess]->set(idxLeafs,true);
                         }
                         else if(neighborsCounter){
-                            //std::cout << idPorcess <<  ">>compute directly " << idxLeafs << " index " << iterArray[idxLeafs].getCurrentGlobalIndex() << std::endl;
                             myThreadkernels->M2L(  iterArray[idxLeafs].getCurrentCell() , neighbors, neighborsCounter, idxLevel);
                         }
                     }
 
+#pragma omp single nowait
+                    {
+                        FDEBUG(sendCounter.tac());
+                    }
+
                     // received computed data
-                    #pragma omp single
+#pragma omp single
                     {
-                        //std::cout << idPorcess << ">>--needToReceive " << (needToReceive) << std::endl;
+                        FDEBUG(receiveCounter.tic());
 
                         CellClass tempCell;
                         int source = 0, tag = 0, filled = 0;
@@ -543,19 +547,27 @@ public:
                                 *iterArray[tag].getCurrentCell() = tempCell;
                             }
                             --needToReceive;
-
-                            //std::cout << idPorcess <<  ">>receive tag " << (tag) << " tempCell.up " << tempCell.getDataUp() << std::endl;
                         }
+                        FDEBUG(receiveCounter.tac());
                     }
 
-                    #pragma omp for
+#pragma omp single nowait
+                    {
+                        FDEBUG(computationCounter.tic());
+                    }
+
+#pragma omp for
                     for(int idxLeafs = startIdx ; idxLeafs < endIdx ; ++idxLeafs){
                         if(alreadySent[idPorcess]->get(idxLeafs)){
-                            //std::cout << idPorcess <<  ">>finish to compute " << idxLeafs << " index " << iterArray[idxLeafs].getCurrentGlobalIndex() << std::endl;
                             myThreadkernels->M2L(  iterArray[idxLeafs].getCurrentCell() , unfinishedCells[idxLeafs-startIdx]->neighbors, unfinishedCells[idxLeafs-startIdx]->counter, idxLevel);
                             delete unfinishedCells[idxLeafs-startIdx];
                         }
                     }
+
+#pragma omp single nowait
+                    {
+                        FDEBUG(computationCounter.tac());
+                    }
                 }
 
                 delete [] unfinishedCells;
@@ -567,18 +579,20 @@ public:
                 processBarrier();
 
             }
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
+            FDEBUG( FDebug::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
+            FDEBUG( FDebug::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
         }
 
-        //print();
-
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
 
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( totalComputation = 0 );
         { // second L2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+            FDEBUG(FTic sendCounter);
+            FDEBUG(FTic receiveCounter);
+
             OctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
 
@@ -589,61 +603,50 @@ public:
             const int idPorcess = processId();
 
             // for each levels exepted leaf level
-            for(int idxLevel = 2 ; idxLevel <= heightMinusOne ; ++idxLevel ){
-                //print();
-
-                // keep data
-                /*swapArray();
-                this->previousLeft = startIdx;
-                this->previousRight = endIdx - 1;
-                this->previousSize = leafs;*/
-
-                 int leafs = 0;
-                 // for each cells
-                 do{
-                     iterArray[leafs] = octreeIterator;
-                     ++leafs;
-                 } while(octreeIterator.moveRight());
-                 avoidGotoLeftIterator.moveDown();
-                 octreeIterator = avoidGotoLeftIterator;
+            for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
 
-                 const int startIdx = getLeft(idPorcess,leafs,nbProcess);
-                 const int endIdx = getRight(idPorcess,leafs,nbProcess);
+                int leafs = 0;
+                // for each cells
+                do{
+                    iterArray[leafs] = octreeIterator;
+                    ++leafs;
+                } while(octreeIterator.moveRight());
+                avoidGotoLeftIterator.moveDown();
+                octreeIterator = avoidGotoLeftIterator;
 
-                 std::cout << "At level " << idxLevel << " left " << startIdx << " right " << endIdx << std::endl;
+                const int startIdx = getLeft(idPorcess,leafs,nbProcess);
+                const int endIdx = getRight(idPorcess,leafs,nbProcess);
 
-                 const int currentLeft = startIdx;
-                 const int currentRight = endIdx -1;
+                const int currentLeft = startIdx;
+                const int currentRight = endIdx -1;
 
-                #pragma omp parallel num_threads(FThreadNumbers)
+#pragma omp parallel
                 {
                     // send computed data
-                    #pragma omp single nowait
+#pragma omp single nowait
                     {
+                        FDEBUG(sendCounter.tic());
                         const int leftOffset = -leftOffsets[idxLevel];
                         for(int idxLeafs = 1 ; idxLeafs <= leftOffset ; ++idxLeafs){
                             const int idxReceiver = getProc((currentLeft-idxLeafs),leafs,nbProcess);
                             sendData(idxReceiver,sizeof(CellClass),iterArray[currentLeft-idxLeafs].getCurrentCell(),currentLeft-idxLeafs);
-                            //std::cout << idPorcess << "\t>>-- sends (1) to " << (idxReceiver) << " index " << (currentLeft-idxLeafs) << std::endl;
                         }
                         const int rightOffset = -rightOffsets[idxLevel];
                         for(int idxLeafs = 1 ; idxLeafs <= rightOffset ; ++idxLeafs){
                             const int idxReceiver = getProc((currentRight+idxLeafs),leafs,nbProcess);
                             sendData(idxReceiver,sizeof(CellClass),iterArray[currentRight+idxLeafs].getCurrentCell(),currentRight+idxLeafs);
-                            //std::cout << idPorcess << "\t>>-- sends (2) to " << (idxReceiver) << " index " << (currentRight+idxLeafs) << " currentRight " << currentRight << std::endl;
                         }
-                        //std::cout << idPorcess << ">>--Will send " << (leftOffset) << " and " << (rightOffset) << std::endl;
+                        FDEBUG(sendCounter.tac());
                     }
 
                     // received computed data
-                    #pragma omp single
+#pragma omp single
                     {
+                        FDEBUG(receiveCounter.tic());
                         int needToReceive = FMath::Max(0,rightOffsets[idxLevel]) + FMath::Max(0,leftOffsets[idxLevel]);
                         CellClass tempCell;
                         int source = 0, tag = 0, filled = 0;
 
-                        //std::cout << idPorcess << ">>--needToReceive " << (needToReceive) << std::endl;
-
                         while(needToReceive){
                             receiveData(sizeof(CellClass),&tempCell,&source,&tag,&filled);
                             if(filled){
@@ -651,40 +654,44 @@ public:
                             }
                             --needToReceive;
 
-                            //std::cout << idPorcess <<  ">>receive tag " << (tag) << " tempCell.down " << tempCell.getDataDown() << " from " << source << std::endl;
                         }
-                        //std::cout << "all received" << std::endl;
+                        FDEBUG(receiveCounter.tac());
                     }
                 }
 
-                 if(idxLevel != heightMinusOne){
-                    #pragma omp parallel num_threads(FThreadNumbers)
+                if(idxLevel != heightMinusOne){
+                    FDEBUG(computationCounter.tic());
+#pragma omp parallel
                     {
                         Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
-                        #pragma omp for
+#pragma omp for
                         for(int idxLeafs = startIdx ; idxLeafs < endIdx ; ++idxLeafs){
                             myThreadkernels->L2L( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentChild(), idxLevel);
                         }
                     }
-                 }
-
-                 processBarrier();
+                    FDEBUG(computationCounter.tac());
+                    processBarrier();
+                }
             }
-        }
 
-        //print();
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
+            FDEBUG( FDebug::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
+            FDEBUG( FDebug::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
+        }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Direct
+    /////////////////////////////////////////////////////////////////////////////
+
     /** P2P */
     void directPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Direct Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
+        FDEBUG(FTic counterTime);
 
         const int LeafIndex = OctreeHeight - 1;
         int leafs = 0;
@@ -698,7 +705,7 @@ public:
             } while(octreeIterator.moveRight());
         }
 
-        FDEBUG(computationCounter.tic());
+        FDEBUG(FTic computationCounter);
 
         const int nbProcess = processCount();
         const int idPorcess = processId();
@@ -706,13 +713,13 @@ public:
         const int startIdx = getLeft(idPorcess,leafs,nbProcess);
         const int endIdx = getRight(idPorcess,leafs,nbProcess);
 
-        #pragma omp parallel num_threads(FThreadNumbers)
+#pragma omp parallel
         {
             Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
             // There is a maximum of 26 neighbors
             FList<ParticleClass*>* neighbors[26];
 
-            #pragma omp for
+#pragma omp for
             for(int idxLeafs = startIdx ; idxLeafs < endIdx ; ++idxLeafs){
                 myThreadkernels->L2P(iterArray[idxLeafs].getCurrentCell(), iterArray[idxLeafs].getCurrentListTargets());
                 // need the current particles and neighbors particles
@@ -722,12 +729,16 @@ public:
         }
         FDEBUG(computationCounter.tac());
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
+
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
         FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+    /////////////////////////////////////////////////////////////////////////////
+    // Test function
+    /////////////////////////////////////////////////////////////////////////////
+
     /** This function test the octree to be sure that the fmm algorithm
       * has worked completly.
       */
@@ -740,8 +751,6 @@ public:
             OctreeIterator octreeIteratorValide(valideTree);
             octreeIteratorValide.gotoBottomLeft();
 
-            //std::cout << "We start at level " << OctreeHeight - 1 << std::endl;
-
             for(int level = OctreeHeight - 1 ; level > 0 ; --level){
                 int NbLeafs = 0;
                 do{
@@ -752,7 +761,6 @@ public:
                 const int startIdx = getLeft(processId(),NbLeafs,processCount());
                 const int endIdx = getRight(processId(),NbLeafs,processCount());
                 // Check that each particle has been summed with all other
-                //std::cout << "level " << level << " start " << startIdx << " end " << endIdx << std::endl;
 
                 for(int idx = 0 ; idx < startIdx ; ++idx){
                     octreeIterator.moveRight();
@@ -828,23 +836,11 @@ public:
         }
 
         std::cout << "Done\n";
-
-        //print();
-        //print(valideTree);
-    }
-
-    void print(){
-        OctreeIterator octreeIterator(tree);
-        for(int idxLevel = OctreeHeight - 1 ; idxLevel > 1 ; --idxLevel ){
-            do{
-                std::cout << "[" << octreeIterator.getCurrentGlobalIndex() << "] up:" << octreeIterator.getCurrentCell()->getDataUp() << " down:" << octreeIterator.getCurrentCell()->getDataDown() << "\t";
-            } while(octreeIterator.moveRight());
-            std::cout << "\n";
-            octreeIterator.gotoLeft();
-            octreeIterator.moveDown();
-        }
     }
 
+    /** To print an octree
+      * used to debug and understand how the values were passed
+      */
     void print(Octree* const valideTree){
         OctreeIterator octreeIterator(valideTree);
         for(int idxLevel = OctreeHeight - 1 ; idxLevel > 1 ; --idxLevel ){
diff --git a/Src/Core/FFmmAlgorithmThreadTsm.hpp b/Src/Core/FFmmAlgorithmThreadTsm.hpp
index e89f29e3b2853491d9a12c8e97ffecca9149dd52..b5a575753d215589d1dd3195b6756527d8c7825b 100644
--- a/Src/Core/FFmmAlgorithmThreadTsm.hpp
+++ b/Src/Core/FFmmAlgorithmThreadTsm.hpp
@@ -40,13 +40,12 @@ class FFmmAlgorithmThreadTsm : protected FAssertable{
     typedef KernelClass<ParticleClass, CellClass, OctreeHeight> Kernel;
 
     Octree* const tree;                  //< The octree to work on
-    Kernel* kernels[FThreadNumbers];          //< The kernels
-
-    FDEBUG(FTic counterTime);                //< In case of debug: to count the elapsed time
-    FDEBUG(FTic computationCounter);     //< In case of debug: to  count computation time
+    Kernel** kernels;                    //< The kernels
 
     OctreeIterator* iterArray;
 
+    const int MaxThreads;
+
 public:	
     /** The constructor need the octree and the kernels used for computation
       * @param inTree the octree to work on
@@ -54,12 +53,12 @@ public:
       * An assert is launched if one of the arguments is null
       */
     FFmmAlgorithmThreadTsm(Octree* const inTree, Kernel* const inKernels)
-                      : tree(inTree) , iterArray(0) {
+                      : tree(inTree) , kernels(0), iterArray(0), MaxThreads(omp_get_max_threads()) {
 
         assert(tree, "tree cannot be null", __LINE__, __FILE__);
-        assert(kernels, "kernels cannot be null", __LINE__, __FILE__);
 
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        this->kernels = new Kernel*[MaxThreads];
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             this->kernels[idxThread] = new KernelClass<ParticleClass, CellClass, OctreeHeight>(*inKernels);
         }
 
@@ -68,9 +67,10 @@ public:
 
     /** Default destructor */
     virtual ~FFmmAlgorithmThreadTsm(){
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             delete this->kernels[idxThread];
         }
+        delete [] this->kernels;
     }
 
     /**
@@ -90,7 +90,7 @@ public:
         iterArray = new OctreeIterator[leafs];
         assert(iterArray, "iterArray bad alloc", __LINE__, __FILE__);
 
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             this->kernels[idxThread]->init();
         }
 
@@ -111,7 +111,7 @@ public:
     void bottomPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Bottom Pass\n").write(FDebug::Flush) );
-        FDEBUG( counterTime.tic() );
+        FDEBUG( FTic counterTime );
 
         OctreeIterator octreeIterator(tree);
         int leafs = 0;
@@ -122,8 +122,8 @@ public:
             ++leafs;
         } while(octreeIterator.moveRight());
 
-        FDEBUG(computationCounter.tic());
-        #pragma omp parallel num_threads(FThreadNumbers)
+        FDEBUG(FTic computationCounter);
+        #pragma omp parallel
         {
             Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
             #pragma omp for
@@ -152,8 +152,8 @@ public:
     void upwardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Upward Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
+        FDEBUG(FTic counterTime);
+        FDEBUG(FTic computationCounter);
 
         // Start from leal level - 1
         OctreeIterator octreeIterator(tree);
@@ -173,7 +173,7 @@ public:
             octreeIterator = avoidGotoLeftIterator;// equal octreeIterator.moveUp(); octreeIterator.gotoLeft();
 
             FDEBUG(computationCounter.tic());
-            #pragma omp parallel num_threads(FThreadNumbers)
+            #pragma omp parallel
             {
                 Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                 #pragma omp for
@@ -199,23 +199,23 @@ public:
                 }
             }
             FDEBUG(computationCounter.tac());
-            FDEBUG(totalComputation += computationCounter.elapsed());
         }
 
         FDEBUG( counterTime.tac() );
         FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
+        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
     /** M2L L2L */
     void downardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
 
         { // first M2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+
             OctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
             OctreeIterator avoidGotoLeftIterator(octreeIterator);
@@ -232,7 +232,7 @@ public:
                 octreeIterator = avoidGotoLeftIterator;
 
                 FDEBUG(computationCounter.tic());
-                #pragma omp parallel num_threads(FThreadNumbers)
+                #pragma omp parallel
                 {
                     Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                     CellClass* neighbors[208];
@@ -259,17 +259,16 @@ public:
                     }
                 }
                 FDEBUG(computationCounter.tac());
-                FDEBUG(totalComputation += computationCounter.elapsed());
             }
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         }
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
 
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( totalComputation = 0 );
         { // second L2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+
             OctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
 
@@ -288,7 +287,7 @@ public:
                 octreeIterator = avoidGotoLeftIterator;
 
                 FDEBUG(computationCounter.tic());
-                #pragma omp parallel num_threads(FThreadNumbers)
+                #pragma omp parallel
                 {
                     Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                     #pragma omp for
@@ -308,13 +307,11 @@ public:
                     }
                 }
                 FDEBUG(computationCounter.tac());
-                FDEBUG(totalComputation += computationCounter.elapsed());
             }
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
@@ -322,7 +319,7 @@ public:
     void directPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Direct Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
+        FDEBUG(FTic counterTime);
 
         int leafs = 0;
         {
@@ -336,8 +333,8 @@ public:
         }
 
         const int heightMinusOne = OctreeHeight - 1;
-        FDEBUG(computationCounter.tic());
-        #pragma omp parallel num_threads(FThreadNumbers)
+        FDEBUG(FTic computationCounter);
+        #pragma omp parallel
         {
             Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
             // There is a maximum of 26 neighbors
@@ -354,7 +351,7 @@ public:
         FDEBUG(computationCounter.tac());
 
         FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
         FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
diff --git a/Src/Core/FFmmAlgorithmThreadUs.hpp b/Src/Core/FFmmAlgorithmThreadUs.hpp
index c8bd5a14d4a7834993ea6e8e9d6c2331a83eccd0..5d73dd677e167117fec68a9bd3240014aae6c002 100644
--- a/Src/Core/FFmmAlgorithmThreadUs.hpp
+++ b/Src/Core/FFmmAlgorithmThreadUs.hpp
@@ -40,13 +40,12 @@ class FFmmAlgorithmThreadUs : protected FAssertable{
     typedef KernelClass<ParticleClass, CellClass, OctreeHeight> Kernel;
 
     Octree* const tree;                  //< The octree to work on
-    Kernel* kernels[FThreadNumbers];          //< The kernels
-
-    FDEBUG(FTic counterTime);                //< In case of debug: to count the elapsed time
-    FDEBUG(FTic computationCounter);     //< In case of debug: to  count computation time
+    Kernel** kernels;                    //< The kernels
 
     OctreeIterator* iterArray;
 
+    const int MaxThreads;
+
 public:	
     /** The constructor need the octree and the kernels used for computation
       * @param inTree the octree to work on
@@ -54,12 +53,12 @@ public:
       * An assert is launched if one of the arguments is null
       */
     FFmmAlgorithmThreadUs(Octree* const inTree, Kernel* const inKernels)
-                      : tree(inTree) , iterArray(0) {
+                      : tree(inTree), kernels(0), iterArray(0), MaxThreads(omp_get_max_threads()) {
 
         assert(tree, "tree cannot be null", __LINE__, __FILE__);
-        assert(kernels, "kernels cannot be null", __LINE__, __FILE__);
 
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        this->kernels = new Kernel*[MaxThreads];
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             this->kernels[idxThread] = new KernelClass<ParticleClass, CellClass, OctreeHeight>(*inKernels);
         }
 
@@ -68,9 +67,10 @@ public:
 
     /** Default destructor */
     virtual ~FFmmAlgorithmThreadUs(){
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             delete this->kernels[idxThread];
         }
+        delete [] this->kernels;
     }
 
     /**
@@ -90,7 +90,7 @@ public:
         iterArray = new OctreeIterator[leafs];
         assert(iterArray, "iterArray bad alloc", __LINE__, __FILE__);
 
-        for(int idxThread = 0 ; idxThread < FThreadNumbers ; ++idxThread){
+        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
             this->kernels[idxThread]->init();
         }
 
@@ -111,7 +111,7 @@ public:
     void bottomPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Bottom Pass\n").write(FDebug::Flush) );
-        FDEBUG( counterTime.tic() );
+        FDEBUG(FTic counterTime);
 
         OctreeIterator octreeIterator(tree);
         int leafs = 0;
@@ -122,8 +122,8 @@ public:
             ++leafs;
         } while(octreeIterator.moveRight());
 
-        FDEBUG(computationCounter.tic());
-        #pragma omp parallel num_threads(FThreadNumbers)
+        FDEBUG(FTic computationCounter);
+        #pragma omp parallel
         {
             Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
             #pragma omp for
@@ -135,8 +135,7 @@ public:
         }
         FDEBUG(computationCounter.tac());
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
         FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
@@ -145,8 +144,8 @@ public:
     void upwardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Upward Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
+        FDEBUG(FTic counterTime);
+        FDEBUG(FTic computationCounter);
 
         // Start from leal level - 1
         OctreeIterator octreeIterator(tree);
@@ -166,7 +165,7 @@ public:
             octreeIterator = avoidGotoLeftIterator;// equal octreeIterator.moveUp(); octreeIterator.gotoLeft();
 
             FDEBUG(computationCounter.tic());
-            #pragma omp parallel num_threads(FThreadNumbers)
+            #pragma omp parallel
             {
                 Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                 #pragma omp for
@@ -177,23 +176,22 @@ public:
                 }
             }
             FDEBUG(computationCounter.tac());
-            FDEBUG(totalComputation += computationCounter.elapsed());
         }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
     /** M2L L2L */
     void downardPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( double totalComputation = 0 );
 
         { // first M2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (M2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+
             OctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
             OctreeIterator avoidGotoLeftIterator(octreeIterator);
@@ -210,7 +208,7 @@ public:
                 octreeIterator = avoidGotoLeftIterator;
 
                 FDEBUG(computationCounter.tic());
-                #pragma omp parallel num_threads(FThreadNumbers)
+                #pragma omp parallel
                 {
                     Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                     CellClass* neighbors[208];
@@ -221,17 +219,16 @@ public:
                     }
                 }
                 FDEBUG(computationCounter.tac());
-                FDEBUG(totalComputation += computationCounter.elapsed());
             }
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         }
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
 
-        FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
-        FDEBUG( totalComputation = 0 );
         { // second L2L
+            FDEBUG( FDebug::Controller.write("\tStart Downward Pass (L2L)\n").write(FDebug::Flush); );
+            FDEBUG(FTic counterTime);
+            FDEBUG(FTic computationCounter);
+
             OctreeIterator octreeIterator(tree);
             octreeIterator.moveDown();
 
@@ -250,7 +247,7 @@ public:
                 octreeIterator = avoidGotoLeftIterator;
 
                 FDEBUG(computationCounter.tic());
-                #pragma omp parallel num_threads(FThreadNumbers)
+                #pragma omp parallel
                 {
                     Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
                     #pragma omp for
@@ -259,13 +256,11 @@ public:
                     }
                 }
                 FDEBUG(computationCounter.tac());
-                FDEBUG(totalComputation += computationCounter.elapsed());
             }
+            FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
+            FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         }
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
-        FDEBUG( FDebug::Controller << "\t\t Computation : " << totalComputation << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
@@ -273,7 +268,7 @@ public:
     void directPass(){
         FTRACE( FTrace::Controller.enterFunction(FTrace::FMM, __FUNCTION__ , __FILE__ , __LINE__) );
         FDEBUG( FDebug::Controller.write("\tStart Direct Pass\n").write(FDebug::Flush); );
-        FDEBUG( counterTime.tic() );
+        FDEBUG(FTic counterTime);
 
         int leafs = 0;
         {
@@ -287,8 +282,8 @@ public:
         }
 
         const int heightMinusOne = OctreeHeight - 1;
-        FDEBUG(computationCounter.tic());
-        #pragma omp parallel num_threads(FThreadNumbers)
+        FDEBUG(FTic computationCounter);
+        #pragma omp parallel
         {
             Kernel * const myThreadkernels = kernels[omp_get_thread_num()];
             // There is a maximum of 26 neighbors
@@ -304,8 +299,7 @@ public:
         }
         FDEBUG(computationCounter.tac());
 
-        FDEBUG( counterTime.tac() );
-        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.elapsed() << "s)\n" );
+        FDEBUG( FDebug::Controller << "\tFinished ("  << counterTime.tacAndElapsed() << "s)\n" );
         FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
diff --git a/Src/ScalFmmConfig.h.cmake b/Src/ScalFmmConfig.h.cmake
index b0b9f85ca0ec4efbdb1567fb8fb5d3040458c529..26775a7adc17494cec0caffc6f4fec0166f58ada 100644
--- a/Src/ScalFmmConfig.h.cmake
+++ b/Src/ScalFmmConfig.h.cmake
@@ -1,6 +1,17 @@
 #ifndef SSCALFMMCONFIG_H
 #define SSCALFMMCONFIG_H
 
-#cmakedefine FUSE_MKL_AS_BLAS
+///////////////////////////////////////////////////////
+// Blas
+///////////////////////////////////////////////////////
+
+#cmakedefine SCALFMM_USE_CBLAS
+#cmakedefine SCALFMM_USE_MKL_AS_BLAS
+
+///////////////////////////////////////////////////////
+// MPI
+///////////////////////////////////////////////////////
+
+#cmakedefine SCALFMM_USE_MPI
 
 #endif // CONFIG_H
diff --git a/Src/Utils/FGlobal.hpp b/Src/Utils/FGlobal.hpp
index c170160fa7cb6274d01b9e1beba6e13b7d27642f..aa8397c46795ba2262d31068d585e4a00dac6786 100644
--- a/Src/Utils/FGlobal.hpp
+++ b/Src/Utils/FGlobal.hpp
@@ -27,18 +27,6 @@
 // Uncomment the next line to use trace mode
 //#define SCALFMM_USE_TRACE
 
-///////////////////////////////////////////////////////
-// MPI
-///////////////////////////////////////////////////////
-
-#define SCALFMM_USE_MPI
-
-///////////////////////////////////////////////////////
-// Threads
-///////////////////////////////////////////////////////
-
-static const int FThreadNumbers = 1;
-
 ///////////////////////////////////////////////////////
 // Types
 ///////////////////////////////////////////////////////
diff --git a/Src/Utils/FTic.hpp b/Src/Utils/FTic.hpp
index 51fe87db19fb833cfdb56e3e7175e71157edde1f..2a990aba46e6a26eb13b4b953867dd657c4c389d 100644
--- a/Src/Utils/FTic.hpp
+++ b/Src/Utils/FTic.hpp
@@ -29,12 +29,14 @@
   */
 class FTic {
 private:
-    double start;   //< start time (tic)
-    double end;     //< stop time (tac)
+    double start;       //< start time (tic)
+    double end;         //< stop time (tac)
+    double cumulate;    //< the cumulate time
 
 public:
     /** Constructor */
-    FTic() : start(0.0), end(0.0) {
+    FTic() : start(0.0), end(0.0), cumulate(0.0) {
+        tic();
     }
 
     /** Tic : start <= current time */
@@ -45,6 +47,7 @@ public:
     /** Tac : end <= current time */
     void tac(){
         this->end = FTic::GetTime();
+        cumulate += elapsed();
     }
 
     /** Return end - start
@@ -53,6 +56,19 @@ public:
         return this->end - this->start;
     }
 
+    /** Return the cumulated time
+      * @return the sum, in seconds, of the elapsed times added by every call to tac() */
+    double cumulated() const{
+        return cumulate;
+    }
+
+    /** Call tac(), then return end - start (the tac() also adds to the cumulated time)
+      * @return the time elapsed between the last tic and this call, in seconds */
+    double tacAndElapsed() {
+        tac();
+        return elapsed();
+    }
+
     /** Global get time
       * @return a global time
       * GetTickCount on windows
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index 3fa605710327cbef8405e48aa2ab8dbf16d8b3f4..d4d6aaaf75b03327bc505fc45f5c3b216bc4e241 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -25,7 +25,7 @@ file(
 
 # Adding the project sources dir as an include dir
 INCLUDE_DIRECTORIES(
-     ${CMAKE_BINARY_DIR}/Sources 
+     ${CMAKE_BINARY_DIR}/Src 
 )
 
 # Add execs - 1 cpp = 1 exec
diff --git a/Tests/testTic.cpp b/Tests/testTic.cpp
index 872876b0afb4791517ae780679d9dd9a26587c6e..da77a545d2c80deda3c22837e8f70aab79bebd76 100644
--- a/Tests/testTic.cpp
+++ b/Tests/testTic.cpp
@@ -15,16 +15,31 @@ int main(){
     std::cout << ">> It is only interesting to wath the code to understand\n";
     std::cout << ">> how to use FTic time counter.\n";
     //////////////////////////////////////////////////////////////
-
+    {
 	FTic counter;	
-
 	counter.tic();
 	usleep(1500000);
 	//Sleep(1500); //on windows
 	counter.tac();
-
 	std::cout << counter.elapsed() << " (s)\n";
-
-	return 0;
+    }
+    {
+        FTic counter;
+        usleep(1500000);
+        //Sleep(1500); //on windows
+        std::cout << counter.tacAndElapsed() << " (s)\n";
+    }
+    {
+        FTic counter;
+        usleep(1500000);
+        //Sleep(1500); //on windows
+        counter.tac();
+        counter.tic();
+        usleep(1500000);
+        //Sleep(1500); //on windows
+        std::cout << counter.tacAndElapsed() << " (s)\n";
+        std::cout << counter.cumulated() << " (s)\n";
+    }
+    return 0;
 }
 
diff --git a/UTests/CMakeLists.txt b/UTests/CMakeLists.txt
index f705b02065cd3123c4145199faea3a4559328801..41deb71ffefe66b8486d09deb2e87656a3ca9df9 100644
--- a/UTests/CMakeLists.txt
+++ b/UTests/CMakeLists.txt
@@ -22,7 +22,7 @@ file(
 
 # Adding the project sources dir as an include dir
 INCLUDE_DIRECTORIES(
-     ${CMAKE_BINARY_DIR}/Sources 
+     ${CMAKE_BINARY_DIR}/Src 
 )
 
 # Add execs - 1 cpp = 1 exec