diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe411a702260b870c5d5e0ca8acbef65a400a54e..db2983fd84758c9add26c19804f1c7df89be10e7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,6 +42,7 @@ MESSAGE(STATUS " CXX  ${CMAKE_CXX_COMPILER_ID}" )
 #
 # Options
 OPTION( ScalFMM_USE_BLAS 		"Set to ON to build ScaFMM with BLAS" OFF )
+OPTION( ScalFMM_USE_FFTW 		"Set to ON to build ScaFMM with FFTW" OFF )
 OPTION( ScalFMM_USE_TRACE 		"Set to ON to print trace or use itac trace" OFF )
 OPTION( ScalFMM_BUILD_TESTS 		"Set to ON to build fonctionnalities Tests" OFF )
 OPTION( ScalFMM_BUILD_UTESTS 		"Set to ON to build UTests" OFF )
@@ -138,7 +139,8 @@ endif()
 if( ScalFMM_USE_BLAS )
   OPTION( ScalFMM_USE_MKL_AS_BLAS "Set to ON to use MKL CBLAS" OFF )
   if( ScalFMM_USE_MKL_AS_BLAS )
-     SET(BLAS_LIBRARIES "-L$ENV{MKLROOT}/lib;-lmkl_intel_lp64;-lmkl_sequential;-lmkl_core" CACHE STRING "Set your MKL flags")
+     SET(BLAS_LIBRARIES
+  "-L$ENV{MKLROOT}/lib;-lmkl_intel_lp64;-lmkl_sequential;-lmkl_core" CACHE STRING "Set your MKL flags")
      SET(LAPACK_LIBRARIES "")
   elseif(ScalFMM_USE_EXTERNAL_BLAS) 
          MESSAGE(STATUS "BLAS SET BY EXTERNAL PROGRAM = ${BLAS_LIBRARIES}")
@@ -150,6 +152,18 @@ if( ScalFMM_USE_BLAS )
   MESSAGE(STATUS "SCALFMM_LIBRARIES          = ${SCALFMM_LIBRARIES}")
 endif(ScalFMM_USE_BLAS)
 
+# FFTW option
+if( ScalFMM_USE_FFTW )
+  OPTION( ScalFMM_USE_MKL_AS_FFTW "Set to ON to use MKL FFTW" ON )
+  if( ScalFMM_USE_MKL_AS_FFTW )
+    SET(FFTW_LIBRARIES "-I$ENV{MKLROOT}/include/fftw; -L$ENV{MKLROOT}/lib; -lmkl_intel_lp64; -lmkl_sequential; -lmkl_core; -lpthread; -lm" CACHE STRING "Set your MKL flags")
+  else()
+    SET(FFTW_LIBRARIES "-lfftw3" CACHE STRING "Use LIBFFTW")
+  endif()
+  SET(SCALFMM_LIBRARIES "${SCALFMM_LIBRARIES}; ${FFTW_LIBRARIES}")
+  MESSAGE(STATUS "SCALFMM_LIBRARIES          = ${SCALFMM_LIBRARIES}")
+endif(ScalFMM_USE_FFTW)
+
 # Compile option
 #ADD_DEFINITIONS(-Wall -Wshadow -Wpointer-arith -Wcast-qual -Wconversion -fpic )
 # 
diff --git a/Src/Kernels/Chebyshev/FAbstractChebKernel.hpp b/Src/Kernels/Chebyshev/FAbstractChebKernel.hpp
index c65e4a598d7fdde90ad9a09456c723d7fd587ea9..ecf9ba5d87aa7d1641c43ead36e057ca8ecfc3bf 100755
--- a/Src/Kernels/Chebyshev/FAbstractChebKernel.hpp
+++ b/Src/Kernels/Chebyshev/FAbstractChebKernel.hpp
@@ -22,7 +22,7 @@
 
 #include "../../Components/FAbstractKernels.hpp"
 
-#include "FChebP2PKernels.hpp"
+#include "../Interpolation/FInterpP2PKernels.hpp"
 #include "./FChebInterpolator.hpp"
 
 #include "../../Containers/FTreeCoordinate.hpp"
diff --git a/Src/Kernels/Chebyshev/FChebInterpolator.hpp b/Src/Kernels/Chebyshev/FChebInterpolator.hpp
index ff3133670738108e6ead437f52e3a9c7f4ec302d..71c1b2263c40b9cb8cb2a6541874f9333b15216b 100755
--- a/Src/Kernels/Chebyshev/FChebInterpolator.hpp
+++ b/Src/Kernels/Chebyshev/FChebInterpolator.hpp
@@ -17,7 +17,7 @@
 #define FCHEBINTERPOLATOR_HPP
 
 
-#include "./FChebMapping.hpp"
+#include "./../Interpolation/FInterpMapping.hpp"
 #include "./FChebTensor.hpp"
 #include "./FChebRoots.hpp"
 
@@ -296,7 +296,7 @@ class FChebInterpolator : FNoCopyable
 
             // set child info
             FChebTensor<ORDER>::setRelativeChildCenter(child, ChildCenter);
-            FChebTensor<ORDER>::setChebyshevRoots(ChildCenter, ChildWidth, ChildCoords);
+            FChebTensor<ORDER>::setPolynomialsRoots(ChildCenter, ChildWidth, ChildCoords);
 
             // allocate memory
             ChildParentInterpolator[child] = new FReal [3 * ORDER*ORDER];
diff --git a/Src/Kernels/Chebyshev/FChebKernel.hpp b/Src/Kernels/Chebyshev/FChebKernel.hpp
index c064d568d882628154dd3862d7327b5e386abc15..2c169262dccee226664a81bf6f2d7846239669d8 100755
--- a/Src/Kernels/Chebyshev/FChebKernel.hpp
+++ b/Src/Kernels/Chebyshev/FChebKernel.hpp
@@ -70,8 +70,8 @@ public:
 			M2LHandler(new M2LHandlerClass(Epsilon))
 	{
 		// read precomputed compressed m2l operators from binary file
-		M2LHandler->ReadFromBinaryFileAndSet();
-		//M2LHandler->ComputeAndCompressAndSet();
+		//M2LHandler->ReadFromBinaryFileAndSet();
+		M2LHandler->ComputeAndCompressAndSet();
 	}
 
 
@@ -209,14 +209,14 @@ public:
                      ContainerClass* const NeighborSourceParticles[27],
                      const int /* size */)
     {
-        DirectInteactionComputer<MatrixKernelClass::Identifier, NVALS>::P2P(TargetParticles,NeighborSourceParticles);
+        DirectInteractionComputer<MatrixKernelClass::Identifier, NVALS>::P2P(TargetParticles,NeighborSourceParticles);
     }
 
 
     void P2PRemote(const FTreeCoordinate& /*inPosition*/,
                    ContainerClass* const FRestrict inTargets, const ContainerClass* const FRestrict /*inSources*/,
                    ContainerClass* const inNeighbors[27], const int /*inSize*/){
-        DirectInteactionComputer<MatrixKernelClass::Identifier, NVALS>::P2PRemote(inTargets,inNeighbors,27);
+        DirectInteractionComputer<MatrixKernelClass::Identifier, NVALS>::P2PRemote(inTargets,inNeighbors,27);
     }
 
 };
diff --git a/Src/Kernels/Chebyshev/FChebM2LHandler.hpp b/Src/Kernels/Chebyshev/FChebM2LHandler.hpp
index fe33b955491ab569a26736370ca3ed54478060f2..4c84f77d0b09f69896a7885dc40266fbbd488a3c 100755
--- a/Src/Kernels/Chebyshev/FChebM2LHandler.hpp
+++ b/Src/Kernels/Chebyshev/FChebM2LHandler.hpp
@@ -99,8 +99,12 @@ public:
 		// check if aready set
 		if (U||C||B) throw std::runtime_error("Compressed M2L operator already set");
 		rank = ComputeAndCompress(epsilon, U, C, B);
+
+    unsigned long sizeM2L = 343*rank*rank*sizeof(FReal);
+
+
 		// write info
-		std::cout << "Compressed and set M2L operators (" << rank << ") in "
+		std::cout << "Compressed and set M2L operators (" << long(sizeM2L) << " B) in "
 							<< time.tacAndElapsed() << "sec."	<< std::endl;
 	}
 
diff --git a/Src/Kernels/Chebyshev/FChebMatrixKernel.hpp b/Src/Kernels/Chebyshev/FChebMatrixKernel.hpp
deleted file mode 100755
index bbbbc5d3a49982e6ddff5c6602f04d473d385191..0000000000000000000000000000000000000000
--- a/Src/Kernels/Chebyshev/FChebMatrixKernel.hpp
+++ /dev/null
@@ -1,215 +0,0 @@
-// ===================================================================================
-// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
-// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
-// This software is a computer program whose purpose is to compute the FMM.
-//
-// This software is governed by the CeCILL-C and LGPL licenses and
-// abiding by the rules of distribution of free software.  
-// 
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public and CeCILL-C Licenses for more details.
-// "http://www.cecill.info". 
-// "http://www.gnu.org/licenses".
-// ===================================================================================
-#ifndef FCHEBMATRIXKERNEL_HPP
-#define FCHEBMATRIXKERNEL_HPP
-
-#include "../../Utils/FPoint.hpp"
-#include "../../Utils/FNoCopyable.hpp"
-#include "../../Utils/FMath.hpp"
-#include "../../Utils/FBlas.hpp"
-
-
-// extendable
-enum KERNEL_FUNCCTION_IDENTIFIER {ONE_OVER_R,
-																	ONE_OVER_R_SQUARED,
-																	LEONARD_JONES_POTENTIAL};
-
-// probably not extedable :)
-enum KERNEL_FUNCTION_TYPE {HOMOGENEOUS, NON_HOMOGENEOUS};
-
-
-/**
- * @author Matthias Messner (matthias.messner@inria.fr)
- * @class FChebMatrixKernels
- * Please read the license
- */
-struct FChebAbstractMatrixKernel : FNoCopyable
-{
-    virtual ~FChebAbstractMatrixKernel(){} // to remove warning
-	virtual FReal evaluate(const FPoint&, const FPoint&) const = 0;
-	// I need both functions because required arguments are not always given
-	virtual FReal getScaleFactor(const FReal, const int) const = 0;
-	virtual FReal getScaleFactor(const FReal) const = 0;
-};
-
-
-
-/// One over r
-struct FChebMatrixKernelR : FChebAbstractMatrixKernel
-{
-	static const KERNEL_FUNCTION_TYPE Type = HOMOGENEOUS;
-	static const KERNEL_FUNCCTION_IDENTIFIER Identifier = ONE_OVER_R;
-
-	FChebMatrixKernelR() {}
-
-	FReal evaluate(const FPoint& x, const FPoint& y) const
-	{
-		const FPoint xy(x-y);
-		return FReal(1.) / FMath::Sqrt(xy.getX()*xy.getX() +
-																	 xy.getY()*xy.getY() +
-																	 xy.getZ()*xy.getZ());
-	}
-
-	FReal getScaleFactor(const FReal RootCellWidth, const int TreeLevel) const
-	{
-		const FReal CellWidth(RootCellWidth / FReal(FMath::pow(2, TreeLevel)));
-		return getScaleFactor(CellWidth);
-	}
-
-	FReal getScaleFactor(const FReal CellWidth) const
-	{
-		return FReal(2.) / CellWidth;
-	}
-};
-
-
-
-/// One over r^2
-struct FChebMatrixKernelRR : FChebAbstractMatrixKernel
-{
-	static const KERNEL_FUNCTION_TYPE Type = HOMOGENEOUS;
-	static const KERNEL_FUNCCTION_IDENTIFIER Identifier = ONE_OVER_R_SQUARED;
-
-	FChebMatrixKernelRR() {}
-
-	FReal evaluate(const FPoint& x, const FPoint& y) const
-	{
-		const FPoint xy(x-y);
-		return FReal(1.) / FReal(xy.getX()*xy.getX() +
-														 xy.getY()*xy.getY() +
-														 xy.getZ()*xy.getZ());
-	}
-
-	FReal getScaleFactor(const FReal RootCellWidth, const int TreeLevel) const
-	{
-		const FReal CellWidth(RootCellWidth / FReal(FMath::pow(2, TreeLevel)));
-		return getScaleFactor(CellWidth);
-	}
-
-	FReal getScaleFactor(const FReal CellWidth) const
-	{
-		return FReal(4.) / CellWidth;
-	}
-};
-
-
-
-/// One over r^12 - One over r^6
-struct FChebMatrixKernelLJ : FChebAbstractMatrixKernel
-{
-	static const KERNEL_FUNCTION_TYPE Type = NON_HOMOGENEOUS;
-	static const KERNEL_FUNCCTION_IDENTIFIER Identifier = LEONARD_JONES_POTENTIAL;
-
-	FChebMatrixKernelLJ() {}
-
-	FReal evaluate(const FPoint& x, const FPoint& y) const
-	{
-		const FPoint xy(x-y);
-		const FReal r = xy.norm();
-		const FReal r3 = r*r*r;
-		const FReal one_over_r6 = FReal(1.) / (r3*r3);
-		//return one_over_r6 * one_over_r6;
-		//return one_over_r6;
-		return one_over_r6 * one_over_r6 - one_over_r6;
-	}
-
-	FReal getScaleFactor(const FReal, const int) const
-	{
-		// return 1 because non homogeneous kernel functions cannot be scaled!!!
-		return FReal(1.);
-	}
-
-	FReal getScaleFactor(const FReal) const
-	{
-		// return 1 because non homogeneous kernel functions cannot be scaled!!!
-		return FReal(1.);
-	}
-};
-
-
-
-
-
-/*!  Functor which provides the interface to assemble a matrix based on the
-  number of rows and cols and on the coordinates x and y and the type of the
-  generating matrix-kernel function.
- */
-template <typename MatrixKernel>
-class EntryComputer
-{
-	const MatrixKernel Kernel;
-
-	const unsigned int nx, ny;
-	const FPoint *const px, *const py;
-
-	const FReal *const weights;
-
-public:
-	explicit EntryComputer(const unsigned int _nx, const FPoint *const _px,
-												 const unsigned int _ny, const FPoint *const _py,
-												 const FReal *const _weights = NULL)
-		: Kernel(),	nx(_nx), ny(_ny), px(_px), py(_py), weights(_weights) {}
-
-//	template <typename Point>
-//	void operator()(const unsigned int nx, const Point *const px,
-//									const unsigned int ny, const Point *const py,
-//									FReal *const data) const
-//	{
-//		for (unsigned int j=0; j<ny; ++j)
-//			for (unsigned int i=0; i<nx; ++i)
-//				data[j*nx + i] = Kernel.evaluate(px[i], py[j]);
-//	}
-
-	void operator()(const unsigned int xbeg, const unsigned int xend,
-									const unsigned int ybeg, const unsigned int yend,
-									FReal *const data) const
-	{
-		unsigned int idx = 0;
-		if (weights) {
-			for (unsigned int j=ybeg; j<yend; ++j)
-				for (unsigned int i=xbeg; i<xend; ++i)
-					data[idx++] = weights[i] * weights[j] * Kernel.evaluate(px[i], py[j]);
-		} else {
-			for (unsigned int j=ybeg; j<yend; ++j)
-				for (unsigned int i=xbeg; i<xend; ++i)
-					data[idx++] = Kernel.evaluate(px[i], py[j]);
-		}
-
-		/*
-		// apply weighting matrices
-		if (weights) {
-			if ((xend-xbeg) == (yend-ybeg) && (xend-xbeg) == nx)
-				for (unsigned int n=0; n<nx; ++n) {
-					FBlas::scal(nx, weights[n], data + n,  nx); // scale rows
-					FBlas::scal(nx, weights[n], data + n * nx); // scale cols
-				}
-			else if ((xend-xbeg) == 1 && (yend-ybeg) == ny)
-				for (unsigned int j=0; j<ny; ++j)	data[j] *= weights[j];
-			else if ((yend-ybeg) == 1 && (xend-xbeg) == nx)
-				for (unsigned int i=0; i<nx; ++i)	data[i] *= weights[i];
-		}
-		*/
-
-	}
-};
-
-
-
-
-
-#endif // FCHEBMATRIXKERNEL_HPP
-
-// [--END--]
diff --git a/Src/Kernels/Chebyshev/FChebP2PKernels.hpp b/Src/Kernels/Chebyshev/FChebP2PKernels.hpp
deleted file mode 100644
index f54b41c669e098477460e2388b83d6ad6a47c340..0000000000000000000000000000000000000000
--- a/Src/Kernels/Chebyshev/FChebP2PKernels.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-#ifndef FCHEBP2PKERNELS_HPP
-#define FCHEBP2PKERNELS_HPP
-
-
-#include "../P2P/FP2P.hpp"
-
-template <KERNEL_FUNCCTION_IDENTIFIER Identifier, int NVALS>
-struct DirectInteactionComputer;
-
-///////////////////////////////////////////////////////
-// P2P Wrappers
-///////////////////////////////////////////////////////
-
-/*! Specialization for Laplace potential */
-template <>
-struct DirectInteactionComputer<ONE_OVER_R, 1>
-{
-    template <typename ContainerClass>
-    static void P2P(		 ContainerClass* const FRestrict TargetParticles,
-                     ContainerClass* const NeighborSourceParticles[27]){
-        FP2P::FullMutual(TargetParticles,NeighborSourceParticles,14);
-    }
-
-    template <typename ContainerClass>
-    static void P2PRemote( ContainerClass* const FRestrict inTargets,
-                           ContainerClass* const inNeighbors[27],
-                           const int inSize){
-        FP2P::FullRemote(inTargets,inNeighbors,inSize);
-    }
-};
-
-
-/*! Specialization for Leonard-Jones potential */
-template <>
-struct DirectInteactionComputer<LEONARD_JONES_POTENTIAL, 1>
-{
-    template <typename ContainerClass>
-    static void P2P(		 ContainerClass* const FRestrict TargetParticles,
-                     ContainerClass* const NeighborSourceParticles[27]){
-        FP2P::FullMutualLJ(TargetParticles,NeighborSourceParticles,14);
-    }
-
-    template <typename ContainerClass>
-    static void P2PRemote( ContainerClass* const FRestrict inTargets,
-                           ContainerClass* const inNeighbors[27],
-                           const int inSize){
-        FP2P::FullRemoteLJ(inTargets,inNeighbors,inSize);
-    }
-};
-
-///////////////////////////////////////////////////////
-// In case of multi right hand side
-///////////////////////////////////////////////////////
-
-
-
-template <int NVALS>
-struct DirectInteactionComputer<ONE_OVER_R, NVALS>
-{
-    template <typename ContainerClass>
-    static void P2P(		 ContainerClass* const FRestrict TargetParticles,
-                     ContainerClass* const NeighborSourceParticles[27]){
-        for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
-            FP2P::FullMutual(TargetParticles,NeighborSourceParticles,14);
-        }
-    }
-
-    template <typename ContainerClass>
-    static void P2PRemote( ContainerClass* const FRestrict inTargets,
-                           ContainerClass* const inNeighbors[27],
-                           const int inSize){
-        for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
-            FP2P::FullRemote(inTargets,inNeighbors,inSize);
-        }
-    }
-};
-
-
-/*! Specialization for Leonard-Jones potential */
-template <int NVALS>
-struct DirectInteactionComputer<LEONARD_JONES_POTENTIAL, NVALS>
-{
-    template <typename ContainerClass>
-    static void P2P(		 ContainerClass* const FRestrict TargetParticles,
-                     ContainerClass* const NeighborSourceParticles[27]){
-        for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
-            FP2P::FullMutualLJ(TargetParticles,NeighborSourceParticles,14);
-        }
-    }
-
-    template <typename ContainerClass>
-    static void P2PRemote( ContainerClass* const FRestrict inTargets,
-                           ContainerClass* const inNeighbors[27],
-                           const int inSize){
-        for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
-            FP2P::FullRemoteLJ(inTargets,inNeighbors,inSize);
-        }
-    }
-};
-
-#endif // FCHEBP2PKERNELS_HPP
diff --git a/Src/Kernels/Chebyshev/FChebSymKernel.hpp b/Src/Kernels/Chebyshev/FChebSymKernel.hpp
index b54556a86e095806d7084380a59a5e63671f0a24..9143c569d3df31db88f6bedca3e98c498b4db249 100755
--- a/Src/Kernels/Chebyshev/FChebSymKernel.hpp
+++ b/Src/Kernels/Chebyshev/FChebSymKernel.hpp
@@ -454,14 +454,14 @@ public:
                      ContainerClass* const NeighborSourceParticles[27],
                      const int /* size */)
     {
-        DirectInteactionComputer<MatrixKernelClass::Identifier, NVALS>::P2P(TargetParticles,NeighborSourceParticles);
+        DirectInteractionComputer<MatrixKernelClass::Identifier, NVALS>::P2P(TargetParticles,NeighborSourceParticles);
     }
 
 
     void P2PRemote(const FTreeCoordinate& /*inPosition*/,
                    ContainerClass* const FRestrict inTargets, const ContainerClass* const FRestrict /*inSources*/,
                    ContainerClass* const inNeighbors[27], const int /*inSize*/){
-        DirectInteactionComputer<MatrixKernelClass::Identifier, NVALS>::P2PRemote(inTargets,inNeighbors,27);
+        DirectInteractionComputer<MatrixKernelClass::Identifier, NVALS>::P2PRemote(inTargets,inNeighbors,27);
     }
 
 };
diff --git a/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp b/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
index c3595be8b987bfb8814a9167e7cefdb75abc85f1..bd205555f55635dd4e85c142cdd4bb9667d1b36a 100755
--- a/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
+++ b/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
@@ -479,9 +479,12 @@ static void precompute(const MatrixKernelClass *const MatrixKernel, const FReal
 			}
 		}
 	}
-	//	std::cout << "The approximation of the " << counter
-	//					<< " far-field interactions (overall rank " << overall_rank << ") took "
-	//					<< overall_time << "s\n" << std::endl;
+		std::cout << "The approximation of the " << counter
+              << " far-field interactions (overall rank " << overall_rank 
+              << " / " << 16*nnodes 
+              << " , sizeM2L= " << 2*overall_rank*nnodes*sizeof(FReal) << ""
+              << " / " << 16*nnodes*nnodes*sizeof(FReal) << " B"
+              << ") took " << overall_time << "s\n" << std::endl;
 	delete [] U;
 	delete [] WORK;
 	delete [] VT;
diff --git a/Src/Kernels/Chebyshev/FChebTensor.hpp b/Src/Kernels/Chebyshev/FChebTensor.hpp
index 37af0d4c520e84bc90f778b79dae89f1eafe743b..627f09c10ef5f923b069e027875290d0b5820527 100755
--- a/Src/Kernels/Chebyshev/FChebTensor.hpp
+++ b/Src/Kernels/Chebyshev/FChebTensor.hpp
@@ -4,13 +4,13 @@
 // This software is a computer program whose purpose is to compute the FMM.
 //
 // This software is governed by the CeCILL-C and LGPL licenses and
-// abiding by the rules of distribution of free software.  
-// 
+// abiding by the rules of distribution of free software.
+//
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public and CeCILL-C Licenses for more details.
-// "http://www.cecill.info". 
+// "http://www.cecill.info".
 // "http://www.gnu.org/licenses".
 // ===================================================================================
 #ifndef FCHEBTENSOR_HPP
@@ -19,7 +19,7 @@
 #include "../../Utils/FMath.hpp"
 
 #include "./FChebRoots.hpp"
-#include "./FChebMapping.hpp"
+#include "./../Interpolation/FInterpTensor.hpp"
 
 
 /**
@@ -27,19 +27,6 @@
  * Please read the license
  */
 
-/**
- * @class TensorTraits
- *
- * The class @p TensorTraits gives the number of interpolation nodes per
- * cluster in 3D, depending on the interpolation order.
- *
- * @tparam ORDER interpolation order
- */
-template <int ORDER> struct TensorTraits
-{
-	enum {nnodes = ORDER*ORDER*ORDER};
-};
-
 
 
 /**
@@ -51,123 +38,37 @@ template <int ORDER> struct TensorTraits
  * @tparam ORDER interpolation order \f$\ell\f$
  */
 template <int ORDER>
-class FChebTensor : FNoCopyable
+class FChebTensor : public FInterpTensor<ORDER,FChebRoots<ORDER>>
 {
-	enum {nnodes = TensorTraits<ORDER>::nnodes};
-  typedef FChebRoots<ORDER> BasisType;
-	
-public:
-	/**
-	 * Sets the ids of the coordinates of all \f$\ell^3\f$ interpolation
-	 * nodes
-	 *
-	 * @param[out] NodeIds ids of coordinates of interpolation nodes
-	 */
-	static
-	void setNodeIds(unsigned int NodeIds[nnodes][3])
-	{
-		for (unsigned int n=0; n<nnodes; ++n) {
-			NodeIds[n][0] =  n         % ORDER;
-			NodeIds[n][1] = (n/ ORDER) % ORDER;
-			NodeIds[n][2] =  n/(ORDER  * ORDER);
-		}
-	}
-
-
-	/**
-	 * Sets the roots of the Chebyshev quadrature weights defined as \f$w_i =
-	 * \frac{\pi}{\ell}\sqrt{1-\bar x_i^2}\f$ with the Chebyshev roots \f$\bar
-	 * x\f$.
-	 *
-	 * @param weights[out] the root of the weights \f$\sqrt{w_i}\f$
-	 */
-	static
-	void setRootOfWeights(FReal weights[nnodes])
-	{
-		// weights in 1d
-		FReal weights_1d[ORDER];
-		for (unsigned int o=0; o<ORDER; ++o)
-			weights_1d[o] = FMath::FPi/ORDER * FMath::Sqrt(FReal(1.)-FReal(BasisType::roots[o])*FReal(BasisType::roots[o]));
-		// weights in 3d (tensor structure)
-		unsigned int node_ids[nnodes][3];
-		setNodeIds(node_ids);
-		for (unsigned int n=0; n<nnodes; ++n) {
-			weights[n] = FMath::Sqrt(weights_1d[node_ids[n][0]]*weights_1d[node_ids[n][1]]*weights_1d[node_ids[n][2]]);
-		}
-	}
-
-
-	/**
-	 * Sets the interpolation points in the cluster with @p center and @p width
-	 *
-	 * @param[in] center of cluster
-	 * @param[in] width of cluster
-	 * @param[out] rootPositions coordinates of interpolation points
-	 */
-	static
-	void setRoots(const FPoint& center, const FReal width, FPoint rootPositions[nnodes])
-	{
-		unsigned int node_ids[nnodes][3];
-		setNodeIds(node_ids);
-		const map_loc_glob map(center, width);
-		FPoint localPosition;
+  enum {nnodes = TensorTraits<ORDER>::nnodes};
+  typedef FChebRoots<ORDER> BasisType; 
+  typedef FInterpTensor<ORDER,BasisType> ParentTensor;
+
+ public:
+
+  /**
+   * Sets the roots of the Chebyshev quadrature weights defined as \f$w_i =
+   * \frac{\pi}{\ell}\sqrt{1-\bar x_i^2}\f$ with the Chebyshev roots \f$\bar
+   * x\f$.
+   *
+   * @param weights[out] the root of the weights \f$\sqrt{w_i}\f$
+   */
+  static
+    void setRootOfWeights(FReal weights[nnodes])
+  {
+    // weights in 1d
+    FReal weights_1d[ORDER];
+    for (unsigned int o=0; o<ORDER; ++o)
+      weights_1d[o] = FMath::FPi/ORDER * FMath::Sqrt(FReal(1.)-FReal(BasisType::roots[o])*FReal(BasisType::roots[o]));
+    // weights in 3d (tensor structure)
+    unsigned int node_ids[nnodes][3];
+    ParentTensor::setNodeIds(node_ids);
     for (unsigned int n=0; n<nnodes; ++n) {
-			localPosition.setX(FReal(BasisType::roots[node_ids[n][0]]));
-			localPosition.setY(FReal(BasisType::roots[node_ids[n][1]]));
-			localPosition.setZ(FReal(BasisType::roots[node_ids[n][2]]));
-			map(localPosition, rootPositions[n]);
-		}
-	}
+      weights[n] = FMath::Sqrt(weights_1d[node_ids[n][0]]*weights_1d[node_ids[n][1]]*weights_1d[node_ids[n][2]]);
+    }
+  }
 
-	/**
-	 * Sets the Chebyshev roots in the cluster with @p center and @p width
-	 *
-	 * @param[in] center of cluster
-	 * @param[in] width of cluster
-	 * @param[out] roots coordinates of Chebyshev roots
-	 */
-	static
-	void setChebyshevRoots(const FPoint& center, const FReal width, FReal roots[3][ORDER])
-	{
-		const map_loc_glob map(center, width);
-		FPoint lPos, gPos;
-    for (unsigned int n=0; n<ORDER; ++n) {
-			lPos.setX(FReal(BasisType::roots[n]));
-			lPos.setY(FReal(BasisType::roots[n]));
-			lPos.setZ(FReal(BasisType::roots[n]));
-			map(lPos, gPos);
-			roots[0][n] = gPos.getX();
-			roots[1][n] = gPos.getY();
-			roots[2][n] = gPos.getZ();
-		}
-	}
-
-	/**
-	 * Set the relative child (width = 1) center according to the Morton index.
-	 *
-	 * @param[in] ChildIndex index of child according to Morton index
-	 * @param[out] center
-	 */
-	static
-	void setRelativeChildCenter(const unsigned int ChildIndex,
-															FPoint& ChildCenter)
-	{
-		const int RelativeChildPositions[][3] = { {-1, -1, -1},
-																							{-1, -1,  1},
-																							{-1,  1, -1},
-																							{-1,  1,  1},
-																							{ 1, -1, -1},
-																							{ 1, -1,  1},
-																							{ 1,  1, -1},
-																							{ 1,  1,  1} };
-		ChildCenter.setX(FReal(RelativeChildPositions[ChildIndex][0]) / FReal(2.));
-		ChildCenter.setY(FReal(RelativeChildPositions[ChildIndex][1]) / FReal(2.));
-		ChildCenter.setZ(FReal(RelativeChildPositions[ChildIndex][2]) / FReal(2.));
-	}
 };
 
 
-
-
-
 #endif
diff --git a/Src/Kernels/Chebyshev/FChebMapping.hpp b/Src/Kernels/Interpolation/FInterpMapping.hpp
similarity index 91%
rename from Src/Kernels/Chebyshev/FChebMapping.hpp
rename to Src/Kernels/Interpolation/FInterpMapping.hpp
index 43b13ff4b53625c0e48e35915ba4849905eb610f..21a4b6b25302636c0dfa9737ee9401a73ad8ea42 100755
--- a/Src/Kernels/Chebyshev/FChebMapping.hpp
+++ b/Src/Kernels/Interpolation/FInterpMapping.hpp
@@ -13,8 +13,8 @@
 // "http://www.cecill.info". 
 // "http://www.gnu.org/licenses".
 // ===================================================================================
-#ifndef FCHEBMAPPING_HPP
-#define FCHEBMAPPING_HPP
+#ifndef FINTERPMAPPING_HPP
+#define FINTERPMAPPING_HPP
 
 #include <limits>
 
@@ -27,19 +27,19 @@
  */
 
 /**
- * @class FChebMapping
+ * @class FInterpMapping
  *
- * The class @p FChebMapping is the base class for the affine mapping
+ * The class @p FInterpMapping is the base class for the affine mapping
  * \f$\Phi:[-1,1]\rightarrow[a,b] \f$ and the inverse affine mapping
  * \f$\Phi^{-1}:[a,b]\rightarrow[-1,1]\f$.
  */
-class FChebMapping : FNoCopyable
+class FInterpMapping : FNoCopyable
 {
 protected:
   FPoint a;
   FPoint b;
  
-  explicit FChebMapping(const FPoint& center,
+  explicit FInterpMapping(const FPoint& center,
 											 const FReal width)
 		: a(center.getX() - width / FReal(2.),
 				center.getY() - width / FReal(2.),
@@ -94,11 +94,11 @@ public:
  * \f$\Phi^{-1}:[a,b]\rightarrow[-1,1]\f$. It maps from global coordinates to
  * local ones.
  */
-class map_glob_loc : public FChebMapping
+class map_glob_loc : public FInterpMapping
 {
 public:
   explicit map_glob_loc(const FPoint& center, const FReal width)
-		: FChebMapping(center, width) {}
+		: FInterpMapping(center, width) {}
   
 	/**
 	 * Maps from a global position to its local position: \f$\Phi^{-1}(x) =
@@ -130,11 +130,11 @@ public:
  * This class defines the affine mapping \f$\Phi:[-1,1]\rightarrow[a,b]\f$. It
  * maps from local coordinates to global ones.
  */
-class map_loc_glob : public FChebMapping
+class map_loc_glob : public FInterpMapping
 {
 public:
   explicit map_loc_glob(const FPoint& center, const FReal width)
-		: FChebMapping(center, width) {}
+		: FInterpMapping(center, width) {}
   
 	// globPos = (a + b) / 2 + (b - a) * loclPos / 2;
 	/**
@@ -157,4 +157,4 @@ public:
 
 
 
-#endif
+#endif /*FUNIFTENSOR_HPP*/
diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..fa61c83aada050e9112f2fc602464c6abc08480c
--- /dev/null
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
@@ -0,0 +1,215 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FINTERPMATRIXKERNEL_HPP
+#define FINTERPMATRIXKERNEL_HPP
+
+#include "../../Utils/FPoint.hpp"
+#include "../../Utils/FNoCopyable.hpp"
+#include "../../Utils/FMath.hpp"
+#include "../../Utils/FBlas.hpp"
+
+
+// extendable
+enum KERNEL_FUNCTION_IDENTIFIER {ONE_OVER_R,
+                                 ONE_OVER_R_SQUARED,
+                                 LENNARD_JONES_POTENTIAL};
+
+// probably not extedable :)
+enum KERNEL_FUNCTION_TYPE {HOMOGENEOUS, NON_HOMOGENEOUS};
+
+
+/**
+ * @author Matthias Messner (matthias.messner@inria.fr)
+ * @class FInterpMatrixKernels
+ * Please read the license
+ */
+struct FInterpAbstractMatrixKernel : FNoCopyable
+{
+  virtual ~FInterpAbstractMatrixKernel(){} // to remove warning
+  virtual FReal evaluate(const FPoint&, const FPoint&) const = 0;
+  // I need both functions because required arguments are not always given
+  virtual FReal getScaleFactor(const FReal, const int) const = 0;
+  virtual FReal getScaleFactor(const FReal) const = 0;
+};
+
+
+
+/// One over r
+struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel
+{
+  static const KERNEL_FUNCTION_TYPE Type = HOMOGENEOUS;
+  static const KERNEL_FUNCTION_IDENTIFIER Identifier = ONE_OVER_R;
+
+  FInterpMatrixKernelR() {}
+
+  FReal evaluate(const FPoint& x, const FPoint& y) const
+  {
+    const FPoint xy(x-y);
+    return FReal(1.) / FMath::Sqrt(xy.getX()*xy.getX() +
+                                   xy.getY()*xy.getY() +
+                                   xy.getZ()*xy.getZ());
+  }
+
+  FReal getScaleFactor(const FReal RootCellWidth, const int TreeLevel) const
+  {
+    const FReal CellWidth(RootCellWidth / FReal(FMath::pow(2, TreeLevel)));
+    return getScaleFactor(CellWidth);
+  }
+
+  FReal getScaleFactor(const FReal CellWidth) const
+  {
+    return FReal(2.) / CellWidth;
+  }
+};
+
+
+
+/// One over r^2
+struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel
+{
+  static const KERNEL_FUNCTION_TYPE Type = HOMOGENEOUS;
+  static const KERNEL_FUNCTION_IDENTIFIER Identifier = ONE_OVER_R_SQUARED;
+
+  FInterpMatrixKernelRR() {}
+
+  FReal evaluate(const FPoint& x, const FPoint& y) const
+  {
+    const FPoint xy(x-y);
+    return FReal(1.) / FReal(xy.getX()*xy.getX() +
+                             xy.getY()*xy.getY() +
+                             xy.getZ()*xy.getZ());
+  }
+
+  FReal getScaleFactor(const FReal RootCellWidth, const int TreeLevel) const
+  {
+    const FReal CellWidth(RootCellWidth / FReal(FMath::pow(2, TreeLevel)));
+    return getScaleFactor(CellWidth);
+  }
+
+  FReal getScaleFactor(const FReal CellWidth) const
+  {
+    return FReal(4.) / CellWidth;
+  }
+};
+
+
+
+/// One over r^12 - One over r^6
+struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel
+{
+  static const KERNEL_FUNCTION_TYPE Type = NON_HOMOGENEOUS;
+  static const KERNEL_FUNCTION_IDENTIFIER Identifier = LENNARD_JONES_POTENTIAL;
+
+  FInterpMatrixKernelLJ() {}
+
+  FReal evaluate(const FPoint& x, const FPoint& y) const
+  {
+    const FPoint xy(x-y);
+    const FReal r = xy.norm();
+    const FReal r3 = r*r*r;
+    const FReal one_over_r6 = FReal(1.) / (r3*r3);
+    //return one_over_r6 * one_over_r6;
+    //return one_over_r6;
+    return one_over_r6 * one_over_r6 - one_over_r6;
+  }
+
+  FReal getScaleFactor(const FReal, const int) const
+  {
+    // return 1 because non homogeneous kernel functions cannot be scaled!!!
+    return FReal(1.);
+  }
+
+  FReal getScaleFactor(const FReal) const
+  {
+    // return 1 because non homogeneous kernel functions cannot be scaled!!!
+    return FReal(1.);
+  }
+};
+
+
+
+
+
+/*!  Functor which provides the interface to assemble a matrix based on the
+  number of rows and cols and on the coordinates x and y and the type of the
+  generating matrix-kernel function.
+*/
+template <typename MatrixKernel>
+class EntryComputer
+{
+  const MatrixKernel Kernel;
+
+  const unsigned int nx, ny;
+  const FPoint *const px, *const py;
+
+  const FReal *const weights;
+
+public:
+  explicit EntryComputer(const unsigned int _nx, const FPoint *const _px,
+                         const unsigned int _ny, const FPoint *const _py,
+                         const FReal *const _weights = NULL)
+    : Kernel(),	nx(_nx), ny(_ny), px(_px), py(_py), weights(_weights) {}
+
+  //	template <typename Point>
+  //	void operator()(const unsigned int nx, const Point *const px,
+  //									const unsigned int ny, const Point *const py,
+  //									FReal *const data) const
+  //	{
+  //		for (unsigned int j=0; j<ny; ++j)
+  //			for (unsigned int i=0; i<nx; ++i)
+  //				data[j*nx + i] = Kernel.evaluate(px[i], py[j]);
+  //	}
+
+  void operator()(const unsigned int xbeg, const unsigned int xend,
+                  const unsigned int ybeg, const unsigned int yend,
+                  FReal *const data) const
+  {
+    unsigned int idx = 0;
+    if (weights) {
+      for (unsigned int j=ybeg; j<yend; ++j)
+        for (unsigned int i=xbeg; i<xend; ++i)
+          data[idx++] = weights[i] * weights[j] * Kernel.evaluate(px[i], py[j]);
+    } else {
+      for (unsigned int j=ybeg; j<yend; ++j)
+        for (unsigned int i=xbeg; i<xend; ++i)
+          data[idx++] = Kernel.evaluate(px[i], py[j]);
+    }
+
+    /*
+    // apply weighting matrices
+    if (weights) {
+    if ((xend-xbeg) == (yend-ybeg) && (xend-xbeg) == nx)
+    for (unsigned int n=0; n<nx; ++n) {
+    FBlas::scal(nx, weights[n], data + n,  nx); // scale rows
+    FBlas::scal(nx, weights[n], data + n * nx); // scale cols
+    }
+    else if ((xend-xbeg) == 1 && (yend-ybeg) == ny)
+    for (unsigned int j=0; j<ny; ++j)	data[j] *= weights[j];
+    else if ((yend-ybeg) == 1 && (xend-xbeg) == nx)
+    for (unsigned int i=0; i<nx; ++i)	data[i] *= weights[i];
+    }
+    */
+
+  }
+};
+
+
+
+
+
+#endif // FINTERPMATRIXKERNEL_HPP
+
+// [--END--]
diff --git a/Src/Kernels/Interpolation/FInterpP2PKernels.hpp b/Src/Kernels/Interpolation/FInterpP2PKernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..476d42d7863f8058b017dc62ee8f78258420c68c
--- /dev/null
+++ b/Src/Kernels/Interpolation/FInterpP2PKernels.hpp
@@ -0,0 +1,101 @@
+#ifndef FINTERPP2PKERNELS_HPP
+#define FINTERPP2PKERNELS_HPP
+
+
+#include "../P2P/FP2P.hpp"
+
+template <KERNEL_FUNCTION_IDENTIFIER Identifier, int NVALS>
+struct DirectInteractionComputer;
+
+///////////////////////////////////////////////////////
+// P2P Wrappers
+///////////////////////////////////////////////////////
+
+/*! Specialization for Laplace potential */
+template <>
+struct DirectInteractionComputer<ONE_OVER_R, 1>
+{
+  template <typename ContainerClass>
+  static void P2P( ContainerClass* const FRestrict TargetParticles,
+                   ContainerClass* const NeighborSourceParticles[27]){
+    FP2P::FullMutual(TargetParticles,NeighborSourceParticles,14);
+  }
+
+  template <typename ContainerClass>
+  static void P2PRemote( ContainerClass* const FRestrict inTargets,
+                         ContainerClass* const inNeighbors[27],
+                         const int inSize){
+    FP2P::FullRemote(inTargets,inNeighbors,inSize);
+  }
+};
+
+
+/*! Specialization for Lennard-Jones potential */
+template <>
+struct DirectInteractionComputer<LENNARD_JONES_POTENTIAL, 1>
+{
+  template <typename ContainerClass>
+  static void P2P(ContainerClass* const FRestrict TargetParticles,
+                  ContainerClass* const NeighborSourceParticles[27]){
+    FP2P::FullMutualLJ(TargetParticles,NeighborSourceParticles,14);
+  }
+
+  template <typename ContainerClass>
+  static void P2PRemote(ContainerClass* const FRestrict inTargets,
+                        ContainerClass* const inNeighbors[27],
+                        const int inSize){
+    FP2P::FullRemoteLJ(inTargets,inNeighbors,inSize);
+  }
+};
+
+///////////////////////////////////////////////////////
+// In case of multi right hand side
+///////////////////////////////////////////////////////
+
+
+
+template <int NVALS>
+struct DirectInteractionComputer<ONE_OVER_R, NVALS>
+{
+  template <typename ContainerClass>
+  static void P2P(ContainerClass* const FRestrict TargetParticles,
+                  ContainerClass* const NeighborSourceParticles[27]){
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      FP2P::FullMutual(TargetParticles,NeighborSourceParticles,14);
+    }
+  }
+
+  template <typename ContainerClass>
+  static void P2PRemote(ContainerClass* const FRestrict inTargets,
+                        ContainerClass* const inNeighbors[27],
+                        const int inSize){
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      FP2P::FullRemote(inTargets,inNeighbors,inSize);
+    }
+  }
+};
+
+
+/*! Specialization for Lennard-Jones potential */
+template <int NVALS>
+struct DirectInteractionComputer<LENNARD_JONES_POTENTIAL, NVALS>
+{
+  template <typename ContainerClass>
+  static void P2P(ContainerClass* const FRestrict TargetParticles,
+                  ContainerClass* const NeighborSourceParticles[27]){
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      FP2P::FullMutualLJ(TargetParticles,NeighborSourceParticles,14);
+    }
+  }
+
+  template <typename ContainerClass>
+  static void P2PRemote(ContainerClass* const FRestrict inTargets,
+                        ContainerClass* const inNeighbors[27],
+                        const int inSize){
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      FP2P::FullRemoteLJ(inTargets,inNeighbors,inSize);
+    }
+  }
+};
+
+#endif // FINTERPP2PKERNELS_HPP
diff --git a/Src/Kernels/Interpolation/FInterpTensor.hpp b/Src/Kernels/Interpolation/FInterpTensor.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..38ee2a2f4f9f57b5c3746a32019bb3449d90cd3d
--- /dev/null
+++ b/Src/Kernels/Interpolation/FInterpTensor.hpp
@@ -0,0 +1,154 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FINTERPTENSOR_HPP
+#define FINTERPTENSOR_HPP
+
+#include "../../Utils/FMath.hpp"
+
+#include "./FInterpMapping.hpp"
+
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * Please read the license
+ */
+
+/**
+ * @class TensorTraits
+ *
+ * The class @p TensorTraits gives the number of interpolation nodes per
+ * cluster in 3D, depending on the interpolation order.
+ *
+ * @tparam ORDER interpolation order
+ */
+template <int ORDER> struct TensorTraits
+{
+	enum {nnodes = ORDER*ORDER*ORDER};
+};
+
+
+/**
+ * @class FInterpTensor
+ *
+ * The class FInterpTensor provides function considering the tensor product
+ * interpolation.
+ *
+ * @tparam ORDER interpolation order \f$\ell\f$
+ * @tparam RootsClass class containing the roots choosen for the interpolation 
+ * (e.g. FChebRoots, FUnifRoots...)
+
+ */
+template <int ORDER, typename RootsClass>
+class FInterpTensor : FNoCopyable
+{
+  enum {nnodes = TensorTraits<ORDER>::nnodes};
+  typedef RootsClass BasisType;
+
+public:
+
+  /**
+   * Sets the ids of the coordinates of all \f$\ell^3\f$ interpolation
+   * nodes
+   *
+   * @param[out] NodeIds ids of coordinates of interpolation nodes
+   */
+  static
+  void setNodeIds(unsigned int NodeIds[nnodes][3])
+  {
+    for (unsigned int n=0; n<nnodes; ++n) {
+      NodeIds[n][0] =  n         % ORDER;
+      NodeIds[n][1] = (n/ ORDER) % ORDER;
+      NodeIds[n][2] =  n/(ORDER  * ORDER);
+    }
+  }
+
+
+  /**
+   * Sets the interpolation points in the cluster with @p center and @p width
+   *
+   * PB: tensorial version
+   *
+   * @param[in] center of cluster
+   * @param[in] width of cluster
+   * @param[out] rootPositions coordinates of interpolation points
+   */
+  static
+  void setRoots(const FPoint& center, const FReal width, FPoint rootPositions[nnodes])
+  {
+    unsigned int node_ids[nnodes][3];
+    setNodeIds(node_ids);
+    const map_loc_glob map(center, width);
+    FPoint localPosition;
+    for (unsigned int n=0; n<nnodes; ++n) {
+      localPosition.setX(FReal(BasisType::roots[node_ids[n][0]]));
+      localPosition.setY(FReal(BasisType::roots[node_ids[n][1]]));
+      localPosition.setZ(FReal(BasisType::roots[node_ids[n][2]]));
+      map(localPosition, rootPositions[n]);
+    }
+  }
+
+  /**
+   * Sets the equispaced roots in the cluster with @p center and @p width
+   *
+   * @param[in] center of cluster
+   * @param[in] width of cluster
+   * @param[out] roots coordinates of equispaced roots
+   */
+  static
+  void setPolynomialsRoots(const FPoint& center, const FReal width, FReal roots[3][ORDER])
+  {
+    const map_loc_glob map(center, width);
+    FPoint lPos, gPos;
+    for (unsigned int n=0; n<ORDER; ++n) {
+      lPos.setX(FReal(BasisType::roots[n]));
+      lPos.setY(FReal(BasisType::roots[n]));
+      lPos.setZ(FReal(BasisType::roots[n]));
+      map(lPos, gPos);
+      roots[0][n] = gPos.getX();
+      roots[1][n] = gPos.getY();
+      roots[2][n] = gPos.getZ();
+    }
+  }
+
+  /**
+   * Set the relative child (width = 1) center according to the Morton index.
+   *
+   * @param[in] ChildIndex index of child according to Morton index
+   * @param[out] center
+   */
+  static
+  void setRelativeChildCenter(const unsigned int ChildIndex,
+                              FPoint& ChildCenter)
+  {
+    const int RelativeChildPositions[][3] = { {-1, -1, -1},
+                                              {-1, -1,  1},
+                                              {-1,  1, -1},
+                                              {-1,  1,  1},
+                                              { 1, -1, -1},
+                                              { 1, -1,  1},
+                                              { 1,  1, -1},
+                                              { 1,  1,  1} };
+    ChildCenter.setX(FReal(RelativeChildPositions[ChildIndex][0]) / FReal(2.));
+    ChildCenter.setY(FReal(RelativeChildPositions[ChildIndex][1]) / FReal(2.));
+    ChildCenter.setZ(FReal(RelativeChildPositions[ChildIndex][2]) / FReal(2.));
+  }
+};
+
+
+
+
+
+#endif /*FINTERPTENSOR_HPP*/
diff --git a/Src/Kernels/Uniform/FAbstractUnifKernel.hpp b/Src/Kernels/Uniform/FAbstractUnifKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..18bcf6242ff89cf7bc5d9a9614f4c184d7843efb
--- /dev/null
+++ b/Src/Kernels/Uniform/FAbstractUnifKernel.hpp
@@ -0,0 +1,146 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FABSTRACTUNIFKERNEL_HPP
+#define FABSTRACTUNIFKERNEL_HPP
+
+#include "../../Utils/FGlobal.hpp"
+#include "../../Utils/FTrace.hpp"
+#include "../../Utils/FSmartPointer.hpp"
+
+#include "../../Components/FAbstractKernels.hpp"
+
+#include "../Interpolation/FInterpP2PKernels.hpp"
+#include "./FUnifInterpolator.hpp"
+
+#include "../../Containers/FTreeCoordinate.hpp"
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * @class FAbstractUnifKernel
+ * @brief
+ * This kernels implement the Lagrange interpolation based FMM operators. It
+ * implements all interfaces (P2P, P2M, M2M, M2L, L2L, L2P) which are required by
+ * the FFmmAlgorithm and FFmmAlgorithmThread.
+ *
+ * @tparam CellClass Type of cell
+ * @tparam ContainerClass Type of container to store particles
+ * @tparam MatrixKernelClass Type of matrix kernel function
+ * @tparam ORDER Lagrange interpolation order
+ */
+template < class CellClass,	class ContainerClass,	class MatrixKernelClass, int ORDER, int NVALS = 1>
+class FAbstractUnifKernel : public FAbstractKernels< CellClass, ContainerClass>
+{
+protected:
+  enum {nnodes = TensorTraits<ORDER>::nnodes};
+  typedef FUnifInterpolator<ORDER> InterpolatorClass;
+
+  /// Needed for P2M, M2M, L2L and L2P operators
+  const FSmartPointer<InterpolatorClass,FSmartPointerMemory> Interpolator;
+  /// Needed for P2P operator
+  const FSmartPointer<MatrixKernelClass,FSmartPointerMemory> MatrixKernel;
+  /// Height of the entire oct-tree
+  const unsigned int TreeHeight;
+  /// Corner of oct-tree box
+  const FPoint BoxCorner;
+  /// Width of oct-tree box
+  const FReal BoxWidth;
+  /// Width of a leaf cell box
+  const FReal BoxWidthLeaf;
+
+  /**
+   * Compute center of leaf cell from its tree coordinate.
+   * @param[in] Coordinate tree coordinate
+   * @return center of leaf cell
+   */
+  const FPoint getLeafCellCenter(const FTreeCoordinate& Coordinate) const
+  {
+    return FPoint(BoxCorner.getX() + (FReal(Coordinate.getX()) + FReal(.5)) * BoxWidthLeaf,
+                  BoxCorner.getY() + (FReal(Coordinate.getY()) + FReal(.5)) * BoxWidthLeaf,
+                  BoxCorner.getZ() + (FReal(Coordinate.getZ()) + FReal(.5)) * BoxWidthLeaf);
+  }
+
+public:
+  /**
+   * The constructor initializes all constant attributes and it reads the
+   * precomputed and compressed M2L operators from a binary file (an
+   * runtime_error is thrown if the required file is not valid).
+   */
+  FAbstractUnifKernel(const int inTreeHeight,
+                      const FPoint& inBoxCenter,
+                      const FReal inBoxWidth)
+    : Interpolator(new InterpolatorClass()),
+      MatrixKernel(new MatrixKernelClass()),
+      TreeHeight(inTreeHeight),
+      BoxCorner(inBoxCenter - inBoxWidth / FReal(2.)),
+      BoxWidth(inBoxWidth),
+      BoxWidthLeaf(BoxWidth / FReal(FMath::pow(2, inTreeHeight - 1)))
+  {
+    /* empty */
+  }
+
+  virtual ~FAbstractUnifKernel(){
+    // should not be used
+  }
+
+  const InterpolatorClass *const getPtrToInterpolator() const
+  { return Interpolator.getPtr(); }
+
+
+  virtual void P2M(CellClass* const LeafCell,
+                   const ContainerClass* const SourceParticles) = 0;
+
+
+  virtual void M2M(CellClass* const FRestrict ParentCell,
+                   const CellClass*const FRestrict *const FRestrict ChildCells,
+                   const int TreeLevel) = 0;
+
+
+  virtual void M2L(CellClass* const FRestrict TargetCell,
+                   const CellClass* SourceCells[343],
+                   const int NumSourceCells,
+                   const int TreeLevel) = 0;
+
+
+  virtual void L2L(const CellClass* const FRestrict ParentCell,
+                   CellClass* FRestrict *const FRestrict ChildCells,
+                   const int TreeLevel) = 0;
+
+
+  virtual void L2P(const CellClass* const LeafCell,
+                   ContainerClass* const TargetParticles) = 0;
+
+
+
+  virtual void P2P(const FTreeCoordinate& /* LeafCellCoordinate */, // needed for periodic boundary conditions
+                   ContainerClass* const FRestrict TargetParticles,
+                   const ContainerClass* const FRestrict /*SourceParticles*/,
+                   ContainerClass* const NeighborSourceParticles[27],
+                   const int /* size */) = 0;
+
+
+  virtual void P2PRemote(const FTreeCoordinate& /*inPosition*/,
+                         ContainerClass* const FRestrict inTargets, const ContainerClass* const FRestrict /*inSources*/,
+                         ContainerClass* const inNeighbors[27], const int /*inSize*/) = 0;
+
+};
+
+
+
+
+
+#endif //FABSTRACTUNIFKERNEL_HPP
+
+// [--END--]
diff --git a/Src/Kernels/Uniform/FUnifCell.hpp b/Src/Kernels/Uniform/FUnifCell.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7584f857572cff4619668ac6537f398bd053ea84
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifCell.hpp
@@ -0,0 +1,142 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FUNIFCELL_HPP
+#define FUNIFCELL_HPP
+
+
+#include "../../Extensions/FExtendMortonIndex.hpp"
+#include "../../Extensions/FExtendCoordinate.hpp"
+
+#include "./FUnifTensor.hpp"
+
+#include "../../Extensions/FExtendCellType.hpp"
+
+#include "../../Utils/FComplexe.hpp"
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * @class FUnifCell
+ * Please read the license
+ *
+ * This class defines a cell used in the Lagrange based FMM.
+ * 
+ * PB: !!! exactly the same as FChebCell except the TensorTraits used is 
+ * the one from FUnifTensor. This is a quick work around to avoid multiple 
+ * definition of TensorTraits until it is factorized for all interpolations.
+ *
+ * PB: This class also contains the storage and accessors for the tranformed 
+ * expansion  (in Fourier space, i.e. complex valued).
+ *
+ * @param NVALS is the number of right hand side.
+ */
+template <int ORDER, int NVALS = 1>
+class FUnifCell : public FExtendMortonIndex, public FExtendCoordinate
+{
+  static const int VectorSize = TensorTraits<ORDER>::nnodes;
+  static const int TransformedVectorSize = (2*ORDER-1)*(2*ORDER-1)*(2*ORDER-1);
+
+  FReal multipole_exp[NVALS * VectorSize]; //< Multipole expansion
+  FReal     local_exp[NVALS * VectorSize]; //< Local expansion
+
+  // PB: Store multipole and local expansion in Fourier space
+  FComplexe transformed_multipole_exp[NVALS * TransformedVectorSize];
+  FComplexe     transformed_local_exp[NVALS * TransformedVectorSize];
+
+public:
+  FUnifCell(){
+    memset(multipole_exp, 0, sizeof(FReal) * NVALS * VectorSize);
+    memset(local_exp, 0, sizeof(FReal) * NVALS * VectorSize);
+    memset(transformed_multipole_exp, 0, 
+           sizeof(FComplexe) * NVALS * TransformedVectorSize);
+    memset(transformed_local_exp, 0, 
+           sizeof(FComplexe) * NVALS * TransformedVectorSize);
+  }
+
+  ~FUnifCell() {}
+
+  /** Get Multipole */
+  const FReal* getMultipole(const int inRhs) const
+  {	return this->multipole_exp + inRhs*VectorSize;
+  }
+  /** Get Local */
+  const FReal* getLocal(const int inRhs) const{
+    return this->local_exp + inRhs*VectorSize;
+  }
+
+  /** Get Multipole */
+  FReal* getMultipole(const int inRhs){
+    return this->multipole_exp + inRhs*VectorSize;
+  }
+  /** Get Local */
+  FReal* getLocal(const int inRhs){
+    return this->local_exp + inRhs*VectorSize;
+  }
+
+  /** To get the leading dim of a vec */
+  int getVectorSize() const{
+    return VectorSize;
+  }
+
+  /** Make it like the begining */
+  void resetToInitialState(){
+    memset(multipole_exp, 0, sizeof(FReal) * NVALS * VectorSize);
+    memset(local_exp, 0, sizeof(FReal) * NVALS * VectorSize);
+    memset(transformed_multipole_exp, 0, 
+           sizeof(FComplexe) * NVALS * TransformedVectorSize);
+    memset(transformed_local_exp, 0, 
+           sizeof(FComplexe) * NVALS * TransformedVectorSize);
+  }
+
+  /** Get Transformed Multipole */
+  const FComplexe* getTransformedMultipole(const int inRhs) const
+  {	return this->transformed_multipole_exp + inRhs*TransformedVectorSize;
+  }
+  /** Get Transformed Local */
+  const FComplexe* getTransformedLocal(const int inRhs) const{
+    return this->transformed_local_exp + inRhs*TransformedVectorSize;
+  }
+
+  /** Get Transformed Multipole */
+  FComplexe* getTransformedMultipole(const int inRhs){
+    return this->transformed_multipole_exp + inRhs*TransformedVectorSize;
+  }
+  /** Get Transformed Local */
+  FComplexe* getTransformedLocal(const int inRhs){
+    return this->transformed_local_exp + inRhs*TransformedVectorSize;
+  }
+
+};
+
+template <int ORDER, int NVALS = 1>
+class FTypedUnifCell : public FUnifCell<ORDER,NVALS>, public FExtendCellType {
+public:
+  template <class BufferWriterClass>
+  void save(BufferWriterClass& buffer) const{
+    FUnifCell<ORDER,NVALS>::save(buffer);
+    FExtendCellType::save(buffer);
+  }
+  template <class BufferReaderClass>
+  void restore(BufferReaderClass& buffer){
+    FUnifCell<ORDER,NVALS>::restore(buffer);
+    FExtendCellType::restore(buffer);
+  }
+  void resetToInitialState(){
+    FUnifCell<ORDER,NVALS>::resetToInitialState();
+    FExtendCellType::resetToInitialState();
+  }
+};
+
+#endif //FUNIFCELL_HPP
diff --git a/Src/Kernels/Uniform/FUnifInterpolator.hpp b/Src/Kernels/Uniform/FUnifInterpolator.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..4f7acccbf064c7643a97c12e3436148f56835b3a
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifInterpolator.hpp
@@ -0,0 +1,550 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FUNIFINTERPOLATOR_HPP
+#define FUNIFINTERPOLATOR_HPP
+
+
+#include "./../Interpolation/FInterpMapping.hpp"
+
+#include "./FUnifTensor.hpp"
+#include "./FUnifRoots.hpp"
+
+#include "../../Utils/FBlas.hpp"
+
+
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * Please read the license
+ */
+
+/**
+ * @class FUnifInterpolator
+ *
+ * The class @p FUnifInterpolator defines the anterpolation (M2M) and
+ * interpolation (L2L) concerning operations.
+ */
+template <int ORDER>
+class FUnifInterpolator : FNoCopyable
+{
+  // compile time constants and types
+  enum {nnodes = TensorTraits<ORDER>::nnodes};
+  typedef FUnifRoots< ORDER>  BasisType;
+  typedef FUnifTensor<ORDER> TensorType;
+
+  //  FReal T_of_roots[ORDER][ORDER];
+  //  FReal T[ORDER * (ORDER-1)];
+  unsigned int node_ids[nnodes][3];
+  FReal* ChildParentInterpolator[8];
+
+  // permutations (only needed in the tensor product interpolation case)
+  unsigned int perm[3][nnodes];
+
+  ////////////////////////////////////////////////////////////////////
+
+
+
+  /**
+   * Initialize the child - parent - interpolator, it is basically the matrix
+   * S which is precomputed and reused for all M2M and L2L operations, ie for
+   * all non leaf inter/anterpolations.
+   */
+  void initM2MandL2L()
+  {
+    FPoint ParentRoots[nnodes], ChildRoots[nnodes];
+    const FReal ParentWidth(2.);
+    const FPoint ParentCenter(0., 0., 0.);
+    FUnifTensor<ORDER>::setRoots(ParentCenter, ParentWidth, ParentRoots);
+
+    FPoint ChildCenter;
+    const FReal ChildWidth(1.);
+
+    // loop: child cells
+    for (unsigned int child=0; child<8; ++child) {
+
+      // allocate memory
+      ChildParentInterpolator[child] = new FReal [nnodes * nnodes];
+
+      // set child info
+      FUnifTensor<ORDER>::setRelativeChildCenter(child, ChildCenter);
+      FUnifTensor<ORDER>::setRoots(ChildCenter, ChildWidth, ChildRoots);
+
+      // assemble child - parent - interpolator
+      assembleInterpolator(nnodes, ChildRoots, ChildParentInterpolator[child]);
+    }
+  }
+
+  /**
+   * Initialize the child - parent - interpolator, it is basically the matrix
+   * S which is precomputed and reused for all M2M and L2L operations, ie for
+   * all non leaf inter/anterpolations.
+   */
+  void initTensorM2MandL2L()
+  {
+    FPoint ParentRoots[nnodes];
+    FReal ChildCoords[3][ORDER];
+    const FReal ParentWidth(2.);
+    const FPoint ParentCenter(0., 0., 0.);
+    FUnifTensor<ORDER>::setRoots(ParentCenter, ParentWidth, ParentRoots);
+
+    FPoint ChildCenter;
+    const FReal ChildWidth(1.);
+
+    // loop: child cells
+    for (unsigned int child=0; child<8; ++child) {
+
+      // set child info
+      FUnifTensor<ORDER>::setRelativeChildCenter(child, ChildCenter);
+      FUnifTensor<ORDER>::setPolynomialsRoots(ChildCenter, ChildWidth, ChildCoords);
+
+      // allocate memory
+      ChildParentInterpolator[child] = new FReal [3 * ORDER*ORDER];
+      assembleInterpolator(ORDER, ChildCoords[0], ChildParentInterpolator[child]);
+      assembleInterpolator(ORDER, ChildCoords[1], ChildParentInterpolator[child] + 1 * ORDER*ORDER);
+      assembleInterpolator(ORDER, ChildCoords[2], ChildParentInterpolator[child] + 2 * ORDER*ORDER);
+    }
+
+
+    // init permutations
+    for (unsigned int i=0; i<ORDER; ++i) {
+      for (unsigned int j=0; j<ORDER; ++j) {
+        for (unsigned int k=0; k<ORDER; ++k) {
+          const unsigned int index = k*ORDER*ORDER + j*ORDER + i;
+          perm[0][index] = k*ORDER*ORDER + j*ORDER + i;
+          perm[1][index] = i*ORDER*ORDER + k*ORDER + j;
+          perm[2][index] = j*ORDER*ORDER + i*ORDER + k;
+        }
+      }
+    }
+
+  }
+
+
+
+public:
+  /**
+   * Constructor: Initialize the Lagrange polynomials at the equispaced
+   * roots/interpolation point
+   */
+  explicit FUnifInterpolator()
+  {
+    //    // initialize chebyshev polynomials of root nodes: T_o(x_j)
+    //    for (unsigned int o=1; o<ORDER; ++o)
+    //      for (unsigned int j=0; j<ORDER; ++j)
+    //        T_of_roots[o][j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
+    //
+    //    // initialize chebyshev polynomials of root nodes: T_o(x_j)
+    //    for (unsigned int o=1; o<ORDER; ++o)
+    //      for (unsigned int j=0; j<ORDER; ++j)
+    //        T[(o-1)*ORDER + j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
+
+
+    // initialize root node ids
+    TensorType::setNodeIds(node_ids);
+
+    // initialize interpolation operator for non M2M and L2L (non leaf
+    // operations)
+    //this -> initM2MandL2L();     // non tensor-product interpolation
+    this -> initTensorM2MandL2L(); // tensor-product interpolation
+  }
+
+
+  /**
+   * Destructor: Delete dynamically allocated memory for M2M and L2L operator
+   */
+  ~FUnifInterpolator()
+  {
+    for (unsigned int child=0; child<8; ++child)
+      delete [] ChildParentInterpolator[child];
+  }
+
+
+  /**
+   * Assembles the interpolator \f$S_\ell\f$ of size \f$N\times
+   * \ell^3\f$. Here local points is meant as points whose global coordinates
+   * have already been mapped to the reference interval [-1,1].
+   *
+   * @param[in] NumberOfLocalPoints
+   * @param[in] LocalPoints
+   * @param[out] Interpolator
+   */
+  void assembleInterpolator(const unsigned int NumberOfLocalPoints,
+                            const FPoint *const LocalPoints,
+                            FReal *const Interpolator) const
+  {
+    // values of Lagrange polynomials of source particle: L_o(x_i)
+    FReal L_of_x[ORDER][3];
+    // loop: local points (mapped in [-1,1])
+    for (unsigned int m=0; m<NumberOfLocalPoints; ++m) {
+      // evaluate Lagrange polynomials at local points
+      for (unsigned int o=0; o<ORDER; ++o) {
+        L_of_x[o][0] = BasisType::L(o, LocalPoints[m].getX());
+        L_of_x[o][1] = BasisType::L(o, LocalPoints[m].getY());
+        L_of_x[o][2] = BasisType::L(o, LocalPoints[m].getZ());
+      }
+
+      // assemble interpolator
+      for (unsigned int n=0; n<nnodes; ++n) {
+        Interpolator[n*NumberOfLocalPoints + m] = FReal(1.);
+        for (unsigned int d=0; d<3; ++d) {
+          const unsigned int j = node_ids[n][d];
+          // The Lagrange case is much simpler than the Chebyshev case
+          // as no summation is required
+          Interpolator[n*NumberOfLocalPoints + m] *= L_of_x[j][d];
+        }
+
+      }
+
+    }
+
+  }
+
+
+  void assembleInterpolator(const unsigned int M, const FReal *const x, FReal *const S) const
+  {
+    // loop: local points (mapped in [-1,1])
+    for (unsigned int m=0; m<M; ++m) {
+      // evaluate Lagrange polynomials at local points
+      for (unsigned int o=0; o<ORDER; ++o)
+        S[o*M + m] = BasisType::L(o, x[m]);
+
+    }
+
+  }
+
+
+
+  const FReal *const *const getChildParentInterpolator() const
+  { return ChildParentInterpolator; }
+  const unsigned int *const getPermutationsM2ML2L(unsigned int i) const
+  { return perm[i]; }
+
+
+
+
+
+
+  /**
+   * Particle to moment: application of \f$S_\ell(y,\bar y_n)\f$
+   * (anterpolation, it is the transposed interpolation)
+   */
+  template <class ContainerClass>
+  void applyP2M(const FPoint& center,
+                const FReal width,
+                FReal *const multipoleExpansion,
+                const ContainerClass *const sourceParticles) const;
+
+
+
+  /**
+   * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
+   */
+  template <class ContainerClass>
+  void applyL2P(const FPoint& center,
+                const FReal width,
+                const FReal *const localExpansion,
+                ContainerClass *const localParticles) const;
+
+
+  /**
+   * Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
+   */
+  template <class ContainerClass>
+  void applyL2PGradient(const FPoint& center,
+                        const FReal width,
+                        const FReal *const localExpansion,
+                        ContainerClass *const localParticles) const;
+
+  /**
+   * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ and
+   * \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
+   */
+  template <class ContainerClass>
+  void applyL2PTotal(const FPoint& center,
+                     const FReal width,
+                     const FReal *const localExpansion,
+                     ContainerClass *const localParticles) const;
+
+
+  /*
+    void applyM2M(const unsigned int ChildIndex,
+    const FReal *const ChildExpansion,
+    FReal *const ParentExpansion) const
+    {
+    FBlas::gemtva(nnodes, nnodes, FReal(1.),
+    ChildParentInterpolator[ChildIndex],
+    const_cast<FReal*>(ChildExpansion), ParentExpansion);
+    }
+
+    void applyL2L(const unsigned int ChildIndex,
+    const FReal *const ParentExpansion,
+    FReal *const ChildExpansion) const
+    {
+    FBlas::gemva(nnodes, nnodes, FReal(1.),
+    ChildParentInterpolator[ChildIndex],
+    const_cast<FReal*>(ParentExpansion), ChildExpansion);
+    }
+  */
+
+  // PB: improvement of applyM2M/L2L can be preserved (TOFACTO)
+  // since relative position of child and parent interpolation is remains unchanged
+
+  void applyM2M(const unsigned int ChildIndex,
+                const FReal *const ChildExpansion,
+                FReal *const ParentExpansion) const
+  {
+    FReal Exp[nnodes], PermExp[nnodes];
+    // ORDER*ORDER*ORDER * (2*ORDER-1)
+    FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
+                 ChildParentInterpolator[ChildIndex], ORDER,
+                 const_cast<FReal*>(ChildExpansion), ORDER, PermExp, ORDER);
+
+    for (unsigned int n=0; n<nnodes; ++n)	Exp[n] = PermExp[perm[1][n]];
+    // ORDER*ORDER*ORDER * (2*ORDER-1)
+    FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
+                 ChildParentInterpolator[ChildIndex] + 2 * ORDER*ORDER, ORDER,
+                 Exp, ORDER, PermExp, ORDER);
+
+    for (unsigned int n=0; n<nnodes; ++n)	Exp[perm[1][n]] = PermExp[perm[2][n]];
+    // ORDER*ORDER*ORDER * (2*ORDER-1)
+    FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
+                 ChildParentInterpolator[ChildIndex] + 1 * ORDER*ORDER, ORDER,
+                 Exp, ORDER, PermExp, ORDER);
+
+    for (unsigned int n=0; n<nnodes; ++n)	ParentExpansion[perm[2][n]] += PermExp[n];
+  }
+
+
+  void applyL2L(const unsigned int ChildIndex,
+                const FReal *const ParentExpansion,
+                FReal *const ChildExpansion) const
+  {
+    FReal Exp[nnodes], PermExp[nnodes];
+    // ORDER*ORDER*ORDER * (2*ORDER-1)
+    FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
+                ChildParentInterpolator[ChildIndex], ORDER,
+                const_cast<FReal*>(ParentExpansion), ORDER, PermExp, ORDER);
+
+    for (unsigned int n=0; n<nnodes; ++n)	Exp[n] = PermExp[perm[1][n]];
+    // ORDER*ORDER*ORDER * (2*ORDER-1)
+    FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
+                ChildParentInterpolator[ChildIndex] + 2 * ORDER*ORDER, ORDER,
+                Exp, ORDER, PermExp, ORDER);
+
+    for (unsigned int n=0; n<nnodes; ++n)	Exp[perm[1][n]] = PermExp[perm[2][n]];
+    // ORDER*ORDER*ORDER * (2*ORDER-1)
+    FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
+                ChildParentInterpolator[ChildIndex] + 1 * ORDER*ORDER, ORDER,
+                Exp, ORDER, PermExp, ORDER);
+
+    for (unsigned int n=0; n<nnodes; ++n)	ChildExpansion[perm[2][n]] += PermExp[n];
+  }
+  // total flops count: 3 * ORDER*ORDER*ORDER * (2*ORDER-1)
+};
+
+
+
+
+
+
+
+/**
+ * Particle to moment: application of \f$S_\ell(y,\bar y_n)\f$
+ * (anterpolation, it is the transposed interpolation)
+ */
+template <int ORDER>
+template <class ContainerClass>
+inline void FUnifInterpolator<ORDER>::applyP2M(const FPoint& center,
+                                               const FReal width,
+                                               FReal *const multipoleExpansion,
+                                               const ContainerClass *const inParticles) const
+{
+  // set all multipole expansions to zero
+  FBlas::setzero(nnodes, multipoleExpansion);
+
+  // allocate stuff
+  const map_glob_loc map(center, width);
+  FPoint localPosition;
+
+  // loop over source particles
+  const FReal*const physicalValues = inParticles->getPhysicalValues();
+  const FReal*const positionsX = inParticles->getPositions()[0];
+  const FReal*const positionsY = inParticles->getPositions()[1];
+  const FReal*const positionsZ = inParticles->getPositions()[2];
+
+  for(int idxPart = 0 ; idxPart < inParticles->getNbParticles() ; ++idxPart){
+    // map global position to [-1,1]
+    map(FPoint(positionsX[idxPart],positionsY[idxPart],positionsZ[idxPart]), localPosition); // 15 flops
+    // evaluate Lagrange polynomial at local position
+    FReal L_of_x[ORDER][3];
+    for (unsigned int o=0; o<ORDER; ++o) {
+      L_of_x[o][0] = BasisType::L(o, localPosition.getX()); // 3 * ORDER*(ORDER-1) flops PB: TODO confirm
+      L_of_x[o][1] = BasisType::L(o, localPosition.getY()); // 3 * ORDER*(ORDER-1) flops
+      L_of_x[o][2] = BasisType::L(o, localPosition.getZ()); // 3 * ORDER*(ORDER-1) flops
+    }
+    // read physicalValue
+    const FReal weight = physicalValues[idxPart];
+
+    // assemble multipole expansions
+    for (unsigned int i=0; i<ORDER; ++i) {
+      for (unsigned int j=0; j<ORDER; ++j) {
+        for (unsigned int k=0; k<ORDER; ++k) {
+          const unsigned int idx = k*ORDER*ORDER + j*ORDER + i;
+          multipoleExpansion[idx] += L_of_x[i][0] * L_of_x[j][1] * L_of_x[k][2] * weight; // 3 * ORDER*ORDER*ORDER flops
+        }
+      }
+    }
+
+  } // flops: N * (3 * ORDER*ORDER*ORDER + 3 * 3 * ORDER*(ORDER-1)) flops
+
+}
+
+
+/**
+ * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
+ */
+template <int ORDER>
+template <class ContainerClass>
+inline void FUnifInterpolator<ORDER>::applyL2P(const FPoint& center,
+                                               const FReal width,
+                                               const FReal *const localExpansion,
+                                               ContainerClass *const inParticles) const
+{
+  // loop over particles
+  const map_glob_loc map(center, width);
+  FPoint localPosition;
+
+  //const FReal*const physicalValues = inParticles->getPhysicalValues();
+  const FReal*const positionsX = inParticles->getPositions()[0];
+  const FReal*const positionsY = inParticles->getPositions()[1];
+  const FReal*const positionsZ = inParticles->getPositions()[2];
+  FReal*const potentials = inParticles->getPotentials();
+
+  for(int idxPart = 0 ; idxPart < inParticles->getNbParticles() ; ++ idxPart){
+
+    // map global position to [-1,1]
+    map(FPoint(positionsX[idxPart],positionsY[idxPart],positionsZ[idxPart]), localPosition); // 15 flops
+
+    // evaluate Lagrange polynomial at local position
+    FReal L_of_x[ORDER][3];
+    for (unsigned int o=0; o<ORDER; ++o) {
+      L_of_x[o][0] = BasisType::L(o, localPosition.getX()); // 3 * ORDER*(ORDER-1) flops PB: TODO confirm
+      L_of_x[o][1] = BasisType::L(o, localPosition.getY()); // 3 * ORDER*(ORDER-1) flops
+      L_of_x[o][2] = BasisType::L(o, localPosition.getZ()); // 3 * ORDER*(ORDER-1) flops
+    }
+
+    // interpolate and increment target value
+    FReal targetValue=0.;
+    {
+      for (unsigned int l=0; l<ORDER; ++l) {
+        for (unsigned int m=0; m<ORDER; ++m) {
+          for (unsigned int n=0; n<ORDER; ++n) {
+            const unsigned int idx = n*ORDER*ORDER + m*ORDER + l;
+            targetValue +=
+              L_of_x[l][0] * L_of_x[m][1] * L_of_x[n][2] * localExpansion[idx];
+          } // ORDER * 4 flops
+        } // ORDER * ORDER * 4 flops
+      } // ORDER * ORDER * ORDER * 4 flops
+    }
+
+    // set potential
+    potentials[idxPart] += (targetValue);
+  } // N * (4 * ORDER * ORDER * ORDER + 9 * ORDER*(ORDER-1) ) flops
+}
+
+
+
+/**
+ * Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
+ */
+template <int ORDER>
+template <class ContainerClass>
+inline void FUnifInterpolator<ORDER>::applyL2PGradient(const FPoint& center,
+                                                       const FReal width,
+                                                       const FReal *const localExpansion,
+                                                       ContainerClass *const inParticles) const
+{
+  ////////////////////////////////////////////////////////////////////
+  // TENSOR-PRODUCT INTERPOLUTION NOT IMPLEMENTED YET HERE!!! ////////
+  ////////////////////////////////////////////////////////////////////
+
+  // setup local to global mapping
+  const map_glob_loc map(center, width);
+  FPoint Jacobian;
+  map.computeJacobian(Jacobian);
+  const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()};
+  FPoint localPosition;
+  FReal L_of_x[ORDER][3];
+  FReal dL_of_x[ORDER][3];
+
+  const FReal*const physicalValues = inParticles->getPhysicalValues();
+  const FReal*const positionsX = inParticles->getPositions()[0];
+  const FReal*const positionsY = inParticles->getPositions()[1];
+  const FReal*const positionsZ = inParticles->getPositions()[2];
+  FReal*const forcesX = inParticles->getForcesX();
+  FReal*const forcesY = inParticles->getForcesY();
+  FReal*const forcesZ = inParticles->getForcesZ();
+  //FReal*const potentials = inParticles->getPotentials();
+
+  for(int idxPart = 0 ; idxPart < inParticles->getNbParticles() ; ++ idxPart){
+
+    // map global position to [-1,1]
+    map(FPoint(positionsX[idxPart],positionsY[idxPart],positionsZ[idxPart]), localPosition);
+
+    // evaluate Lagrange polynomials of source particle
+    for (unsigned int o=0; o<ORDER; ++o) {
+      L_of_x[o][0] = BasisType::L(o, localPosition.getX()); // 3 * ORDER*(ORDER-1) flops PB: TODO confirm
+      L_of_x[o][1] = BasisType::L(o, localPosition.getY()); // 3 * ORDER*(ORDER-1) flops
+      L_of_x[o][2] = BasisType::L(o, localPosition.getZ()); // 3 * ORDER*(ORDER-1) flops
+      dL_of_x[o][0] = BasisType::dL(o, localPosition.getX()); // TODO verify 3 * ORDER*(ORDER-1) flops PB: TODO confirm
+      dL_of_x[o][1] = BasisType::dL(o, localPosition.getY()); // TODO verify 3 * ORDER*(ORDER-1) flops
+      dL_of_x[o][2] = BasisType::dL(o, localPosition.getZ()); // TODO verify 3 * ORDER*(ORDER-1) flops
+    }
+
+    // interpolate and increment forces value
+    FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)};
+    {
+      for (unsigned int l=0; l<ORDER; ++l) {
+        for (unsigned int m=0; m<ORDER; ++m) {
+          for (unsigned int n=0; n<ORDER; ++n) {
+            const unsigned int idx = n*ORDER*ORDER + m*ORDER + l;
+            forces[0] +=
+              dL_of_x[l][0] * L_of_x[m][1] * L_of_x[n][2] * localExpansion[idx];
+            forces[1] +=
+              L_of_x[l][0] * dL_of_x[m][1] * L_of_x[n][2] * localExpansion[idx];
+            forces[2] +=
+              L_of_x[l][0] * L_of_x[m][1] * dL_of_x[n][2] * localExpansion[idx];
+
+          } // ORDER * 4 flops
+        } // ORDER * ORDER * 4 flops
+      } // ORDER * ORDER * ORDER * 4 flops
+
+
+      // scale forces
+      forces[0] *= jacobian[0];// / nnodes;
+      forces[1] *= jacobian[1];// / nnodes;
+      forces[2] *= jacobian[2];// / nnodes;
+    }
+
+    // set computed forces
+    forcesX[idxPart] += forces[0] * physicalValues[idxPart];
+    forcesY[idxPart] += forces[1] * physicalValues[idxPart];
+    forcesZ[idxPart] += forces[2] * physicalValues[idxPart];
+  }
+}
+
+
+#endif /* FUNIFINTERPOLATOR_HPP */
diff --git a/Src/Kernels/Uniform/FUnifKernel.hpp b/Src/Kernels/Uniform/FUnifKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bd37ed90025df313ee5f8bdeb02e42974fc27bf
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifKernel.hpp
@@ -0,0 +1,229 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FUNIFKERNEL_HPP
+#define FUNIFKERNEL_HPP
+
+#include "../../Utils/FGlobal.hpp"
+#include "../../Utils/FTrace.hpp"
+#include "../../Utils/FSmartPointer.hpp"
+
+#include "./FAbstractUnifKernel.hpp"
+#include "./FUnifM2LHandler.hpp"
+
+class FTreeCoordinate;
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * @class FUnifKernel
+ * @brief
+ * Please read the license
+ *
+ * This kernels implement the Lagrange interpolation based FMM operators. It
+ * implements all interfaces (P2P,P2M,M2M,M2L,L2L,L2P) which are required by
+ * the FFmmAlgorithm and FFmmAlgorithmThread.
+ *
+ * @tparam CellClass Type of cell
+ * @tparam ContainerClass Type of container to store particles
+ * @tparam MatrixKernelClass Type of matrix kernel function
+ * @tparam ORDER Lagrange interpolation order
+ */
+template < class CellClass,	class ContainerClass,	class MatrixKernelClass, int ORDER, int NVALS = 1>
+class FUnifKernel
+  : public FAbstractUnifKernel< CellClass, ContainerClass, MatrixKernelClass, ORDER, NVALS>
+{
+  // private types
+  typedef FUnifM2LHandler<ORDER,MatrixKernelClass> M2LHandlerClass;
+
+  // using from
+  typedef FAbstractUnifKernel< CellClass, ContainerClass, MatrixKernelClass, ORDER, NVALS>
+  AbstractBaseClass;
+
+  /// Needed for M2L operator
+  FSmartPointer<  M2LHandlerClass,FSmartPointerMemory> M2LHandler;
+
+public:
+  /**
+   * The constructor initializes all constant attributes and it reads the
+   * precomputed and compressed M2L operators from a binary file (an
+   * runtime_error is thrown if the required file is not valid).
+   */
+  FUnifKernel(const int inTreeHeight,
+              const FPoint& inBoxCenter,
+              const FReal inBoxWidth)
+    : FAbstractUnifKernel< CellClass, ContainerClass, MatrixKernelClass, ORDER, NVALS>(inTreeHeight,
+                                                                                       inBoxCenter,
+                                                                                       inBoxWidth),
+      M2LHandler(new M2LHandlerClass())
+  {
+    // read precomputed compressed m2l operators from binary file
+    //M2LHandler->ReadFromBinaryFileAndSet(); // PB: TODO?
+    M2LHandler->ComputeAndSet();
+  }
+
+
+  void P2M(CellClass* const LeafCell,
+           const ContainerClass* const SourceParticles)
+  {
+    const FPoint LeafCellCenter(AbstractBaseClass::getLeafCellCenter(LeafCell->getCoordinate()));
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      // 1) apply Sy
+      AbstractBaseClass::Interpolator->applyP2M(LeafCellCenter, AbstractBaseClass::BoxWidthLeaf,
+                                                LeafCell->getMultipole(idxRhs), SourceParticles);
+      // 2) apply Discrete Fourier Transform
+      M2LHandler->applyZeroPaddingAndDFT(LeafCell->getMultipole(idxRhs), 
+                                         LeafCell->getTransformedMultipole(idxRhs));
+
+    }
+  }
+
+
+  void M2M(CellClass* const FRestrict ParentCell,
+           const CellClass*const FRestrict *const FRestrict ChildCells,
+           const int /*TreeLevel*/)
+  {
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      // 1) apply Sy
+      FBlas::scal(AbstractBaseClass::nnodes*2, FReal(0.), ParentCell->getMultipole(idxRhs));
+      for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex){
+        if (ChildCells[ChildIndex]){
+          AbstractBaseClass::Interpolator->applyM2M(ChildIndex, ChildCells[ChildIndex]->getMultipole(idxRhs),
+                                                    ParentCell->getMultipole(idxRhs));
+        }
+      }
+      // 2) Apply Discete Fourier Transform
+      M2LHandler->applyZeroPaddingAndDFT(ParentCell->getMultipole(idxRhs), 
+                                         ParentCell->getTransformedMultipole(idxRhs));
+    }
+  }
+
+
+  //	void M2L(CellClass* const FRestrict TargetCell,
+  //           const CellClass* SourceCells[343],
+  //           const int NumSourceCells,
+  //           const int TreeLevel) const
+  //	{
+  //		const FReal CellWidth(BoxWidth / FReal(FMath::pow(2, TreeLevel)));
+  //		const FTreeCoordinate& cx = TargetCell->getCoordinate();
+  //		for (int idx=0; idx<NumSourceCells; ++idx) {
+  //			const FTreeCoordinate& cy = SourceCells[idx]->getCoordinate();
+  //			const int transfer[3] = {cy.getX()-cx.getX(),
+  //                               cy.getY()-cx.getY(),
+  //                               cy.getZ()-cx.getZ()};
+  //			M2LHandler->applyC(transfer, CellWidth,
+  //												SourceCells[idx]->getMultipole() + AbstractBaseClass::nnodes,
+  //												TargetCell->getLocal() + AbstractBaseClass::nnodes);
+  //		}
+  //	}
+
+  void M2L(CellClass* const FRestrict TargetCell,
+           const CellClass* SourceCells[343],
+           const int /*NumSourceCells*/,
+           const int TreeLevel)
+  {
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      FComplexe *const TransformedLocalExpansion = TargetCell->getTransformedLocal(idxRhs);
+
+      const FReal CellWidth(AbstractBaseClass::BoxWidth / FReal(FMath::pow(2, TreeLevel)));
+      for (int idx=0; idx<343; ++idx){
+        if (SourceCells[idx]){
+          M2LHandler->applyFC(idx, CellWidth, SourceCells[idx]->getTransformedMultipole(idxRhs),
+                              TransformedLocalExpansion);
+
+
+        }
+      }
+    }
+  }
+
+  //	void M2L(CellClass* const FRestrict TargetCell,
+  //           const CellClass* SourceCells[343],
+  //           const int NumSourceCells,
+  //           const int TreeLevel) const
+  //	{
+  //		const unsigned int rank = M2LHandler.getRank();
+  //		FBlas::scal(343*rank, FReal(0.), MultipoleExpansion);
+  //		const FReal CellWidth(BoxWidth / FReal(FMath::pow(2, TreeLevel)));
+  //		for (int idx=0; idx<343; ++idx)
+  //			if (SourceCells[idx])
+  //				FBlas::copy(rank, const_cast<FReal *const>(SourceCells[idx]->getMultipole())+AbstractBaseClass::nnodes,
+  //										MultipoleExpansion+idx*rank);
+  //
+  //		M2LHandler->applyC(CellWidth, MultipoleExpansion, TargetCell->getLocal() + AbstractBaseClass::nnodes);
+  //	}
+
+
+  void L2L(const CellClass* const FRestrict ParentCell,
+           CellClass* FRestrict *const FRestrict ChildCells,
+           const int /*TreeLevel*/)
+  {
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+
+      // 1) Apply Inverse Discete Fourier Transform
+      M2LHandler->unapplyZeroPaddingAndDFT(ParentCell->getTransformedLocal(idxRhs),
+                                           const_cast<CellClass*>(ParentCell)->getLocal(idxRhs));
+      // 2) apply Sx
+      for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex){
+        if (ChildCells[ChildIndex]){
+          AbstractBaseClass::Interpolator->applyL2L(ChildIndex, ParentCell->getLocal(idxRhs), ChildCells[ChildIndex]->getLocal(idxRhs));
+        }
+      }
+    }
+  }
+
+  void L2P(const CellClass* const LeafCell,
+           ContainerClass* const TargetParticles)
+  {
+    const FPoint LeafCellCenter(AbstractBaseClass::getLeafCellCenter(LeafCell->getCoordinate()));
+
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+
+      // 1)  Apply Inverse Discete Fourier Transform
+      M2LHandler->unapplyZeroPaddingAndDFT(LeafCell->getTransformedLocal(idxRhs), 
+                                           const_cast<CellClass*>(LeafCell)->getLocal(idxRhs));
+
+      // 2.a) apply Sx
+      AbstractBaseClass::Interpolator->applyL2P(LeafCellCenter, AbstractBaseClass::BoxWidthLeaf,
+                                                LeafCell->getLocal(idxRhs), TargetParticles);
+
+      // 2.b) apply Px (grad Sx)
+      AbstractBaseClass::Interpolator->applyL2PGradient(LeafCellCenter, AbstractBaseClass::BoxWidthLeaf,
+                                                        LeafCell->getLocal(idxRhs), TargetParticles);
+
+    }
+  }
+
+  void P2P(const FTreeCoordinate& /* LeafCellCoordinate */, // needed for periodic boundary conditions
+           ContainerClass* const FRestrict TargetParticles,
+           const ContainerClass* const FRestrict /*SourceParticles*/,
+           ContainerClass* const NeighborSourceParticles[27],
+           const int /* size */)
+  {
+    DirectInteractionComputer<MatrixKernelClass::Identifier, NVALS>::P2P(TargetParticles,NeighborSourceParticles);
+  }
+
+
+  void P2PRemote(const FTreeCoordinate& /*inPosition*/,
+                 ContainerClass* const FRestrict inTargets, const ContainerClass* const FRestrict /*inSources*/,
+                 ContainerClass* const inNeighbors[27], const int /*inSize*/){
+    DirectInteractionComputer<MatrixKernelClass::Identifier, NVALS>::P2PRemote(inTargets,inNeighbors,27);
+  }
+
+};
+
+
+#endif //FUNIFKERNEL_HPP
+
+// [--END--]
diff --git a/Src/Kernels/Uniform/FUnifM2LHandler.hpp b/Src/Kernels/Uniform/FUnifM2LHandler.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a6c2fee98bfbca66077d1e28c412dbcf07769cc
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifM2LHandler.hpp
@@ -0,0 +1,481 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.  
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info". 
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FUNIFM2LHANDLER_HPP
+#define FUNIFM2LHANDLER_HPP
+
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <typeinfo>
+
+#include "../../Utils/FBlas.hpp"
+#include "../../Utils/FTic.hpp"
+#include "../../Utils/FDft.hpp"
+
+#include "../../Utils/FComplexe.hpp"
+
+
+#include "./FUnifTensor.hpp"
+
+
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * @class FUnifM2LHandler
+ * Please read the license
+ *
+ * This class precomputes and efficiently stores the M2L operators
+ * \f$[K_1,\dots,K_{316}]\f$ for all (\f$7^3-3^3 = 316\f$ possible interacting
+ * cells in the far-field) interactions for the Lagrange interpolation
+ * approach. The resulting Lagrange operators have a Circulant Toeplitz 
+ * structure and can be applied efficiently in Fourier Space. Hence, the
+ * originally \f$K_t\f$ of size \f$\ell^3\times\ell^3\f$ times \f$316\f$ for
+ * all interactions is reduced to \f$316\f$ \f$C_t\f$, each of size \f$2\ell-1\f$.
+ *
+ * @tparam ORDER interpolation order \f$\ell\f$
+ */
+template <int ORDER, class MatrixKernelClass>
+class FUnifM2LHandler : FNoCopyable
+{
+	enum {order = ORDER,
+				nnodes = TensorTraits<ORDER>::nnodes,
+				ninteractions = 316}; // 7^3 - 3^3 (max num cells in far-field)
+
+	const MatrixKernelClass MatrixKernel;
+
+	FComplexe *FC;
+
+  unsigned int rc; 
+  unsigned int opt_rc; 
+
+  typedef FUnifTensor<ORDER> TensorType;
+  unsigned int node_diff[nnodes*nnodes]; // PB: used in applyC to get id=i-j
+
+  //  FDft Dft; // Direct Discrete Fourier Transformator
+  FFft Dft; // Fast Discrete Fourier Transformator
+
+	static const std::string getFileName()
+	{
+		const char precision_type = (typeid(FReal)==typeid(double) ? 'd' : 'f');
+		std::stringstream stream;
+		stream << "m2l_k"<< MatrixKernelClass::Identifier << "_" << precision_type
+					 << "_o" << order << ".bin";
+		return stream.str();
+	}
+
+	
+public:
+	FUnifM2LHandler()
+		: MatrixKernel(), FC(NULL), 
+      rc((2*ORDER-1)*(2*ORDER-1)*(2*ORDER-1)),
+      opt_rc(rc/2+1),
+      Dft(rc) // initialize Discrete Fourier Transformator
+	{    
+    // initialize root node ids
+    TensorType::setNodeIdsDiff(node_diff);
+
+    // for real valued kernel only n/2+1 complex values are stored 
+    // after performing the DFT (the rest is deduced by conjugation)
+
+  }
+
+	~FUnifM2LHandler()
+	{
+		if (FC != NULL) delete [] FC;
+	}
+
+	/**
+	 * Computes and sets the matrix \f$C_t\f$
+	 */
+	void ComputeAndSet()
+	{
+		// measure time
+		FTic time; time.tic();
+		// check if aready set
+    		if (FC) throw std::runtime_error("M2L operator already set");
+    // Compute matrix of interactions
+		Compute(FC);
+
+    // Compute memory usage
+    unsigned long sizeM2L = 343*opt_rc*sizeof(FComplexe);
+
+
+		// write info
+		std::cout << "Compute and Set M2L operators ("<< long(sizeM2L/**1e-6*/) <<" Bytes) in "
+							<< time.tacAndElapsed() << "sec."	<< std::endl;
+	}
+
+	/**
+	 * Computes, writes to binary file, reads it and sets the matrices \f$Y, C_t, B\f$
+	 */
+	void ComputeAndStoreInBinaryFileAndReadFromFileAndSet()
+	{
+		FUnifM2LHandler<ORDER,MatrixKernelClass>::ComputeAndStoreInBinaryFile();
+		this->ReadFromBinaryFileAndSet();
+	}
+
+	/**
+	 * Computes all \f$K_t\f$.
+	 *
+	 * @param[out] C matrix of size \f$r\times 316 r\f$ storing \f$[C_1,\dots,C_{316}]\f$
+	 */
+	static void Compute(FComplexe* &FC);
+
+	/**
+	 * Computes and stores the matrix \f$C_t\f$ in a binary
+	 * file
+	 */
+	static void ComputeAndStoreInBinaryFile();
+
+	/**
+	 * Reads the matrices \f$Y, C_t, B\f$ from the respective binary file
+	 */
+	void ReadFromBinaryFileAndSet();
+		
+
+  /**
+	 * Expands potentials \f$x+=IDFT(X)\f$ of a target cell. This operation can be
+	 * seen as part of the L2L operation.
+	 *
+	 * @param[in] X transformed local expansion of size \f$r\f$
+	 * @param[out] x local expansion of size \f$\ell^3\f$
+	 */
+  void unapplyZeroPaddingAndDFT(const FComplexe *const FX, FReal *const x) const
+  { 
+    FReal Px[rc];
+    FBlas::setzero(rc,Px);
+    // Apply forward Discrete Fourier Transform
+    Dft.applyIDFT(FX,Px);
+
+    // Unapply Zero Padding
+    for (unsigned int j=0; j<nnodes; ++j)
+      x[j]+=Px[node_diff[nnodes-j-1]];
+  }
+
+  /**
+	 * The M2L operation \f$X+=C_tY\f$ is performed in Fourier space by 
+   * application of the convolution theorem (i.e. \f$FX+=FC_t:FY\f$), 
+   * where \f$FY\f$ is the transformed multipole expansion and 
+   * \f$FX\f$ is the transformed local expansion, both padded with zeros and
+	 * of size \f$r_c=(2\times ORDER-1)^3\f$ or \f$r_c^{opt}=\frac{r_c}{2}+1\f$ 
+   * for real valued kernels. The index \f$t\f$ denotes the transfer vector 
+	 * of the target cell to the source cell.
+	 *
+	 * @param[in] transfer transfer vector
+	 * @param[in] FY transformed multipole expansion
+	 * @param[out] FX transformed local expansion
+	 * @param[in] CellWidth needed for the scaling of the compressed M2L operators which are based on a homogeneous matrix kernel computed for the reference cell width \f$w=2\f$, ie in \f$[-1,1]^3\f$.
+	 */
+  void applyFC(const unsigned int idx, FReal CellWidth,
+                  const FComplexe *const FY, FComplexe *const FX) const
+  {
+		const FReal scale(MatrixKernel.getScaleFactor(CellWidth));
+
+    FComplexe tmpFX; 
+
+    // Perform entrywise product manually
+    for (unsigned int j=0; j<opt_rc; ++j){
+      tmpFX=FC[idx*opt_rc + j];
+      tmpFX*=FY[j];
+      tmpFX*=scale;
+      FX[j]+=tmpFX;
+    }
+
+//    // Perform entrywise product using BLAS and MKL routines
+//    // PB: not necessary faster than the naive version
+//    FComplexe tmpFX[rc]; 
+//    FBlas::c_setzero(rc,reinterpret_cast<FReal*>(tmpFX));
+//    FMkl::c_had(rc,reinterpret_cast<const FReal* const>(FC + idx*rc),
+//                reinterpret_cast<const FReal* const>(FY),
+//                reinterpret_cast<FReal* const>(tmpFX));
+//    // Scale
+//    FBlas::c_axpy(rc,&scale,reinterpret_cast<FReal* const>(tmpFX),
+//                  reinterpret_cast<FReal* const>(FX));
+
+  }
+
+
+  /**
+	 * Transform densities \f$Y= DFT(y)\f$ of a source cell. This operation
+	 * can be seen as part of the M2M operation.
+	 *
+	 * @param[in] y multipole expansion of size \f$\ell^3\f$
+	 * @param[out] Y transformed multipole expansion of size \f$r\f$
+	 */
+  void applyZeroPaddingAndDFT(FReal *const y, FComplexe *const FY) const
+  {
+    FReal Py[rc];
+    FBlas::setzero(rc,Py);
+
+    FComplexe tmpFY[rc]; // not mandatory?
+
+    // Apply Zero Padding
+    for (unsigned int i=0; i<nnodes; ++i)
+      Py[node_diff[i*nnodes]]=y[i];
+
+    // Apply forward Discrete Fourier Transform
+    Dft.applyDFT(Py,tmpFY);
+
+    // not mandatory?
+    for (unsigned int j=0; j<rc; ++j) // could be opt_rc
+      FY[j]+=tmpFY[j];
+
+  }
+
+
+};
+
+
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// definition ////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+template <int ORDER, class MatrixKernelClass>
+void
+FUnifM2LHandler<ORDER, MatrixKernelClass>::Compute(FComplexe* &FC)
+{
+	// allocate memory and store compressed M2L operators
+  	if (FC) throw std::runtime_error("M2L operators are already set");
+
+	// interpolation points of source (Y) and target (X) cell
+	FPoint X[nnodes], Y[nnodes];
+	// set roots of target cell (X)
+	FUnifTensor<order>::setRoots(FPoint(0.,0.,0.), FReal(2.), X);
+	// init matrix kernel
+	const MatrixKernelClass MatrixKernel;
+
+	// allocate memory and compute 316 m2l operators
+	FReal *_C;
+	FComplexe *_FC;
+
+  // reduce storage from nnodes^2=order^6 to (2order-1)^3
+  const unsigned int rc = (2*order-1)*(2*order-1)*(2*order-1);
+	_C = new FReal [rc];
+	_FC = new FComplexe [rc * ninteractions];
+
+  // init Discrete Fourier Transformator
+//	FDft Dft(rc);
+	FFft Dft(rc);
+
+  // get first column of K via permutation
+  unsigned int perm[rc];
+  for(unsigned int p=0; p<rc; ++p){
+    // permutation to order WHILE computing the entire 1st row
+    if(p<rc-1) perm[p]=p+1;
+    else perm[p]=p+1-rc;
+    //    std::cout << "perm["<< p << "]="<< perm[p] << std::endl;
+  }
+
+	unsigned int counter = 0;
+  unsigned int li,lj, mi,mj, ni,nj;
+  unsigned int idi, idj, ido;
+	for (int i=-3; i<=3; ++i) {
+		for (int j=-3; j<=3; ++j) {
+			for (int k=-3; k<=3; ++k) {
+				if (abs(i)>1 || abs(j)>1 || abs(k)>1) {
+					// set roots of source cell (Y)
+					const FPoint cy(FReal(2.*i), FReal(2.*j), FReal(2.*k));
+					FUnifTensor<order>::setRoots(cy, FReal(2.), Y);
+					// evaluate m2l operator
+          ido=0;
+          for(unsigned int l=0; l<2*order-1; ++l)
+            for(unsigned int m=0; m<2*order-1; ++m)
+              for(unsigned int n=0; n<2*order-1; ++n){   
+          
+                // l=0:(2*order-1) => li-lj=-(order-1):(order-1)
+                // Convention:
+                // lj=order-1 & li=0:order-1 => li-lj=1-order:0
+                // lj=1 & li=0:order-1 => li-lj=1:order-1
+                if(l<order-1) lj=order-1; else lj=0;
+                if(m<order-1) mj=order-1; else mj=0;
+                if(n<order-1) nj=order-1; else nj=0;
+                li=(l-(order-1))+lj; mi=(m-(order-1))+mj; ni=(n-(order-1))+nj;
+                // deduce corresponding index of K[nnodes x nnodes] 
+                idi=li*order*order + mi*order + ni;
+                idj=lj*order*order + mj*order + nj;
+                
+                // store value at current position in C
+                // use permutation if DFT is used because 
+                // the storage of the first column is required
+                // i.e. C[0] C[rc-1] C[rc-2] .. C[1] < WRONG!
+                // i.e. C[rc-1] C[0] C[1] .. C[rc-2] < RIGHT!
+                //                _C[counter*rc + ido]
+                _C[perm[ido]]
+                  = MatrixKernel.evaluate(X[idi], Y[idj]);
+                ido++;
+              }
+
+          // Apply Discrete Fourier Transformation
+          Dft.applyDFT(_C,_FC+counter*rc);
+
+					// increment interaction counter
+					counter++;
+				}
+			}
+		}
+	}
+	if (counter != ninteractions)
+		throw std::runtime_error("Number of interactions must correspond to 316");
+
+  // Free _C
+	delete [] _C;
+
+	// store FC
+	counter = 0;
+  // reduce storage if real valued kernel
+  const unsigned int opt_rc = rc/2+1;
+  // allocate M2L
+	FC = new FComplexe[343 * opt_rc];
+
+	for (int i=-3; i<=3; ++i)
+		for (int j=-3; j<=3; ++j)
+			for (int k=-3; k<=3; ++k) {
+				const unsigned int idx = (i+3)*7*7 + (j+3)*7 + (k+3);
+				if (abs(i)>1 || abs(j)>1 || abs(k)>1) {
+					FBlas::c_copy(opt_rc, reinterpret_cast<FReal*>(_FC + counter*rc), 
+                        reinterpret_cast<FReal*>(FC + idx*opt_rc));
+//          for (unsigned int n=0; n<rc; ++n){
+//            FC[idx*rc+n]=_FC[counter*rc+n];
+//          }
+					counter++;
+				} else{ 
+          FBlas::c_setzero(opt_rc, reinterpret_cast<FReal*>(FC + idx*opt_rc));
+//          for (unsigned int n=0; n<rc; ++n){
+//            FC[idx*rc+n]=FComplexe(0.0,0.0);
+//          }
+        }
+      }
+
+	if (counter != ninteractions)
+		throw std::runtime_error("Number of interactions must correspond to 316");
+  delete [] _FC;  	
+}
+
+
+
+
+
+
+//template <int ORDER, class MatrixKernelClass>
+//void
+//FUnifM2LHandler<ORDER, MatrixKernelClass>::ComputeAndStoreInBinaryFile()
+//{
+//	// measure time
+//	FTic time; time.tic();
+//	// start computing process
+//	FReal *C;
+//	C = NULL;
+//	const unsigned int rc = Compute(C);
+//	// store into binary file
+//	const std::string filename(getFileName());
+//	std::ofstream stream(filename.c_str(),
+//											 std::ios::out | std::ios::binary | std::ios::trunc);
+//	if (stream.good()) {
+//		stream.seekp(0);
+//		// 1) write number of interpolation points (int)
+//		int _nnodes = nnodes;
+//		stream.write(reinterpret_cast<char*>(&_nnodes), sizeof(int));
+//		// 2) write 343 C (343 * rc * FReal)
+//		stream.write(reinterpret_cast<char*>(C), sizeof(FReal)*rc*343);
+//	} 	else throw std::runtime_error("File could not be opened to write");
+//	stream.close();
+//	// free memory
+//	if (C != NULL) delete [] C;
+//	// write info
+//	std::cout << "M2L operators (r2="<< rc 
+//            << ",nnodes2="<< nnodes*nnodes 
+//            <<") stored in binary file "	<< filename
+//						<< " in " << time.tacAndElapsed() << "sec."	<< std::endl;
+//}
+//
+//
+//template <int ORDER, class MatrixKernelClass>
+//void
+//FUnifM2LHandler<ORDER, MatrixKernelClass>::ReadFromBinaryFileAndSet()
+//{
+//	// measure time
+//	FTic time; time.tic();
+//	// start reading process
+//	if (C) throw std::runtime_error("M2L operator already set");
+//	const std::string filename(getFileName());
+//	std::ifstream stream(filename.c_str(),
+//											 std::ios::in | std::ios::binary | std::ios::ate);
+//	const std::ifstream::pos_type size = stream.tellg();
+//	if (size<=0) {
+//		std::cout << "Info: The requested binary file " << filename
+//							<< " does not yet exist. Compute it now ... " << std::endl;
+//		this->ComputeAndStoreInBinaryFileAndReadFromFileAndSet();
+//		return;
+//	} 
+//	if (stream.good()) {
+//		stream.seekg(0);
+//		// 1) read number of interpolation points (int)
+//		int npts;
+//		stream.read(reinterpret_cast<char*>(&npts), sizeof(int));
+//		if (npts!=nnodes) throw std::runtime_error("nnodes and npts do not correspond");
+//		// 2) write 343 C (343 * rank*rank * FReal)
+//		C = new FReal [343 * rc];
+//		stream.read(reinterpret_cast<char*>(C), sizeof(FReal)*rc*343);
+//	}	else throw std::runtime_error("File could not be opened to read");
+//	stream.close();
+//	// write info
+//	std::cout << "M2L operators (rc=" << rc 
+//            <<",nnodes2="<< nnodes*nnodes 
+//            << ") read from binary file "
+//						<< filename << " in " << time.tacAndElapsed() << "sec."	<< std::endl;
+//}
+
+/*
+unsigned int ReadRankFromBinaryFile(const std::string& filename)
+{
+	// start reading process
+	std::ifstream stream(filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
+	const std::ifstream::pos_type size = stream.tellg();
+	if (size<=0) throw std::runtime_error("The requested binary file does not exist.");
+	unsigned int rank = -1;
+	if (stream.good()) {
+		stream.seekg(0);
+		// 1) read number of interpolation points (int)
+		int npts;
+		stream.read(reinterpret_cast<char*>(&npts), sizeof(int));
+		// 2) read low rank (int)
+		stream.read(reinterpret_cast<char*>(&rank), sizeof(int));
+		return rank;
+	}	else throw std::runtime_error("File could not be opened to read");
+	stream.close();
+	return rank;
+}
+*/
+
+
+
+
+
+
+#endif // FUNIFM2LHANDLER_HPP
+
+// [--END--]
diff --git a/Src/Kernels/Uniform/FUnifRoots.cpp b/Src/Kernels/Uniform/FUnifRoots.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..644c2bc329d54d5c7f3ee2c6b1c5e771e9ad3c09
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifRoots.cpp
@@ -0,0 +1,134 @@
+#include "FUnifInterpolator.hpp" //PB: if include FUnifRoots.hpp then Error: "FReal is not a type"...
+
+// In case of multiple include of FUnifInterpolator.hpp, this has to be defined only once!!
+
+// order 2
+template<> const double FUnifRoots<2>::roots[] = {-1.,
+                                                  1.};
+
+// order 3
+template<> const double FUnifRoots<3>::roots[] = {-1.,
+                                                  0.0,
+                                                  1.};
+
+// order 4
+template<> const double FUnifRoots<4>::roots[] = {-1.,
+                                                  -0.333333333333333,
+                                                  0.333333333333333,
+                                                  1.};
+
+// order 5
+template<> const double FUnifRoots<5>::roots[] = {-1.,
+                                                  -0.5,
+                                                  0.,
+                                                  0.5,
+                                                  1.};
+
+// order 6
+template<> const double FUnifRoots<6>::roots[] = {-1.,
+                                                  -0.6,
+                                                  -0.2,
+                                                  0.2,
+                                                  0.6,
+                                                  1.};
+
+// order 7
+template<> const double FUnifRoots<7>::roots[] = {-1.,
+                                                  -0.666666666666666,
+                                                  -0.333333333333333,
+                                                  0.,
+                                                  0.333333333333333,
+                                                  0.666666666666666,
+                                                  1.};
+
+// order 8
+template<> const double FUnifRoots<8>::roots[] = {-1.,
+                                                  -0.714285714285714,
+                                                  -0.428571428571429,
+                                                  -0.142857142857143,
+                                                  0.142857142857143,
+                                                  0.428571428571429,
+                                                  0.714285714285714,
+                                                  1.};
+
+// order 9
+template<> const double FUnifRoots<9>::roots[] = {-1.,
+                                                  -0.75,
+                                                  -0.5,
+                                                  -0.25,
+                                                  0.0,
+                                                  0.25,
+                                                  0.5,
+                                                  0.75,
+                                                  1.};
+
+// order 10
+template<> const double FUnifRoots<10>::roots[] = {-1.,
+                                                   -0.777777777777777,
+                                                   -0.555555555555555,
+                                                   -0.333333333333333,
+                                                   -0.111111111111111,
+                                                   0.111111111111111,
+                                                   0.333333333333333,
+                                                   0.555555555555555,
+                                                   0.777777777777777,
+                                                   1.};
+
+// order 11
+template<> const double FUnifRoots<11>::roots[] = {-1.,
+                                                   -0.888888888888888,
+                                                   -0.666666666666666,
+                                                   -0.444444444444444,
+                                                   -0.222222222222222,
+                                                   0.0,
+                                                   0.2222222222222222,
+                                                   0.444444444444444,
+                                                   0.666666666666666,
+                                                   0.888888888888888,
+                                                   1.};
+
+// order 12
+template<> const double FUnifRoots<12>::roots[] = {-1.,
+                                                   -0.818181818181818,
+                                                   -0.636363636363636,
+                                                   -0.454545454545455,
+                                                   -0.272727272727273,
+                                                   -0.090909090909091,
+                                                   0.090909090909091,
+                                                   0.272727272727273,
+                                                   0.454545454545455,
+                                                   0.636363636363636,
+                                                   0.818181818181818,
+                                                   1.};
+
+
+// order 13
+template<> const double FUnifRoots<13>::roots[] = {-1.,
+                                                   -0.833333333333333,
+                                                   -0.666666666666666,
+                                                   -0.5,
+                                                   -0.333333333333333,
+                                                   -0.166666666666666,
+                                                   0.0,
+                                                   0.166666666666666,
+                                                   0.333333333333333,
+                                                   0.5,
+                                                   0.666666666666666,
+                                                   0.833333333333333,
+                                                   1.};
+
+// order 14
+template<> const double FUnifRoots<14>::roots[] = {-1.,
+                                                   -0.846153846153846,
+                                                   -0.692307692307692,
+                                                   -0.538461538461538,
+                                                   -0.384615384615385,
+                                                   -0.230769230769231,
+                                                   -0.076923076923077,
+                                                   0.076923076923077,
+                                                   0.230769230769231,
+                                                   0.384615384615385,
+                                                   0.538461538461538,
+                                                   0.692307692307692,
+                                                   0.846153846153846,
+                                                   1.};
diff --git a/Src/Kernels/Uniform/FUnifRoots.hpp b/Src/Kernels/Uniform/FUnifRoots.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3897e8efe5e9976b8577c8139993059ecd4a2ea
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifRoots.hpp
@@ -0,0 +1,159 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FUNIFROOTS_HPP
+#define FUNIFROOTS_HPP
+
+#include <cmath>
+#include <limits>
+#include <cassert>
+
+#include "../../Utils/FNoCopyable.hpp"
+
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * Please read the license
+ */
+
+/**
+ * @class FUnifRoots
+ *
+ * The class @p FUnifRoots provides the equispaced roots of order \f$\ell\f$
+ * and the Lagrange polynomials \f$L_n(x)\f$.
+ *
+ * @tparam ORDER interpolation order \f$\ell\f$
+ */
+template <int ORDER>
+struct FUnifRoots : FNoCopyable
+{
+  enum {order = ORDER}; //!< interpolation order
+
+  /**
+   * Lagrange roots in [-1,1] computed as \f$\bar x_n =
+   * -1 + 2\frac{n-1}{\ell}\f$ for \f$n=1,\dots,\ell\f$
+   */
+  const static double roots[];
+
+  /**
+   * Lagrange polynomials \f$ L_n(x) = \Pi_{m=1 \atop m\neq n}^\ell \frac{x-\bar x_m}{\bar x_n-\bar x_m} \f$
+   *
+   * @param[in] n index
+   * @param[in] x coordinate in [-1,1]
+   * @return function value
+   */
+  static FReal L(const unsigned int n, FReal x)
+  {
+    //std::cout << x << std::endl;
+    assert(std::fabs(x)-1.<10.*std::numeric_limits<FReal>::epsilon());
+    if (std::fabs(x)>1.) {
+      x = (x > FReal( 1.) ? FReal( 1.) : x);
+      x = (x < FReal(-1.) ? FReal(-1.) : x);
+    }
+
+    FReal tmpL=FReal(1.);
+    for(unsigned int m=0;m<order;++m){
+      if(m!=n)
+        tmpL *= (x-FUnifRoots<order>::roots[m])/(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[m]);
+      
+    }
+
+    return FReal(tmpL);
+  }
+
+
+  /**
+   * For the derivation of the Lagrange polynomials
+   * \f$ \frac{\mathrm{d} L_n(x)}{\mathrm{d}x} = ... \f$
+   *
+   * @param[in] n index
+   * @param[in] x coordinate in [-1,1]
+   * @return function value
+   */
+  static FReal dL(const unsigned int n, FReal x)
+  {
+    assert(std::fabs(x)-1.<10.*std::numeric_limits<FReal>::epsilon());
+    if (std::fabs(x)>1.) {
+      x = (x > FReal( 1.) ? FReal( 1.) : x);
+      x = (x < FReal(-1.) ? FReal(-1.) : x);
+    }
+
+    FReal tmpdL;
+    FReal dL=FReal(0.);
+
+    for(unsigned int p=0;p<order;++p){
+      if(p!=n){
+        tmpdL=1./(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[p]);
+
+      for(unsigned int m=0;m<order;++m){
+        if(m!=n && m!=p)
+          tmpdL *= (x-FUnifRoots<order>::roots[m])/(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[m]);
+
+      }// m
+
+      dL+=tmpdL;
+      
+      }//endif
+
+    }// p
+
+    return FReal(dL);
+
+  }
+};
+
+// We declare the roots here only once Please look to .cpp for definitions
+
+// order 2
+template<> const double FUnifRoots<2>::roots[];
+
+// order 3
+template<> const double FUnifRoots<3>::roots[];
+
+// order 4
+template<> const double FUnifRoots<4>::roots[];
+
+// order 5
+template<> const double FUnifRoots<5>::roots[];
+
+// order 6
+template<> const double FUnifRoots<6>::roots[];
+
+// order 7
+template<> const double FUnifRoots<7>::roots[];
+
+// order 8
+template<> const double FUnifRoots<8>::roots[];
+
+// order 9
+template<> const double FUnifRoots<9>::roots[];
+
+// order 10
+template<> const double FUnifRoots<10>::roots[];
+
+// order 11
+template<> const double FUnifRoots<11>::roots[];
+
+// order 12
+template<> const double FUnifRoots<12>::roots[];
+
+// order 13
+template<> const double FUnifRoots<13>::roots[];
+
+// order 14
+template<> const double FUnifRoots<14>::roots[];
+
+
+#endif
diff --git a/Src/Kernels/Uniform/FUnifTensor.hpp b/Src/Kernels/Uniform/FUnifTensor.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..004ca44461a77e3b455c0cc627f09f26f2c3331c
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifTensor.hpp
@@ -0,0 +1,76 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FUNIFTENSOR_HPP
+#define FUNIFTENSOR_HPP
+
+#include "../../Utils/FMath.hpp"
+
+#include "./FUnifRoots.hpp"
+#include "./../Interpolation/FInterpTensor.hpp"
+
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * Please read the license
+ */
+
+
+
+/**
+ * @class FUnifTensor
+ *
+ * The class FUnifTensor provides function considering the tensor product
+ * interpolation.
+ *
+ * @tparam ORDER interpolation order \f$\ell\f$
+ */
+template <int ORDER>
+class FUnifTensor : public FInterpTensor<ORDER,FUnifRoots<ORDER>>
+{
+  enum {nnodes = TensorTraits<ORDER>::nnodes};
+  typedef FUnifRoots<ORDER> BasisType;
+  typedef FInterpTensor<ORDER,BasisType> ParentTensor;
+
+ public:
+
+  /**
+   * Sets the diff of ids of the coordinates of all \f$\ell^6\f$ interpolation
+   * nodes duet
+   *
+   * @param[out] NodeIdsDiff diff of ids of coordinates of interpolation nodes
+   */
+  static
+    void setNodeIdsDiff(unsigned int NodeIdsDiff[nnodes*nnodes])
+  {
+    unsigned int node_ids[nnodes][3];
+    ParentTensor::setNodeIds(node_ids);
+
+    for (unsigned int i=0; i<nnodes; ++i) {
+      for (unsigned int j=0; j<nnodes; ++j) {
+        // 0 <= id < 2*ORDER-1
+        unsigned int idl = node_ids[i][0]-node_ids[j][0]+ORDER-1;
+        unsigned int idm = node_ids[i][1]-node_ids[j][1]+ORDER-1;
+        unsigned int idn = node_ids[i][2]-node_ids[j][2]+ORDER-1;
+        NodeIdsDiff[i*nnodes+j]
+          = idn*(2*ORDER-1)*(2*ORDER-1) + idm*(2*ORDER-1) + idl;
+      }
+    }
+  }
+
+};
+
+
+#endif /*FUNIFTENSOR_HPP*/
diff --git a/Src/Utils/FBlas.hpp b/Src/Utils/FBlas.hpp
index 1a2457e84b57df4218751a69f35c8f44f39b2cfd..01fea5dd01a4e61e9ce93e145a29955ab127eae2 100755
--- a/Src/Utils/FBlas.hpp
+++ b/Src/Utils/FBlas.hpp
@@ -71,6 +71,16 @@ extern "C"
 	void dorgqr_(const unsigned*, const unsigned*, const unsigned*,
 							 double*, const unsigned*, double*, double*, const unsigned*, int*);
 
+#ifdef ScalFMM_USE_MKL_AS_BLAS
+  // mkl: hadamard product is not implemented in mkl_blas
+  void vdmul_(const unsigned* n, const double*, const double*, double*);
+  void vsmul_(const unsigned* n, const float*, const float*, float*);
+  void vzmul_(const unsigned* n, const double*, const double*, double*);
+  void vcmul_(const unsigned* n, const float*, const float*, float*);
+#else
+  // TODO create interface for hadamard product in case an external Blas is used
+#endif
+
 	// single //////////////////////////////////////////////////////////
 	// blas 1
 	float sdot_(const unsigned*, const float*, const unsigned*,	const float*, const unsigned*);
@@ -127,18 +137,59 @@ extern "C"
 	void cgeqrf_(const unsigned*, const unsigned*, float*, const unsigned*,
 							 float*, float*, const unsigned*, int*);
 
+
+
+}
+
+
+// Hadamard (i.e. entrywise) product: 
+// NB: The following optimized routines are currently not used 
+// since they have not proved their efficiency in comparison 
+// with a naive application of the entrywise product.
+#ifdef ScalFMM_USE_MKL_AS_BLAS
+
+//#include "mkl_vml.h" 
+
+namespace FMkl{
+
+  // Hadamard product: dest[i]=a[i]*b[i]
+  inline void had(const unsigned n, const double* const a, const double* const b, double* const dest)
+  { vdmul_(&n, a, b, dest); }
+  inline void had(const unsigned n, const float* const a, const float* const b, float* const dest)
+  { vsmul_(&n, a, b, dest); }
+  inline void c_had(const unsigned n, const double* const a, const double* const b, double* const dest)
+  { vzmul_(&n, a, b, dest); }
+  inline void c_had(const unsigned n, const float* const a, const float* const b, float* const dest)
+  { vcmul_(&n, a, b, dest); }
+
 }
 
+#else
+
+namespace FBlas{
+
+  // TODO create interface for hadamard product in case an external Blas is used
+
+}
+
+#endif
+// end Hadamard product
+
+
 
 namespace FBlas {
 
 	// copy
 	inline void copy(const unsigned n, double* orig, double* dest)
 	{	dcopy_(&n, orig, &N_ONE, dest, &N_ONE);	}
+	inline void copy(const unsigned n, const double* orig, double* dest)
+	{	dcopy_(&n, orig, &N_ONE, dest, &N_ONE);	}
 	inline void copy(const unsigned n, float* orig, float* dest)
 	{	scopy_(&n, orig, &N_ONE, dest, &N_ONE);	}
 	inline void c_copy(const unsigned n, double* orig, double* dest)
 	{	zcopy_(&n, orig, &N_ONE, dest, &N_ONE);	}
+	inline void c_copy(const unsigned n, const double* orig, double* dest)
+	{	zcopy_(&n, orig, &N_ONE, dest, &N_ONE);	}
 	inline void c_copy(const unsigned n, float* orig, float* dest)
 	{	ccopy_(&n, orig, &N_ONE, dest, &N_ONE);	}
 
diff --git a/Src/Utils/FDft.hpp b/Src/Utils/FDft.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fa61ade230f1aa6cfad0b00f92312ee9f0ef2615
--- /dev/null
+++ b/Src/Utils/FDft.hpp
@@ -0,0 +1,263 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.  
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info". 
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FDFT_HPP
+#define FDFT_HPP
+
+#include <iostream>
+#include <stdlib.h>
+
+// include fftw3 (specify path in cmake)
+// if MKL: path/to/mkl/include/fftw/fftw3.h 
+// elseif libfftw_dev: usr/include/fftw3.h
+#include <fftw3.h>
+
+#include "FGlobal.hpp"
+#include "FComplexe.hpp"
+
+#include "FMath.hpp"
+
+#include "FTic.hpp"
+
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * @class FDft and @class FFft
+ * Please read the license
+ *
+ * These classes handle the forward and backward Discete Fourier Transform 
+ * (DFT).
+ * @class FDft implements a direct method while @class FFft uses the Fast 
+ * Fourier Transform (FFT). The FFT algorithm can either be provided by the 
+ * FFTW(3) library itself or a version that is wrapped in Intel MKL.
+ *
+ * The aim of writing these specific classes is to allow further customization 
+ * of the DFT such as implementing a tensorial variant, a weighted variant 
+ * or any other feature.
+ *
+ * TODO: some copies might slow down the routines, this needs to be optimized.
+ * 
+ */
+
+
+/**
+ * @class FDft
+ *
+ * @tparam nsteps number of sampled values \f$N\f$
+ */
+
+class FDft
+{
+
+  FReal* fftR_;
+  FComplexe* fftC_;
+
+  FReal *cosRS_, *sinRS_; 
+
+private:
+  unsigned int nsteps_;
+
+public:
+
+  FDft(const unsigned int nsteps)
+  : nsteps_(nsteps)
+  {
+    fftR_ = new FReal[nsteps_];
+    fftC_ = new FComplexe[nsteps_];
+
+    // Beware this is extremely HEAVY to store!!! => Use FDft only for debug!
+    cosRS_ = new FReal[nsteps_*nsteps_];
+    sinRS_ = new FReal[nsteps_*nsteps_];
+
+    for(unsigned int r=0; r<nsteps_; ++r)
+      for(unsigned int s=0; s<nsteps_; ++s){
+        FReal thetaRS = 2*M_PI*r*s/nsteps_;
+        cosRS_[r*nsteps_+s]=FMath::Cos(thetaRS);
+        sinRS_[r*nsteps_+s]=FMath::Sin(thetaRS);
+      }
+  }
+
+  virtual ~FDft()
+  {
+    delete [] fftR_;
+    delete [] fftC_;
+    delete [] cosRS_;
+    delete [] sinRS_;
+  }
+
+  void applyDFT(const FReal* sampledData,
+                FComplexe* transformedData) const
+  {
+//    FTic time;
+
+    // read sampled data
+//    std::cout<< "copy(";
+//    time.tic();
+    FBlas::c_setzero(nsteps_,reinterpret_cast<FReal*>(fftC_));
+    FBlas::copy(nsteps_, sampledData,fftR_);
+//    std::cout << time.tacAndElapsed() << ")";
+
+//    std::cout<< " - exe(";
+//    time.tic();
+    for(unsigned int r=0; r<nsteps_; ++r)
+      for(unsigned int s=0; s<nsteps_; ++s){
+        fftC_[r] += FComplexe(fftR_[s]*cosRS_[r*nsteps_+s], 
+                              -fftR_[s]*sinRS_[r*nsteps_+s]);
+      }
+//    std::cout << time.tacAndElapsed() << ")";
+
+    // write transformed data
+//    std::cout<< " - copy(";
+//    time.tic();
+    FBlas::c_copy(nsteps_,reinterpret_cast<FReal*>(fftC_),
+                  reinterpret_cast<FReal*>(transformedData));
+//    std::cout << time.tacAndElapsed() << ") ";
+
+  }
+
+
+  void applyIDFT(const FComplexe* transformedData,
+                 FReal* sampledData) const
+  {
+    // read transformed data
+    FBlas::setzero(nsteps_,fftR_);
+    FBlas::c_copy(nsteps_,reinterpret_cast<const FReal*>(transformedData),
+                  reinterpret_cast<FReal*>(fftC_));
+
+    for(unsigned int r=0; r<nsteps_; ++r){
+      for(unsigned int s=0; s<nsteps_; ++s){
+        fftR_[r] += fftC_[s].getReal()*cosRS_[r*nsteps_+s]
+          -fftC_[s].getImag()*sinRS_[r*nsteps_+s];
+      }
+      fftR_[r]*=1./nsteps_;
+    }  
+
+    // write sampled data
+    FBlas::copy(nsteps_,fftR_,sampledData);
+
+  }
+
+
+};
+
+
+
+/**
+ * @class FFft
+ *
+ * @tparam nsteps number of sampled values \f$N\f$
+ */
+
+class FFft
+{
+  // arrays
+  FReal* fftR_;
+  FComplexe* fftC_;
+
+  // plans
+  fftw_plan plan_c2r_; // backward FFT plan
+  fftw_plan plan_r2c_; // forward FFT plan 
+
+private:
+  unsigned int nsteps_;
+
+public:
+
+  FFft(const unsigned int nsteps)
+  : nsteps_(nsteps)
+  {
+    // allocate arrays
+    fftR_ = (FReal*) fftw_malloc(sizeof(FReal) * nsteps_);
+    fftC_ = (FComplexe*) fftw_malloc(sizeof(FComplexe) * nsteps_);
+
+    // fftw plans
+    plan_c2r_ =
+      fftw_plan_dft_c2r_1d(nsteps_, 
+                           reinterpret_cast<fftw_complex*>(fftC_),
+                           fftR_, 
+                           FFTW_MEASURE);// TODO: test FFTW_ESTIMATE
+    plan_r2c_ =
+      fftw_plan_dft_r2c_1d(nsteps_, 
+                           fftR_, 
+                           reinterpret_cast<fftw_complex*>(fftC_), 
+                           FFTW_MEASURE);
+
+
+  }
+
+  virtual ~FFft()
+  {
+    fftw_destroy_plan(plan_c2r_);
+    fftw_destroy_plan(plan_r2c_);
+    fftw_free(fftR_);
+    fftw_free(fftC_);
+  }
+
+  void applyDFT(const FReal* sampledData,
+                FComplexe* transformedData) const
+  {
+//    FTic time;
+
+    // read sampled data
+//    std::cout<< "copy(";
+//    time.tic();
+    FBlas::c_setzero(nsteps_,reinterpret_cast<FReal*>(fftC_));
+    FBlas::copy(nsteps_, sampledData,fftR_);
+//    std::cout << time.tacAndElapsed() << ")";
+
+    // perform fft
+//    std::cout<< " - exe(";
+//    time.tic();
+    fftw_execute( plan_r2c_ );
+//    std::cout << time.tacAndElapsed() << ")";
+
+    // write transformed data
+//    std::cout<< " - copy(";
+//    time.tic();
+//    FBlas::c_copy(nsteps_,reinterpret_cast<FReal*>(fftC_),
+//                  reinterpret_cast<FReal*>(transformedData));
+
+    for(unsigned int s=0; s<nsteps_; ++s)
+      transformedData[s]=fftC_[s];
+
+//    std::cout << time.tacAndElapsed() << ") ";
+
+  }
+
+
+  void applyIDFT(const FComplexe* transformedData,
+                 FReal* sampledData) const
+  {
+    // read transformed data
+    FBlas::setzero(nsteps_,fftR_);
+    FBlas::c_copy(nsteps_,reinterpret_cast<const FReal*>(transformedData),
+                  reinterpret_cast<FReal*>(fftC_));
+
+    // perform ifft
+    fftw_execute( plan_c2r_ );
+
+    for(unsigned int s=0; s<nsteps_; ++s)
+      fftR_[s]/=nsteps_; // the fft from fftw is not scaled !!!!!!!!
+
+    // write sampled data
+    FBlas::copy(nsteps_,fftR_,sampledData);
+
+  }
+
+
+};
+
+#endif /* FDFT_HPP */
+
diff --git a/Tests/Kernels/testChebAlgorithm.cpp b/Tests/Kernels/testChebAlgorithm.cpp
index 1766b9f74e9185c6d2aa274a390fd1a8e61a91bb..e9e5a26dce0e4e4f568c1d98a484557c5dfcbd72 100755
--- a/Tests/Kernels/testChebAlgorithm.cpp
+++ b/Tests/Kernels/testChebAlgorithm.cpp
@@ -28,12 +28,12 @@
 
 
 #include "../../Src/Kernels/Chebyshev/FChebCell.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebSymKernel.hpp"
 
 #include "../../Src/Components/FSimpleLeaf.hpp"
-#include "../../Src/Kernels/P2P/FP2PParticleContainer.hpp"
+#include "../../Src/Kernels/P2P/FP2PParticleContainerIndexed.hpp"
 
 #include "../../Src/Utils/FParameters.hpp"
 #include "../../Src/Utils/FMemUtils.hpp"
@@ -44,273 +44,313 @@
 #include "../../Src/Core/FFmmAlgorithm.hpp"
 #include "../../Src/Core/FFmmAlgorithmThread.hpp"
 
-
+/**
+ * This program runs the FMM Algorithm with the Chebyshev kernel and compares the results with a direct computation.
+ */
 
 // Simply create particles and try the kernels
 int main(int argc, char* argv[])
 {
-	const char* const filename       = FParameters::getStr(argc,argv,"-f", "../Data/test20k.fma");
-	const unsigned int TreeHeight    = FParameters::getValue(argc, argv, "-h", 5);
-	const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2);
-	const unsigned int NbThreads     = FParameters::getValue(argc, argv, "-t", 1);
+  const char* const filename       = FParameters::getStr(argc,argv,"-f", "../Data/test20k.fma");
+  const unsigned int TreeHeight    = FParameters::getValue(argc, argv, "-h", 5);
+  const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2);
+  const unsigned int NbThreads     = FParameters::getValue(argc, argv, "-t", 1);
+
+#ifdef _OPENMP
+  omp_set_num_threads(NbThreads);
+  std::cout << "\n>> Using " << omp_get_max_threads() << " threads.\n" << std::endl;
+#else
+  std::cout << "\n>> Sequential version.\n" << std::
+#endif
+
+  // init timer
+  FTic time;
+
+  // init particles position and physical value
+  struct TestParticle{
+    FPoint position;
+    FReal forces[3];
+    FReal physicalValue;
+    FReal potential;
+  };
+
+  // open particle file
+  FFmaScanfLoader loader(filename);
+  if(!loader.isOpen()) throw std::runtime_error("Particle file couldn't be opened!");
+
+  TestParticle* const particles = new TestParticle[loader.getNumberOfParticles()];
+  for(int idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
+    FPoint position;
+    FReal physicalValue = 0.0;
+    loader.fillParticle(&position,&physicalValue);
+    // get copy
+    particles[idxPart].position       = position;
+    particles[idxPart].physicalValue  = physicalValue;
+    particles[idxPart].potential      = 0.0;
+    particles[idxPart].forces[0]      = 0.0;
+    particles[idxPart].forces[1]      = 0.0;
+    particles[idxPart].forces[2]      = 0.0;
+  }
+
+  ////////////////////////////////////////////////////////////////////
+
+  { // begin direct computation
+
+    time.tic();
+    {
+      for(int idxTarget = 0 ; idxTarget < loader.getNumberOfParticles() ; ++idxTarget){
+        for(int idxOther = idxTarget + 1 ; idxOther < loader.getNumberOfParticles() ; ++idxOther){
+          FP2P::MutualParticles(particles[idxTarget].position.getX(), particles[idxTarget].position.getY(),
+                                particles[idxTarget].position.getZ(), particles[idxTarget].physicalValue,
+                                &particles[idxTarget].forces[0], &particles[idxTarget].forces[1],
+                                &particles[idxTarget].forces[2], &particles[idxTarget].potential,
+                                particles[idxOther].position.getX(), particles[idxOther].position.getY(),
+                                particles[idxOther].position.getZ(), particles[idxOther].physicalValue,
+                                &particles[idxOther].forces[0], &particles[idxOther].forces[1],
+                                &particles[idxOther].forces[2], &particles[idxOther].potential);
+        }
+      }
+    }
+    time.tac();
+    printf("Elapsed Time for direct computation: %f\n",time.elapsed());
 
-	const unsigned int ORDER = 9;
-	const FReal epsilon = FReal(1e-9);
+  } // end direct computation
 
-	// init timer
-	FTic time;
+  ////////////////////////////////////////////////////////////////////
 
+  {	// begin Chebyshev kernel
 
+    // accuracy
+    const unsigned int ORDER = 7;
+    const FReal epsilon = FReal(1e-7);
     // typedefs
-    typedef FP2PParticleContainer ContainerClass;
+    typedef FP2PParticleContainerIndexed ContainerClass;
     typedef FSimpleLeaf< ContainerClass >  LeafClass;
-	//typedef FChebMatrixKernelLJ MatrixKernelClass;
-	typedef FChebMatrixKernelR MatrixKernelClass;
-	typedef FChebCell<ORDER> CellClass;
+    //typedef FInterpMatrixKernelLJ MatrixKernelClass;
+    typedef FInterpMatrixKernelR MatrixKernelClass;
+    typedef FChebCell<ORDER> CellClass;
     typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
-	//typedef FChebKernel<ParticleClass,CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
+    //typedef FChebKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
     typedef FChebSymKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
-	//typedef FFmmAlgorithm<OctreeClass,ParticleClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
-    typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
-
-
-	// What we do //////////////////////////////////////////////////////
-	std::cout << ">> Testing the Chebyshev interpolation base FMM algorithm.\n";
-	
-	// open particle file
-    FFmaScanfLoader loader(filename);
-	//FFmaBinLoader<ParticleClass> loader(filename);
-	if(!loader.isOpen()) throw std::runtime_error("Particle file couldn't be opened!");
-	
-	// init oct-tree
-	OctreeClass tree(TreeHeight, SubTreeHeight, loader.getBoxWidth(), loader.getCenterOfBox());
-
-	// -----------------------------------------------------
-	std::cout << "Creating and inserting " << loader.getNumberOfParticles()
-						<< " particles in a octree of height " << TreeHeight
-						<< " ..." << std::endl;
-	time.tic();
-
-    {
-        FPoint particlePosition;
-        FReal physicalValue = 0.0;
-        for(int idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
-            loader.fillParticle(&particlePosition,&physicalValue);
-            tree.insert(particlePosition, physicalValue);
-        }
+    typedef FFmmAlgorithm<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
+    //  typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
+
+    // init oct-tree
+    OctreeClass tree(TreeHeight, SubTreeHeight, loader.getBoxWidth(), loader.getCenterOfBox());
+
+
+    { // -----------------------------------------------------
+      std::cout << "Creating & Inserting " << loader.getNumberOfParticles()
+                << " particles ..." << std::endl;
+      std::cout << "\tHeight : " << TreeHeight << " \t sub-height : " << SubTreeHeight << std::endl;
+      time.tic();
+
+      for(int idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
+        // put in tree
+        tree.insert(particles[idxPart].position, idxPart, particles[idxPart].physicalValue);
+      }
+
+      time.tac();
+      std::cout << "Done  " << "(@Creating and Inserting Particles = "
+                << time.elapsed() << "s)." << std::endl;
+    } // -----------------------------------------------------
+
+    { // -----------------------------------------------------
+      std::cout << "\nChebyshev FMM (ORDER="<< ORDER << ",EPS="<< epsilon <<") ... " << std::endl;
+      time.tic();
+      KernelClass kernels(TreeHeight, loader.getCenterOfBox(), loader.getBoxWidth(), epsilon);
+      FmmClass algorithm(&tree, &kernels);
+      algorithm.execute();
+      time.tac();
+      std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
+    } // -----------------------------------------------------
+
+
+    { // -----------------------------------------------------
+      std::cout << "\nError computation ... " << std::endl;
+      FMath::FAccurater potentialDiff;
+      FMath::FAccurater fx, fy, fz;
+      { // Check that each particle has been summed with all other
+    
+        tree.forEachLeaf([&](LeafClass* leaf){
+            const FReal*const potentials = leaf->getTargets()->getPotentials();
+            const FReal*const forcesX = leaf->getTargets()->getForcesX();
+            const FReal*const forcesY = leaf->getTargets()->getForcesY();
+            const FReal*const forcesZ = leaf->getTargets()->getForcesZ();
+            const int nbParticlesInLeaf = leaf->getTargets()->getNbParticles();
+            const FVector<int>& indexes = leaf->getTargets()->getIndexes();
+    
+            for(int idxPart = 0 ; idxPart < nbParticlesInLeaf ; ++idxPart){
+              const int indexPartOrig = indexes[idxPart];
+              potentialDiff.add(particles[indexPartOrig].potential,potentials[idxPart]);
+              fx.add(particles[indexPartOrig].forces[0],forcesX[idxPart]);
+              fy.add(particles[indexPartOrig].forces[1],forcesY[idxPart]);
+              fz.add(particles[indexPartOrig].forces[2],forcesZ[idxPart]);
+            }
+          });
+      }
+
+      // Print for information
+      std::cout << "Potential " << potentialDiff << std::endl;
+      std::cout << "Fx " << fx << std::endl;
+      std::cout << "Fy " << fy << std::endl;
+      std::cout << "Fz " << fz << std::endl;
+    } // -----------------------------------------------------
+
+  } // end Chebyshev kernel
+
+
+
+
+  //// -----------------------------------------------------
+  //{ // cost of symmetric m2l opertors, weighted rank, etc.
+  //	const unsigned int nnodes = ORDER*ORDER*ORDER;
+  //	const SymmetryHandler<ORDER> *const SymHandler = kernels.getPtrToSymHandler();
+  //	unsigned int expansionCounter[343];
+  //	for (unsigned i=0; i<343; ++i) expansionCounter[i] = 0;
+  //	for (unsigned i=0; i<343; ++i) if (SymHandler->pindices[i]) expansionCounter[SymHandler->pindices[i]]++;
+  //
+  //	unsigned int overallCost = 0;
+  //	unsigned int overallWeightedRank = 0;
+  //	unsigned int nbExpansions = 0;
+  //	for (unsigned i=0; i<343; ++i)
+  //		if (expansionCounter[i]) {
+  //			const unsigned int cost = (2*nnodes*SymHandler->LowRank[i]) * expansionCounter[i];
+  //			const unsigned int weightedRank = SymHandler->LowRank[i] * expansionCounter[i];
+  //			overallCost += cost;
+  //			overallWeightedRank += weightedRank;
+  //			nbExpansions += expansionCounter[i];
+  //			std::cout << "expansionCounter[" << i << "] = " << expansionCounter[i]
+  //								<< "\tlow rank = " << SymHandler->LowRank[i]
+  //								<< "\t(2*nnodes*rank) * nb_exp = " << cost
+  //								<< std::endl;
+  //		}
+  //	std::cout << "=== Overall cost = " << overallCost << "\t Weighted rank = " << (double)overallWeightedRank / (double)nbExpansions << std::endl;
+  //	if (nbExpansions!=316) std::cout << "Something went wrong, number of counted expansions = " << nbExpansions << std::endl;
+  //}
+  //// -----------------------------------------------------
+
+
+  // -----------------------------------------------------
+  // find first non empty leaf cell
+  /*if (FParameters::findParameter(argc,argv,"-dont_check_accuracy") == FParameters::NotFound) {
+    OctreeClass::Iterator iLeafs(&tree);
+    iLeafs.gotoBottomLeft();
+
+    const ContainerClass *const Targets = iLeafs.getCurrentListTargets();
+    const unsigned int NumTargets = Targets->getSize();
+
+    FReal* Potential = new FReal [NumTargets];
+    FBlas::setzero(NumTargets, Potential);
+
+    FReal* Force = new FReal [NumTargets * 3];
+    FBlas::setzero(NumTargets * 3, Force);
+
+    std::cout << "\nDirect computation of " << NumTargets << " target particles ..." << std::endl;
+    const MatrixKernelClass MatrixKernel;
+    do {
+    const ContainerClass *const Sources = iLeafs.getCurrentListSrc();
+    unsigned int counter = 0;
+    ContainerClass::ConstBasicIterator iTarget(*Targets);
+    while(iTarget.hasNotFinished()) {
+    const FReal wt = iTarget.data().getPhysicalValue();
+    ContainerClass::ConstBasicIterator iSource(*Sources);
+    while(iSource.hasNotFinished()) {
+    if (&iTarget.data() != &iSource.data()) {
+    const FReal potential_value = MatrixKernel.evaluate(iTarget.data().getPosition(),
+    iSource.data().getPosition());
+    const FReal ws = iSource.data().getPhysicalValue();
+    // potential
+    Potential[counter] += potential_value * ws;
+    // force
+    if (MatrixKernelClass::Identifier == ONE_OVER_R) { // laplace force
+    FPoint force(iSource.data().getPosition() - iTarget.data().getPosition());
+    force *= ((ws*wt) * (potential_value*potential_value*potential_value));
+    Force[counter*3 + 0] += force.getX();
+    Force[counter*3 + 1] += force.getY();
+    Force[counter*3 + 2] += force.getZ();
+    } else if (MatrixKernelClass::Identifier == LEONARD_JONES_POTENTIAL) { // lenard-jones force
+    FPoint force(iSource.data().getPosition() - iTarget.data().getPosition());
+    const FReal one_over_r = FReal(1.) / FMath::Sqrt(force.getX()*force.getX() +
+    force.getY()*force.getY() +
+    force.getZ()*force.getZ()); // 1 + 15 + 5 = 21 flops
+    const FReal one_over_r3 = one_over_r * one_over_r * one_over_r;
+    const FReal one_over_r6 = one_over_r3 * one_over_r3;
+    const FReal one_over_r4 = one_over_r3 * one_over_r;
+    force *= ((ws*wt) * (FReal(12.)*one_over_r6*one_over_r4*one_over_r4 - FReal(6.)*one_over_r4*one_over_r4));
+    Force[counter*3 + 0] += force.getX();
+    Force[counter*3 + 1] += force.getY();
+    Force[counter*3 + 2] += force.getZ();
+    }
     }
+    iSource.gotoNext();
+    }
+    counter++;
+    iTarget.gotoNext();
+    }
+    } while(iLeafs.moveRight());
 
-	std::cout << "Done  " << "(" << time.tacAndElapsed() << ")." << std::endl;
-	// -----------------------------------------------------
-
-	
-	//// -----------------------------------------------------
-	//KernelClass kernels(TreeHeight, loader.getCenterOfBox(), loader.getBoxWidth(), epsilon);
-	//for (unsigned int NbThreads=1; NbThreads<=160; NbThreads+=3) {
-	//	omp_set_num_threads(NbThreads); 
-	//	std::cout << "\n================================================================"
-	//						<< "\nChebyshev FMM using" << omp_get_max_threads() << " threads ..." << std::endl;
-	//
-	//	{	// reinitialize //////////////////////////////////////
-	//		OctreeClass::Iterator octreeIterator(&tree);
-	//		octreeIterator.gotoBottomLeft();
-	//		OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);   // for each levels
-	//		
-	//		// set potential and forces to zero
-	//		do {
-	//			ContainerClass *const Sources = octreeIterator.getCurrentListSrc();
-	//			ContainerClass::BasicIterator iSource(*Sources);
-	//			while(iSource.hasNotFinished()) {
-	//				iSource.data().setPotential(FReal(0.));
-	//				iSource.data().setForces(FReal(0.), FReal(0.), FReal(0.));
-	//				iSource.gotoNext();
-	//			}
-	//		} while(octreeIterator.moveRight());
-	//		
-	//		// set multipole and local expansions
-	//		octreeIterator = avoidGotoLeftIterator;
-	//		for(int idxLevel = TreeHeight-1; idxLevel>1; --idxLevel) { // for each cells
-	//	
-	//			do{
-	//				FMemUtils::setall( octreeIterator.getCurrentCell()->getLocal(), FReal(0), ORDER*ORDER*ORDER * 2);
-	//				FMemUtils::setall( octreeIterator.getCurrentCell()->getMultipole(), FReal(0), ORDER*ORDER*ORDER * 2);
-	//			} while(octreeIterator.moveRight());
-	//	
-	//			avoidGotoLeftIterator.moveUp();
-	//			octreeIterator = avoidGotoLeftIterator;
-	//		}
-	//	} //////////////////////////////////////////////////////
-	//
-	//	FmmClass algorithm(&tree,&kernels);
-	//	time.tic();
-	//	algorithm.execute();
-	//	std::cout << "completed in " << time.tacAndElapsed() << "sec." << std::endl;
-	//}
-	//// -----------------------------------------------------
-
-	{
-		omp_set_num_threads(NbThreads); 
-		std::cout << "\nChebyshev FMM using " << omp_get_max_threads() << " threads ..." << std::endl;
-		KernelClass kernels(TreeHeight, loader.getCenterOfBox(), loader.getBoxWidth(), epsilon);
-		FmmClass algorithm(&tree, &kernels);
-		time.tic();
-		algorithm.execute();
-		std::cout << "completed in " << time.tacAndElapsed() << "sec." << std::endl;
-	}
-
-
-	//// -----------------------------------------------------
-	//{ // cost of symmetric m2l opertors, weighted rank, etc.
-	//	const unsigned int nnodes = ORDER*ORDER*ORDER;
-	//	const SymmetryHandler<ORDER> *const SymHandler = kernels.getPtrToSymHandler();
-	//	unsigned int expansionCounter[343];
-	//	for (unsigned i=0; i<343; ++i) expansionCounter[i] = 0;
-	//	for (unsigned i=0; i<343; ++i) if (SymHandler->pindices[i]) expansionCounter[SymHandler->pindices[i]]++;
-	//
-	//	unsigned int overallCost = 0;
-	//	unsigned int overallWeightedRank = 0;
-	//	unsigned int nbExpansions = 0;
-	//	for (unsigned i=0; i<343; ++i)
-	//		if (expansionCounter[i]) {
-	//			const unsigned int cost = (2*nnodes*SymHandler->LowRank[i]) * expansionCounter[i];
-	//			const unsigned int weightedRank = SymHandler->LowRank[i] * expansionCounter[i];
-	//			overallCost += cost;
-	//			overallWeightedRank += weightedRank;
-	//			nbExpansions += expansionCounter[i];
-	//			std::cout << "expansionCounter[" << i << "] = " << expansionCounter[i]
-	//								<< "\tlow rank = " << SymHandler->LowRank[i]
-	//								<< "\t(2*nnodes*rank) * nb_exp = " << cost
-	//								<< std::endl;
-	//		}
-	//	std::cout << "=== Overall cost = " << overallCost << "\t Weighted rank = " << (double)overallWeightedRank / (double)nbExpansions << std::endl;
-	//	if (nbExpansions!=316) std::cout << "Something went wrong, number of counted expansions = " << nbExpansions << std::endl;
-	//}
-	//// -----------------------------------------------------
-	
-
-	// -----------------------------------------------------
-	// find first non empty leaf cell 
-    /*if (FParameters::findParameter(argc,argv,"-dont_check_accuracy") == FParameters::NotFound) {
-		OctreeClass::Iterator iLeafs(&tree);
-		iLeafs.gotoBottomLeft();
-	
-		const ContainerClass *const Targets = iLeafs.getCurrentListTargets();
-		const unsigned int NumTargets = Targets->getSize();
-
-		FReal* Potential = new FReal [NumTargets];
-		FBlas::setzero(NumTargets, Potential);
-
-		FReal* Force = new FReal [NumTargets * 3];
-		FBlas::setzero(NumTargets * 3, Force);
-	
-		std::cout << "\nDirect computation of " << NumTargets << " target particles ..." << std::endl;
-		const MatrixKernelClass MatrixKernel;
-		do {
-			const ContainerClass *const Sources = iLeafs.getCurrentListSrc();
-			unsigned int counter = 0;
-			ContainerClass::ConstBasicIterator iTarget(*Targets);
-			while(iTarget.hasNotFinished()) {
-				const FReal wt = iTarget.data().getPhysicalValue();
-				ContainerClass::ConstBasicIterator iSource(*Sources);
-				while(iSource.hasNotFinished()) {
-					if (&iTarget.data() != &iSource.data()) {
-						const FReal potential_value = MatrixKernel.evaluate(iTarget.data().getPosition(),
-																													 iSource.data().getPosition());
-						const FReal ws = iSource.data().getPhysicalValue();
-						// potential
-						Potential[counter] += potential_value * ws;
-						// force
-						if (MatrixKernelClass::Identifier == ONE_OVER_R) { // laplace force
-							FPoint force(iSource.data().getPosition() - iTarget.data().getPosition());
-							force *= ((ws*wt) * (potential_value*potential_value*potential_value));
-							Force[counter*3 + 0] += force.getX();
-							Force[counter*3 + 1] += force.getY();
-							Force[counter*3 + 2] += force.getZ();
-						} else if (MatrixKernelClass::Identifier == LEONARD_JONES_POTENTIAL) { // lenard-jones force
-							FPoint force(iSource.data().getPosition() - iTarget.data().getPosition());
-							const FReal one_over_r = FReal(1.) / FMath::Sqrt(force.getX()*force.getX() +
-																															 force.getY()*force.getY() +
-																															 force.getZ()*force.getZ()); // 1 + 15 + 5 = 21 flops
-							const FReal one_over_r3 = one_over_r * one_over_r * one_over_r;
-							const FReal one_over_r6 = one_over_r3 * one_over_r3;
-							const FReal one_over_r4 = one_over_r3 * one_over_r;
-							force *= ((ws*wt) * (FReal(12.)*one_over_r6*one_over_r4*one_over_r4 - FReal(6.)*one_over_r4*one_over_r4));
-							Force[counter*3 + 0] += force.getX();
-							Force[counter*3 + 1] += force.getY();
-							Force[counter*3 + 2] += force.getZ();
-						}
-					}
-					iSource.gotoNext();
-				}
-				counter++;
-				iTarget.gotoNext();
-			}
-		} while(iLeafs.moveRight());
-
-	
-		FReal* ApproxPotential = new FReal [NumTargets];
-		FReal* ApproxForce     = new FReal [NumTargets * 3];
-
-		unsigned int counter = 0;
-		ContainerClass::ConstBasicIterator iTarget(*Targets);
-		while(iTarget.hasNotFinished()) {
-			ApproxPotential[counter]   = iTarget.data().getPotential();
-			ApproxForce[counter*3 + 0] = iTarget.data().getForces().getX();
-			ApproxForce[counter*3 + 1] = iTarget.data().getForces().getY();
-			ApproxForce[counter*3 + 2] = iTarget.data().getForces().getZ();
-			counter++;
-			iTarget.gotoNext();
-		}
-
-		std::cout << "\nPotential error:" << std::endl;
-		std::cout << "Relative L2 error   = " << computeL2norm( NumTargets, Potential, ApproxPotential)
-							<< std::endl;
-		std::cout << "Relative Lmax error = " << computeINFnorm(NumTargets, Potential, ApproxPotential)
-							<< std::endl;
-
-		std::cout << "\nForce error:" << std::endl;
-		std::cout << "Relative L2 error   = " << computeL2norm( NumTargets*3, Force, ApproxForce)
-							<< std::endl;
-		std::cout << "Relative Lmax error = " << computeINFnorm(NumTargets*3, Force, ApproxForce)
-							<< std::endl;
-		std::cout << std::endl;
-
-		// free memory
-		delete [] Potential;
-		delete [] ApproxPotential;
-		delete [] Force;
-		delete [] ApproxForce;
-    }*/
 
-	/*
-	// Check if particles are strictly within its containing cells
-	const FReal BoxWidthLeaf = loader.getBoxWidth() / FReal(FMath::pow(2, TreeHeight-1));
-	OctreeClass::Iterator octreeIterator(&tree);
-	octreeIterator.gotoBottomLeft();
-	do{
-	const CellClass *const LeafCell = octreeIterator.getCurrentCell();
-	const FPoint LeafCellCenter(LeafCell->getCoordinate().getX() * BoxWidthLeaf + BoxWidthLeaf/2 + loader.getCenterOfBox().getX(),
-	LeafCell->getCoordinate().getY() * BoxWidthLeaf + BoxWidthLeaf/2 + loader.getCenterOfBox().getY(),
-	LeafCell->getCoordinate().getZ() * BoxWidthLeaf + BoxWidthLeaf/2 + loader.getCenterOfBox().getZ());
-	const ContainerClass *const Particles = octreeIterator.getCurrentListSrc();
-	ContainerClass::ConstBasicIterator particleIterator(*Particles);
-	while(particleIterator.hasNotFinished()) {
-	const FPoint distance(LeafCellCenter-particleIterator.data().getPosition());
-	std::cout << "center - particle = " << distance << " < " << BoxWidthLeaf/FReal(2.) << std::endl;
-	if (std::abs(distance.getX())>BoxWidthLeaf/FReal(2.) ||
-	std::abs(distance.getY())>BoxWidthLeaf/FReal(2.) ||
-	std::abs(distance.getZ())>BoxWidthLeaf/FReal(2.)) {
-	std::cout << "stop" << std::endl;
-	exit(-1);
-	}
-	particleIterator.gotoNext();
-	}
-	} while(octreeIterator.moveRight());
-	*/
-
-
-	return 0;
-}
+    FReal* ApproxPotential = new FReal [NumTargets];
+    FReal* ApproxForce     = new FReal [NumTargets * 3];
 
+    unsigned int counter = 0;
+    ContainerClass::ConstBasicIterator iTarget(*Targets);
+    while(iTarget.hasNotFinished()) {
+    ApproxPotential[counter]   = iTarget.data().getPotential();
+    ApproxForce[counter*3 + 0] = iTarget.data().getForces().getX();
+    ApproxForce[counter*3 + 1] = iTarget.data().getForces().getY();
+    ApproxForce[counter*3 + 2] = iTarget.data().getForces().getZ();
+    counter++;
+    iTarget.gotoNext();
+    }
 
+    std::cout << "\nPotential error:" << std::endl;
+    std::cout << "Relative L2 error   = " << computeL2norm( NumTargets, Potential, ApproxPotential)
+    << std::endl;
+    std::cout << "Relative Lmax error = " << computeINFnorm(NumTargets, Potential, ApproxPotential)
+    << std::endl;
+
+    std::cout << "\nForce error:" << std::endl;
+    std::cout << "Relative L2 error   = " << computeL2norm( NumTargets*3, Force, ApproxForce)
+    << std::endl;
+    std::cout << "Relative Lmax error = " << computeINFnorm(NumTargets*3, Force, ApproxForce)
+    << std::endl;
+    std::cout << std::endl;
+
+    // free memory
+    delete [] Potential;
+    delete [] ApproxPotential;
+    delete [] Force;
+    delete [] ApproxForce;
+    }*/
 
+  /*
+  // Check if particles are strictly within its containing cells
+  const FReal BoxWidthLeaf = loader.getBoxWidth() / FReal(FMath::pow(2, TreeHeight-1));
+  OctreeClass::Iterator octreeIterator(&tree);
+  octreeIterator.gotoBottomLeft();
+  do{
+  const CellClass *const LeafCell = octreeIterator.getCurrentCell();
+  const FPoint LeafCellCenter(LeafCell->getCoordinate().getX() * BoxWidthLeaf + BoxWidthLeaf/2 + loader.getCenterOfBox().getX(),
+  LeafCell->getCoordinate().getY() * BoxWidthLeaf + BoxWidthLeaf/2 + loader.getCenterOfBox().getY(),
+  LeafCell->getCoordinate().getZ() * BoxWidthLeaf + BoxWidthLeaf/2 + loader.getCenterOfBox().getZ());
+  const ContainerClass *const Particles = octreeIterator.getCurrentListSrc();
+  ContainerClass::ConstBasicIterator particleIterator(*Particles);
+  while(particleIterator.hasNotFinished()) {
+  const FPoint distance(LeafCellCenter-particleIterator.data().getPosition());
+  std::cout << "center - particle = " << distance << " < " << BoxWidthLeaf/FReal(2.) << std::endl;
+  if (std::abs(distance.getX())>BoxWidthLeaf/FReal(2.) ||
+  std::abs(distance.getY())>BoxWidthLeaf/FReal(2.) ||
+  std::abs(distance.getZ())>BoxWidthLeaf/FReal(2.)) {
+  std::cout << "stop" << std::endl;
+  exit(-1);
+  }
+  particleIterator.gotoNext();
+  }
+  } while(octreeIterator.moveRight());
+  */
+
+
+  return 0;
+}
diff --git a/Tests/Kernels/testCompareKernels.cpp b/Tests/Kernels/testCompareKernels.cpp
index baff4fd20f5dcbbaaf8187d3d1a342bdcdb683e1..7e2dd3e19ce725ba1502b0163b8a5e014a75e512 100755
--- a/Tests/Kernels/testCompareKernels.cpp
+++ b/Tests/Kernels/testCompareKernels.cpp
@@ -37,7 +37,7 @@
 // chebyshev kernel
 
 #include "../../Src/Kernels/Chebyshev/FChebCell.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebSymKernel.hpp"
 
@@ -121,7 +121,7 @@ int main(int argc, char* argv[])
         // typedefs
         typedef FP2PParticleContainerIndexed ContainerClass;
         typedef FSimpleLeaf<ContainerClass> LeafClass;
-        typedef FChebMatrixKernelR MatrixKernelClass;
+        typedef FInterpMatrixKernelR MatrixKernelClass;
         typedef FChebCell<ORDER> CellClass;
         typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
 
diff --git a/Tests/Kernels/testFlopsChebAlgorithm.cpp b/Tests/Kernels/testFlopsChebAlgorithm.cpp
index 1448eb446dfa8681cbb72851078ba0a6632ae268..3ff510e477b1c9e0ecea7d546f7fd149f06cd103 100755
--- a/Tests/Kernels/testFlopsChebAlgorithm.cpp
+++ b/Tests/Kernels/testFlopsChebAlgorithm.cpp
@@ -27,7 +27,7 @@
 #include "../../Src/Files/FFmaBinLoader.hpp"
 
 #include "../../Src/Kernels/Chebyshev/FChebCell.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 
 #include "../../Src/Kernels/Chebyshev/FChebFlopsSymKernel.hpp"
 
@@ -59,7 +59,7 @@ int main(int argc, char* argv[])
     // typedefs
     typedef FP2PParticleContainer ContainerClass;
     typedef FSimpleLeaf<ContainerClass> LeafClass;
-	typedef FChebMatrixKernelR MatrixKernelClass;
+	typedef FInterpMatrixKernelR MatrixKernelClass;
 	typedef FChebCell<ORDER> CellClass;
     typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
     typedef FChebFlopsSymKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
diff --git a/Tests/Kernels/testNewCompareKernels.cpp b/Tests/Kernels/testNewCompareKernels.cpp
index 6e3cb86035ff39a477ad96963600e6be094cb770..64e03b5a2f105edc769ca16d6d869f05dcf13108 100644
--- a/Tests/Kernels/testNewCompareKernels.cpp
+++ b/Tests/Kernels/testNewCompareKernels.cpp
@@ -39,7 +39,7 @@
 // chebyshev kernel
 
 #include "../../Src/Kernels/Chebyshev/FChebCell.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebSymKernel.hpp"
 #endif
@@ -62,6 +62,13 @@
 #include "../../Src/Kernels/Rotation/FRotationKernel.hpp"
 #include "../../Src/Kernels/Rotation/FRotationCell.hpp"
 
+#ifdef ScalFMM_USE_BLAS
+// Uniform grid kernel
+#include "../../Src/Kernels/Uniform/FUnifCell.hpp"
+//#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
+#include "../../Src/Kernels/Uniform/FUnifKernel.hpp"
+#endif
+
 
 /**
  * This program compares two different kernels, eg., the Chebyshev kernel with
@@ -140,11 +147,12 @@ int main(int argc, char* argv[])
         // typedefs
         typedef FP2PParticleContainerIndexed ContainerClass;
         typedef FSimpleLeaf<ContainerClass> LeafClass;
-        typedef FChebMatrixKernelR MatrixKernelClass;
+        typedef FInterpMatrixKernelR MatrixKernelClass;
         typedef FChebCell<ORDER> CellClass;
         typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
 
         typedef FChebSymKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
+        //        typedef FChebKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
         typedef FFmmAlgorithm<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
 
 
@@ -279,6 +287,85 @@ int main(int argc, char* argv[])
         std::cout << "Fy " << fy << std::endl;
         std::cout << "Fz " << fz << std::endl;
     } // end FFmaBlas kernel
+    //
+    ////////////////////////////////////////////////////////////////////
+    //
+    {	// begin Lagrange/Uniform Grid kernel
+
+      // TODO 
+
+      // accuracy
+      const unsigned int ORDER = 7;
+
+      // typedefs
+      typedef FP2PParticleContainerIndexed ContainerClass;
+      typedef FSimpleLeaf<ContainerClass> LeafClass;
+      typedef FInterpMatrixKernelR MatrixKernelClass;
+      typedef FUnifCell<ORDER> CellClass;
+      typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
+
+      typedef FUnifKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
+      typedef FFmmAlgorithm<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
+
+
+      // init oct-tree
+      OctreeClass tree(TreeHeight, SubTreeHeight, loader.getBoxWidth(), loader.getCenterOfBox());
+
+      { // -----------------------------------------------------
+        std::cout << "Creating & Inserting " << loader.getNumberOfParticles()
+                  << " particles ..." << std::endl;
+        std::cout << "\tHeight : " << TreeHeight << " \t sub-height : " << SubTreeHeight << std::endl;
+        time.tic();
+
+        for(int idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
+          // put in tree
+          tree.insert(particles[idxPart].position, idxPart, particles[idxPart].physicalValue);
+        }
+
+        time.tac();
+        std::cout << "Done  " << "(@Creating and Inserting Particles = "
+                  << time.elapsed() << "s)." << std::endl;
+      } // -----------------------------------------------------
+
+      { // -----------------------------------------------------
+        std::cout << "\nLagrange FMM ... " << std::endl;
+        time.tic();
+        KernelClass kernels(TreeHeight, loader.getCenterOfBox(), loader.getBoxWidth());
+        FmmClass algorithm(&tree, &kernels);
+        algorithm.execute();
+        time.tac();
+        std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
+      } // -----------------------------------------------------
+
+      FMath::FAccurater potentialDiff;
+      FMath::FAccurater fx, fy, fz;
+      { // Check that each particle has been summed with all other
+
+        tree.forEachLeaf([&](LeafClass* leaf){
+            const FReal*const potentials = leaf->getTargets()->getPotentials();
+            const FReal*const forcesX = leaf->getTargets()->getForcesX();
+            const FReal*const forcesY = leaf->getTargets()->getForcesY();
+            const FReal*const forcesZ = leaf->getTargets()->getForcesZ();
+            const int nbParticlesInLeaf = leaf->getTargets()->getNbParticles();
+            const FVector<int>& indexes = leaf->getTargets()->getIndexes();
+
+            for(int idxPart = 0 ; idxPart < nbParticlesInLeaf ; ++idxPart){
+              const int indexPartOrig = indexes[idxPart];
+              potentialDiff.add(particles[indexPartOrig].potential,potentials[idxPart]);
+              fx.add(particles[indexPartOrig].forces[0],forcesX[idxPart]);
+              fy.add(particles[indexPartOrig].forces[1],forcesY[idxPart]);
+              fz.add(particles[indexPartOrig].forces[2],forcesZ[idxPart]);
+            }
+          });
+      }
+
+      // Print for information
+      std::cout << "Potential " << potentialDiff << std::endl;
+      std::cout << "Fx " << fx << std::endl;
+      std::cout << "Fy " << fy << std::endl;
+      std::cout << "Fz " << fz << std::endl;
+
+    } // end Lagrange/Uniform Grid kernel
 #endif
 
 {
diff --git a/Tests/Kernels/testUnifAlgorithm.cpp b/Tests/Kernels/testUnifAlgorithm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..22379a0c9f31cfd88231fdb94746d04445ba79f6
--- /dev/null
+++ b/Tests/Kernels/testUnifAlgorithm.cpp
@@ -0,0 +1,201 @@
+// ===================================================================================
+// Ce LOGICIEL "ScalFmm" est couvert par le copyright Inria 20xx-2012.
+// Inria détient tous les droits de propriété sur le LOGICIEL, et souhaite que
+// la communauté scientifique l'utilise afin de le tester et de l'évaluer.
+// Inria donne gracieusement le droit d'utiliser ce LOGICIEL. Toute utilisation
+// dans un but lucratif ou à des fins commerciales est interdite sauf autorisation
+// expresse et préalable d'Inria.
+// Toute utilisation hors des limites précisées ci-dessus et réalisée sans l'accord
+// expresse préalable d'Inria constituerait donc le délit de contrefaçon.
+// Le LOGICIEL étant un produit en cours de développement, Inria ne saurait assurer
+// aucune responsabilité et notamment en aucune manière et en aucun cas, être tenu
+// de répondre d'éventuels dommages directs ou indirects subits par l'utilisateur.
+// Tout utilisateur du LOGICIEL s'engage à communiquer à Inria ses remarques
+// relatives à l'usage du LOGICIEL
+// ===================================================================================
+
+// ==== CMAKE =====
+// @FUSE_BLAS
+// ================
+
+#include <iostream>
+
+#include <cstdio>
+#include <cstdlib>
+
+#include "../../Src/Files/FFmaScanfLoader.hpp"
+#include "../../Src/Files/FFmaBinLoader.hpp"
+
+
+#include "../../Src/Kernels/Uniform/FUnifCell.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
+#include "../../Src/Kernels/Uniform/FUnifKernel.hpp"
+
+#include "../../Src/Components/FSimpleLeaf.hpp"
+#include "../../Src/Kernels/P2P/FP2PParticleContainerIndexed.hpp"
+
+#include "../../Src/Utils/FParameters.hpp"
+#include "../../Src/Utils/FMemUtils.hpp"
+
+#include "../../Src/Containers/FOctree.hpp"
+#include "../../Src/Containers/FVector.hpp"
+
+#include "../../Src/Core/FFmmAlgorithm.hpp"
+#include "../../Src/Core/FFmmAlgorithmThread.hpp"
+
+/**
+ * This program runs the FMM Algorithm with the Uniform kernel and compares the results with a direct computation.
+ */
+
+// Simply create particles and try the kernels
+int main(int argc, char* argv[])
+{
+  const char* const filename       = FParameters::getStr(argc,argv,"-f", "../Data/test20k.fma");
+  const unsigned int TreeHeight    = FParameters::getValue(argc, argv, "-h", 5);
+  const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2);
+  const unsigned int NbThreads     = FParameters::getValue(argc, argv, "-t", 1);
+
+#ifdef _OPENMP
+  omp_set_num_threads(NbThreads);
+  std::cout << "\n>> Using " << omp_get_max_threads() << " threads.\n" << std::endl;
+#else
+  std::cout << "\n>> Sequential version.\n" << std::
+#endif
+
+    // init timer
+    FTic time;
+
+  // init particles position and physical value
+  struct TestParticle{
+    FPoint position;
+    FReal forces[3];
+    FReal physicalValue;
+    FReal potential;
+  };
+
+  // open particle file
+  FFmaScanfLoader loader(filename);
+  if(!loader.isOpen()) throw std::runtime_error("Particle file couldn't be opened!");
+
+  TestParticle* const particles = new TestParticle[loader.getNumberOfParticles()];
+  for(int idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
+    FPoint position;
+    FReal physicalValue = 0.0;
+    loader.fillParticle(&position,&physicalValue);
+    // get copy
+    particles[idxPart].position       = position;
+    particles[idxPart].physicalValue  = physicalValue;
+    particles[idxPart].potential      = 0.0;
+    particles[idxPart].forces[0]      = 0.0;
+    particles[idxPart].forces[1]      = 0.0;
+    particles[idxPart].forces[2]      = 0.0;
+  }
+
+  ////////////////////////////////////////////////////////////////////
+
+  { // begin direct computation
+    std::cout << "\nDirect Computation ... " << std::endl;
+    time.tic();
+    {
+      for(int idxTarget = 0 ; idxTarget < loader.getNumberOfParticles() ; ++idxTarget){
+        for(int idxOther = idxTarget + 1 ; idxOther < loader.getNumberOfParticles() ; ++idxOther){
+          FP2P::MutualParticles(particles[idxTarget].position.getX(), particles[idxTarget].position.getY(),
+                                particles[idxTarget].position.getZ(), particles[idxTarget].physicalValue,
+                                &particles[idxTarget].forces[0], &particles[idxTarget].forces[1],
+                                &particles[idxTarget].forces[2], &particles[idxTarget].potential,
+                                particles[idxOther].position.getX(), particles[idxOther].position.getY(),
+                                particles[idxOther].position.getZ(), particles[idxOther].physicalValue,
+                                &particles[idxOther].forces[0], &particles[idxOther].forces[1],
+                                &particles[idxOther].forces[2], &particles[idxOther].potential);
+        }
+      }
+    }
+    time.tac();
+    std::cout << "Done  " << "(@Direct Computation = "
+              << time.elapsed() << "s)." << std::endl;
+
+  } // end direct computation
+
+  ////////////////////////////////////////////////////////////////////
+
+  {	// begin Lagrange kernel
+
+    // accuracy
+    const unsigned int ORDER = 7;
+    // typedefs
+    typedef FP2PParticleContainerIndexed ContainerClass;
+    typedef FSimpleLeaf< ContainerClass >  LeafClass;
+    //typedef FInterpMatrixKernelLJ MatrixKernelClass;
+    typedef FInterpMatrixKernelR MatrixKernelClass;
+    typedef FUnifCell<ORDER> CellClass;
+    typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
+    typedef FUnifKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
+    typedef FFmmAlgorithm<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
+    //  typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
+
+    // init oct-tree
+    OctreeClass tree(TreeHeight, SubTreeHeight, loader.getBoxWidth(), loader.getCenterOfBox());
+
+
+    { // -----------------------------------------------------
+      std::cout << "Creating & Inserting " << loader.getNumberOfParticles()
+                << " particles ..." << std::endl;
+      std::cout << "\tHeight : " << TreeHeight << " \t sub-height : " << SubTreeHeight << std::endl;
+      time.tic();
+
+      for(int idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
+        // put in tree
+        tree.insert(particles[idxPart].position, idxPart, particles[idxPart].physicalValue);
+      }
+
+      time.tac();
+      std::cout << "Done  " << "(@Creating and Inserting Particles = "
+                << time.elapsed() << "s)." << std::endl;
+    } // -----------------------------------------------------
+
+    { // -----------------------------------------------------
+      std::cout << "\nLagrange/Uniform grid FMM (ORDER="<< ORDER << ") ... " << std::endl;
+      time.tic();
+      KernelClass kernels(TreeHeight, loader.getCenterOfBox(), loader.getBoxWidth());
+      FmmClass algorithm(&tree, &kernels);
+      algorithm.execute();
+      time.tac();
+      std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
+    } // -----------------------------------------------------
+
+
+    { // -----------------------------------------------------
+      std::cout << "\nError computation ... " << std::endl;
+      FMath::FAccurater potentialDiff;
+      FMath::FAccurater fx, fy, fz;
+      { // Check that each particle has been summed with all other
+
+        tree.forEachLeaf([&](LeafClass* leaf){
+            const FReal*const potentials = leaf->getTargets()->getPotentials();
+            const FReal*const forcesX = leaf->getTargets()->getForcesX();
+            const FReal*const forcesY = leaf->getTargets()->getForcesY();
+            const FReal*const forcesZ = leaf->getTargets()->getForcesZ();
+            const int nbParticlesInLeaf = leaf->getTargets()->getNbParticles();
+            const FVector<int>& indexes = leaf->getTargets()->getIndexes();
+
+            for(int idxPart = 0 ; idxPart < nbParticlesInLeaf ; ++idxPart){
+              const int indexPartOrig = indexes[idxPart];
+              potentialDiff.add(particles[indexPartOrig].potential,potentials[idxPart]);
+              fx.add(particles[indexPartOrig].forces[0],forcesX[idxPart]);
+              fy.add(particles[indexPartOrig].forces[1],forcesY[idxPart]);
+              fz.add(particles[indexPartOrig].forces[2],forcesZ[idxPart]);
+            }
+          });
+      }
+
+      // Print for information
+      std::cout << "Potential " << potentialDiff << std::endl;
+      std::cout << "Fx " << fx << std::endl;
+      std::cout << "Fy " << fy << std::endl;
+      std::cout << "Fz " << fz << std::endl;
+    } // -----------------------------------------------------
+
+  } // end Lagrange kernel
+
+  return 0;
+}
diff --git a/Tests/Utils/testACA.cpp b/Tests/Utils/testACA.cpp
index 2e40ff173794a1e9ae7d0bf9aaffa9b19a398bdc..445f338a8a271edf5d303e72ece60b97e6475a14 100755
--- a/Tests/Utils/testACA.cpp
+++ b/Tests/Utils/testACA.cpp
@@ -29,7 +29,7 @@
 
 #include "../../Src/Utils/FPoint.hpp"
 
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebRoots.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebTensor.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp"
@@ -42,7 +42,7 @@
 
 int main(int argc, char* argv[])
 {
-	typedef FChebMatrixKernelR MatrixKernelClass;
+	typedef FInterpMatrixKernelR MatrixKernelClass;
 	MatrixKernelClass MatrixKernel;
 
 	const unsigned int ORDER = 9;
diff --git a/Tests/Utils/testChebBinaryM2L.cpp b/Tests/Utils/testChebBinaryM2L.cpp
index c5cb1d0f3032e3f67cf80fa617c457bf05aac110..f2412491abd9175bafcdd9eae5e81c5ff49934f2 100755
--- a/Tests/Utils/testChebBinaryM2L.cpp
+++ b/Tests/Utils/testChebBinaryM2L.cpp
@@ -29,7 +29,7 @@
 
 #include "../../Src/Utils/FTic.hpp"
 #include "../../Src/Utils/FMath.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebM2LHandler.hpp"
 
 
@@ -41,7 +41,7 @@
 int main(int argc, char* argv[])
 { 
 	// typedefs   
-	typedef FChebMatrixKernelR MatrixKernelClass;
+	typedef FInterpMatrixKernelR MatrixKernelClass;
 	
 
 	// instantiations
diff --git a/Tests/Utils/testChebBinarySymM2L.cpp b/Tests/Utils/testChebBinarySymM2L.cpp
index 9f894b34bc2fc7178e47cb90d2ad77231cd64e6a..958d5ea9d69f9e322a7bfcaafa19eb13148e7bda 100755
--- a/Tests/Utils/testChebBinarySymM2L.cpp
+++ b/Tests/Utils/testChebBinarySymM2L.cpp
@@ -30,7 +30,7 @@
 
 #include "../../Src/Utils/FTic.hpp"
 #include "../../Src/Utils/FMath.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp"
 
 
@@ -42,7 +42,7 @@
 int main(int argc, char* argv[])
 { 
 	// typedefs   
-	typedef FChebMatrixKernelR MatrixKernelClass;
+	typedef FInterpMatrixKernelR MatrixKernelClass;
 
 	// instantiations
 	FTic time;
diff --git a/Tests/Utils/testChebInterpolator.cpp b/Tests/Utils/testChebInterpolator.cpp
index 2a7b18582deb751b05c01a44f85f901df96ea32d..31af57253db722fa8a10e7485edce796e866e05b 100755
--- a/Tests/Utils/testChebInterpolator.cpp
+++ b/Tests/Utils/testChebInterpolator.cpp
@@ -35,7 +35,7 @@
 
 
 #include "../../Src/Kernels/Chebyshev/FChebInterpolator.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 
 #include "../../Src/Kernels/P2P/FP2PParticleContainer.hpp"
 #include "../../Src/Components/FSimpleLeaf.hpp"
@@ -50,7 +50,7 @@
 int main(int, char **){
     typedef FP2PParticleContainer ContainerClass;
     typedef FSimpleLeaf<ContainerClass> LeafClass;
-	typedef FChebMatrixKernelR MatrixKernelClass;
+	typedef FInterpMatrixKernelR MatrixKernelClass;
 
 
 	///////////////////////What we do/////////////////////////////
diff --git a/Tests/Utils/testChebM2Lprecomputation.cpp b/Tests/Utils/testChebM2Lprecomputation.cpp
index a59d5c403681ebeb0ee47cd2a0dd7beb0780a423..503caf43f822718e8293c10cf2b931bb238d497d 100755
--- a/Tests/Utils/testChebM2Lprecomputation.cpp
+++ b/Tests/Utils/testChebM2Lprecomputation.cpp
@@ -32,7 +32,7 @@
 
 #include "../../Src/Kernels/Chebyshev/FChebTensor.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebM2LHandler.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 
 
 
@@ -45,7 +45,7 @@ int main(int argc, char* argv[])
 	FTic time;
 	
 	// define set matrix kernel
-	typedef FChebMatrixKernelR MatrixKernelClass;
+	typedef FInterpMatrixKernelR MatrixKernelClass;
 	MatrixKernelClass MatrixKernel;
 
 	// constants
diff --git a/Tests/Utils/testChebSxUCBSy.cpp b/Tests/Utils/testChebSxUCBSy.cpp
index 8ad70da9a19a2e5b2c2fccd4d93583f66f0abca0..568793760108a0705b5bdf25711dbd30f767906d 100755
--- a/Tests/Utils/testChebSxUCBSy.cpp
+++ b/Tests/Utils/testChebSxUCBSy.cpp
@@ -35,7 +35,7 @@
 
 
 #include "../../Src/Kernels/Chebyshev/FChebInterpolator.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 
 #include "../../Src/Components/FSimpleLeaf.hpp"
 #include "../../Src/Kernels/P2P/FP2PParticleContainer.hpp"
@@ -52,7 +52,7 @@ int main(int argc, char* argv[])
 {
     typedef FP2PParticleContainer ContainerClass;
     typedef FSimpleLeaf<ContainerClass> LeafClass;
-	typedef FChebMatrixKernelR MatrixKernelClass;
+	typedef FInterpMatrixKernelR MatrixKernelClass;
 
 	///////////////////////What we do/////////////////////////////
 	std::cout << "\nTask: Compute interactions between source particles in leaf Y and target\n";
diff --git a/Tests/Utils/testChebSymmetries.cpp b/Tests/Utils/testChebSymmetries.cpp
index 28d1c535b499c4364f8f72604af076403788726b..f506ca106be7e76490a98673881f5ee6216f275c 100755
--- a/Tests/Utils/testChebSymmetries.cpp
+++ b/Tests/Utils/testChebSymmetries.cpp
@@ -30,7 +30,7 @@
 #include "../../Src/Utils/FBlas.hpp"
 
 #include "../../Src/Kernels/Chebyshev/FChebTensor.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../../Src/Kernels/Chebyshev/FChebSymmetries.hpp"
 
 
@@ -59,7 +59,7 @@ int main(int argc, char* argv[])
 	FTic time;
 	
 	// define set matrix kernel
-	typedef FChebMatrixKernelR MatrixKernelClass;
+	typedef FInterpMatrixKernelR MatrixKernelClass;
 	MatrixKernelClass MatrixKernel;
 
 	// constants
diff --git a/Tests/Utils/testChebTensorProduct.cpp b/Tests/Utils/testChebTensorProduct.cpp
index 38d57bb953f509b61effeafd88b0850b2e594889..ee12ed4b5f4bd156aac1d225a2eb220690e8c68d 100755
--- a/Tests/Utils/testChebTensorProduct.cpp
+++ b/Tests/Utils/testChebTensorProduct.cpp
@@ -85,7 +85,7 @@ int main(int argc, char* argv[])
 
 
 		FReal coords[3][ORDER];
-		FChebTensor<ORDER>::setChebyshevRoots(cx, wx, coords);
+		FChebTensor<ORDER>::setPolynomialsRoots(cx, wx, coords);
 	
 		//	for (unsigned int n=0; n<ORDER; ++n) {
 		//		std::cout << coords[0][n] << "\t"
diff --git a/Tests/Utils/testFFTW.cpp b/Tests/Utils/testFFTW.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a9b260d0f13547fb732ee0cb7f71327bf52d3de
--- /dev/null
+++ b/Tests/Utils/testFFTW.cpp
@@ -0,0 +1,139 @@
+// ===================================================================================
+// Ce LOGICIEL "ScalFmm" est couvert par le copyright Inria 20xx-2012.
+// Inria détient tous les droits de propriété sur le LOGICIEL, et souhaite que
+// la communauté scientifique l'utilise afin de le tester et de l'évaluer.
+// Inria donne gracieusement le droit d'utiliser ce LOGICIEL. Toute utilisation
+// dans un but lucratif ou à des fins commerciales est interdite sauf autorisation
+// expresse et préalable d'Inria.
+// Toute utilisation hors des limites précisées ci-dessus et réalisée sans l'accord
+// expresse préalable d'Inria constituerait donc le délit de contrefaçon.
+// Le LOGICIEL étant un produit en cours de développement, Inria ne saurait assurer
+// aucune responsabilité et notamment en aucune manière et en aucun cas, être tenu
+// de répondre d'éventuels dommages directs ou indirects subits par l'utilisateur.
+// Tout utilisateur du LOGICIEL s'engage à communiquer à Inria ses remarques
+// relatives à l'usage du LOGICIEL
+// ===================================================================================
+
+#include <iostream>
+#include <stdlib.h>
+
+// include for libfftw3
+//#include <fftw3.h>
+
+// include for mkl_fftw3
+#include <fftw/fftw3.h>
+
+#include "../../Src/Utils/FGlobal.hpp"
+#include "../../Src/Utils/FComplexe.hpp"
+
+#include "../../Src/Utils/FTic.hpp"
+
+
+//#include "../../Src/Utils/FDft.hpp"
+
+
+int main()
+{
+  const FReal FRandMax = FReal(RAND_MAX);
+
+  FTic time;
+
+  //////////////////////////////////////////////////////////////////////////////
+  // INITIALIZATION
+
+  // size (pick a power of 2 for better performance of the FFT algorithm)
+  unsigned int nsteps_ = 5; 
+
+  // fftw arrays
+  FReal* fftR_;
+  FComplexe* fftC_;
+
+  fftR_ = (FReal*) fftw_malloc(sizeof(FReal) * nsteps_);
+  fftC_ = (FComplexe*) fftw_malloc(sizeof(FComplexe) * nsteps_);
+
+  // fftw plans
+  // use routine defined in file:
+  // /PATH/TO/mkl/interfaces/fftw3xf/wrappers/fftw_plan_dft_c2r_1d.c
+  fftw_plan plan_c2r_; // backward FFT plan
+  fftw_plan plan_r2c_; // forward FFT plan
+
+  std::cout<< "Init FFTW plans: ";
+  time.tic();
+
+  plan_c2r_ =
+    fftw_plan_dft_c2r_1d(nsteps_, 
+                         reinterpret_cast<fftw_complex*>(fftC_),
+                         fftR_, 
+                         FFTW_MEASURE);
+  plan_r2c_ =
+    fftw_plan_dft_r2c_1d(nsteps_, 
+                         fftR_, 
+                         reinterpret_cast<fftw_complex*>(fftC_), 
+                         FFTW_MEASURE);
+
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  // alternative choice of plan type
+//  plan_c2r_ =
+//    fftw_plan_dft_1d(nsteps_, fftC_, fftR_, FFTW_BACKWARD, FFTW_MEASURE);
+//  plan_r2c_ =
+//    fftw_plan_dft_1d(nsteps_, fftR_, fftC_, FFTW_FORWARD, FFTW_MEASURE);
+
+  //////////////////////////////////////////////////////////////////////////////
+  // EXECUTION
+  // generate random physical data
+  for(unsigned int s=0; s<nsteps_; ++s)
+    fftR_[s] = FReal(rand())/FRandMax; 
+
+  // display data in  physical space
+  std::cout<< "Physical data: "<<std::endl;
+  for(unsigned int s=0; s<nsteps_; ++s)
+    std::cout<< fftR_[s] << ", ";
+  std::cout<<std::endl;
+
+  // perform fft
+  std::cout<< "Perform Forward FFT: ";
+  time.tic();
+  fftw_execute( plan_r2c_ );
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  // display transform in Fourier space
+  // beware the real data FFT stores only N/2+1 complex output values
+  std::cout<< "Transformed data : "<<std::endl;
+  for(unsigned int s=0; s<nsteps_; ++s)
+    std::cout<< fftC_[s] << ", ";
+  std::cout<<std::endl;
+
+//  for(unsigned int s=0; s<nsteps_/2+1; ++s){
+//    fftC_[nsteps_-s]=FComplexe(fftC_[s].getReal(),-fftC_[s].getImag());
+//  }
+//
+//  std::cout<< "Full Transformed data : "<<std::endl;
+//  for(unsigned int s=0; s<nsteps_; ++s)
+//    std::cout<< fftC_[s] << ", ";
+//  std::cout<<std::endl;
+
+  // perform ifft of tranformed data (in order to get physical values back)
+  std::cout<< "Perform Backward FFT: ";
+  time.tic();
+  fftw_execute( plan_c2r_ );
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  // display data in physical space
+  std::cout<< "Physical data (from 1/N*IFFT(FFT(Physical data))): "<<std::endl;
+  for(unsigned int s=0; s<nsteps_; ++s)
+    std::cout<< fftR_[s]/nsteps_ << ", ";
+  std::cout<<std::endl;
+
+  //////////////////////////////////////////////////////////////////////////////
+  // VALIDATION
+
+
+
+  //free memory
+  fftw_destroy_plan(plan_c2r_);
+  fftw_free(fftR_);
+  fftw_free(fftC_);
+
+
+}// end test
diff --git a/Tests/Utils/testFastDiscreteConvolution.cpp b/Tests/Utils/testFastDiscreteConvolution.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f6ae16c53173b3626d72a1438bc1b50c1a7dcec7
--- /dev/null
+++ b/Tests/Utils/testFastDiscreteConvolution.cpp
@@ -0,0 +1,178 @@
+// ===================================================================================
+// Ce LOGICIEL "ScalFmm" est couvert par le copyright Inria 20xx-2012.
+// Inria détient tous les droits de propriété sur le LOGICIEL, et souhaite que
+// la communauté scientifique l'utilise afin de le tester et de l'évaluer.
+// Inria donne gracieusement le droit d'utiliser ce LOGICIEL. Toute utilisation
+// dans un but lucratif ou à des fins commerciales est interdite sauf autorisation
+// expresse et préalable d'Inria.
+// Toute utilisation hors des limites précisées ci-dessus et réalisée sans l'accord
+// expresse préalable d'Inria constituerait donc le délit de contrefaçon.
+// Le LOGICIEL étant un produit en cours de développement, Inria ne saurait assurer
+// aucune responsabilité et notamment en aucune manière et en aucun cas, être tenu
+// de répondre d'éventuels dommages directs ou indirects subits par l'utilisateur.
+// Tout utilisateur du LOGICIEL s'engage à communiquer à Inria ses remarques
+// relatives à l'usage du LOGICIEL
+// ===================================================================================
+
+// ==== CMAKE =====
+// @FUSE_BLAS
+// ================
+
+
+#include <iostream>
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "../../Src/Utils/FTic.hpp"
+#include "../../Src/Utils/FMath.hpp"
+#include "../../Src/Utils/FBlas.hpp" 
+
+// 
+#include "../../Src/Utils/FDft.hpp"
+#include "../../Src/Utils/FComplexe.hpp" 
+
+
+
+/**
+ * In this file we show how to use octree
+ */
+
+int main(int, char **){
+
+  const FReal FRandMax = FReal(RAND_MAX);
+
+  FTic time;
+
+  const unsigned int ORDER = 3;
+  const unsigned int N=(2*ORDER-1);
+
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Application of Convolution Theorem
+  // Let there be a Circulant Toeplitz matrix K of size N
+
+  FReal K[N*N];
+  FReal Y[N];
+  FReal X[N];
+
+  for(unsigned int i=0; i<N; ++i) X[i]=0.0;
+  for(unsigned int i=0; i<N; ++i) Y[i]=FReal(rand())/FRandMax;
+
+  std::cout<< "Y: "<<std::endl;
+  for(unsigned int i=0; i<N; ++i) 
+    std::cout<< Y[i] << ", ";
+  std::cout<<std::endl;
+
+  // Assemble Matrix K
+  std::cout<< "Assemble Full Matrix K: ";
+  time.tic();
+  for(unsigned int i=0; i<N; ++i){
+    for(unsigned int j=0; j<N; ++j){
+      if(i>j) K[i*N+j]=i-j-1;
+      else   K[i*N+j]=N+i-j-1;
+    }
+  }
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  std::cout<< "Circulant Toeplitz matrix K: "<<std::endl;
+  for(unsigned int i=0; i<N; ++i){
+    for(unsigned int j=0; j<N; ++j){
+      std::cout<< K[i*N+j] << ", ";
+    } std::cout<<std::endl;
+  } std::cout<<std::endl;
+
+  // Direct application of Circulant Matrix
+  std::cout<< "Direct application of K: ";
+  time.tic();
+  for(unsigned int i=0; i<N; ++i){
+    for(unsigned int j=0; j<N; ++j){
+      X[i]+=K[i*N+j]*Y[j];
+    }
+  }
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  std::cout<< "X=KY: "<<std::endl;
+  for(unsigned int i=0; i<N; ++i)
+    std::cout<< X[i] << ", ";
+  std::cout<<std::endl;
+
+  // now compute via DFT and use convolution theorem
+  FComplexe FK[N];
+  FComplexe FY[N];
+  FComplexe FX[N];
+  FReal iFX[N];
+
+  // Init DFTor
+  std::cout<< "Set DFT: ";
+  time.tic();
+  //FDft Dft(N);// direct version
+  FFft Dft(N);// fast version
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  // Initialize manually
+//  for(unsigned int s=0; s<N; ++s){
+//    FX[s] = FY[s] = FK[s] =FComplexe(0.0,0.0); // init
+//    iFX[s]=0.0;
+//  }
+  // ... or using Blas routines
+  FBlas::c_setzero(N,reinterpret_cast<FReal*>(FX));
+  FBlas::c_setzero(N,reinterpret_cast<FReal*>(FY));
+  FBlas::c_setzero(N,reinterpret_cast<FReal*>(FK));
+  FBlas::setzero(N,iFX);
+
+
+  // Transform Y in Fourier space
+  std::cout<< "Transform Y->FY: ";
+  time.tic();
+  Dft.applyDFT(Y,FY);
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  std::cout<< "FY: "<<std::endl;
+  for(unsigned int i=0; i<N; ++i){
+    std::cout<< FY[i] << ", ";
+  }std::cout<<std::endl;
+
+  // Transform first column of K
+  FReal tK[N];
+  for(unsigned int i=0; i<N; ++i) tK[i]=K[i*N];
+  std::cout<< "Transform tK->FK: ";
+  time.tic();
+  Dft.applyDFT(tK,FK);
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+//  std::cout<< "Transformed Matrix FK: "<<std::endl;
+//  for(unsigned int i=0; i<N; ++i){
+//    std::cout<< FK[i] << ", ";
+//  }std::cout<<std::endl;
+
+  // Compute X=KY in Fourier space
+  std::cout<< "Apply K in Fourier space (entrywise prod.): ";
+  time.tic();
+  for(unsigned int s=0; s<N; ++s){// TODO loop only over non zero entries
+    FX[s]=FK[s]; 
+    FX[s]*=FY[s];
+  }
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  std::cout<< "Transformed Local Exp FX=FK:FY: "<<std::endl;
+  for(unsigned int i=0; i<N; ++i){
+    std::cout<< FX[i] << ", ";
+  }std::cout<<std::endl;
+
+  // Transform FX back in Physical space
+  std::cout<< "Transform FX->iF(FX): ";
+  time.tic();
+  Dft.applyIDFT(FX,iFX);
+  std::cout << "took " << time.tacAndElapsed() << "sec." << std::endl;
+
+  std::cout<< "iF(FK:FY): "<<std::endl;
+  for(unsigned int i=0; i<N; ++i){
+    std::cout<< iFX[i] << ", ";
+  }std::cout<<std::endl;
+
+std::cout << "Relative error   = " << FMath::FAccurater( X, iFX, N) << std::endl;
+
+
+  return 0;
+}
diff --git a/Tests/Utils/testUnifInterpolator.cpp b/Tests/Utils/testUnifInterpolator.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..3ef383da4cfdddb8d76b7a4768cbd52a84ad1046
--- /dev/null
+++ b/Tests/Utils/testUnifInterpolator.cpp
@@ -0,0 +1,513 @@
+// ===================================================================================
+// Ce LOGICIEL "ScalFmm" est couvert par le copyright Inria 20xx-2012.
+// Inria détient tous les droits de propriété sur le LOGICIEL, et souhaite que
+// la communauté scientifique l'utilise afin de le tester et de l'évaluer.
+// Inria donne gracieusement le droit d'utiliser ce LOGICIEL. Toute utilisation
+// dans un but lucratif ou à des fins commerciales est interdite sauf autorisation
+// expresse et préalable d'Inria.
+// Toute utilisation hors des limites précisées ci-dessus et réalisée sans l'accord
+// expresse préalable d'Inria constituerait donc le délit de contrefaçon.
+// Le LOGICIEL étant un produit en cours de développement, Inria ne saurait assurer
+// aucune responsabilité et notamment en aucune manière et en aucun cas, être tenu
+// de répondre d'éventuels dommages directs ou indirects subits par l'utilisateur.
+// Tout utilisateur du LOGICIEL s'engage à communiquer à Inria ses remarques
+// relatives à l'usage du LOGICIEL
+// ===================================================================================
+
+// ==== CMAKE =====
+// @FUSE_BLAS
+// ================
+
+
+#include <iostream>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "../../Src/Utils/FTic.hpp"
+#include "../../Src/Utils/FMath.hpp"
+#include "../../Src/Utils/FBlas.hpp"
+
+#include "../../Src/Containers/FVector.hpp"
+
+#include "../../Src/Utils/FAssert.hpp"
+#include "../../Src/Utils/FPoint.hpp"
+
+
+#include "../../Src/Kernels/Uniform/FUnifInterpolator.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
+#include "../../Src/Kernels/Uniform/FUnifTensor.hpp"
+
+// Check DFT
+#include "../../Src/Utils/FDft.hpp"
+#include "../../Src/Utils/FComplexe.hpp"
+
+
+#include "../../Src/Kernels/P2P/FP2PParticleContainer.hpp"
+#include "../../Src/Components/FSimpleLeaf.hpp"
+
+
+
+
+/**
+ * In this file we show how to use octree
+ */
+
+int main(int, char **){
+  typedef FP2PParticleContainer ContainerClass;
+  typedef FSimpleLeaf<ContainerClass> LeafClass;
+  typedef FInterpMatrixKernelR MatrixKernelClass;
+
+
+  ///////////////////////What we do/////////////////////////////
+  std::cout << "\nTask: Compute interactions between source particles in leaf Y and target\n";
+  std::cout << " particles in leaf X. Compare the fast summation K ~ Lx K Ly' with the\n";
+  std::cout << " direct computation.\n" << std::endl;
+  //////////////////////////////////////////////////////////////
+
+  MatrixKernelClass MatrixKernel;
+  const FReal FRandMax = FReal(RAND_MAX);
+  FTic time;
+
+
+  // Leaf size
+  FReal width = FReal(3.723);
+
+  ////////////////////////////////////////////////////////////////////
+  LeafClass X;
+  FPoint cx(0., 0., 0.);
+  const unsigned long M = 20000;
+  std::cout << "Fill the leaf X of width " << width
+            << " centered at cx=" << cx << " with M=" << M << " target particles" << std::endl;
+  {
+    for(unsigned long i=0; i<M; ++i){
+      FReal x = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getX();
+      FReal y = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getY();
+      FReal z = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getZ();
+      X.push(FPoint(x, y, z), FReal(rand())/FRandMax);
+    }
+  }
+
+
+  ////////////////////////////////////////////////////////////////////
+  LeafClass Y;
+  //  FPoint cy(FReal(2.)*width, 0., 0.);
+  FPoint cy(FReal(2.)*width, FReal(2.)*width, 0.);
+
+  const unsigned long N = 20000;
+  std::cout << "Fill the leaf Y of width " << width
+            << " centered at cy=" << cy	<< " with N=" << N << " target particles" << std::endl;
+  {
+    for(unsigned long i=0; i<N; ++i){
+      FReal x = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getX();
+      FReal y = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getY();
+      FReal z = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getZ();
+      Y.push(FPoint(x, y, z), FReal(rand())/FRandMax);
+    }
+  }
+
+
+
+  ////////////////////////////////////////////////////////////////////
+  // approximative computation
+  const unsigned int ORDER = 3;
+  const unsigned int nnodes = TensorTraits<ORDER>::nnodes;
+  typedef FUnifInterpolator<ORDER> InterpolatorClass;
+  InterpolatorClass S;
+
+  std::cout << "\nCompute interactions approximatively, interpolation order = " << ORDER << " ..." << std::endl;
+
+  std::cout << "\nP2M ... " << std::flush;
+  time.tic();
+  // Anterpolate: W_n = \sum_j^N S(y_j,\bar y_n) * w_j
+  FReal W[nnodes]; // multipole expansion
+  S.applyP2M(cy, width, W, Y.getSrc()); // the multipole expansions are set to 0 in S.applyP2M
+  std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;
+
+  std::cout << "M2L ... " << std::flush;
+  time.tic();
+  // Multipole to local: F_m = \sum_n^L K(\bar x_m, \bar y_n) * W_n
+  FPoint rootsX[nnodes], rootsY[nnodes];
+  FUnifTensor<ORDER>::setRoots(cx, width, rootsX);
+  FUnifTensor<ORDER>::setRoots(cy, width, rootsY);
+
+  FReal F[nnodes]; // local expansion
+  for (unsigned int i=0; i<nnodes; ++i) {
+    F[i] = FReal(0.);
+    for (unsigned int j=0; j<nnodes; ++j){
+
+      F[i] += MatrixKernel.evaluate(rootsX[i], rootsY[j]) * W[j];
+
+    }
+  }
+  std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;
+
+
+  ////////////////////////////////////////////////////////////////////////////
+  // Store M2L in K and apply K
+  FReal K[nnodes*nnodes]; // local expansion
+  for (unsigned int i=0; i<nnodes; ++i) {
+    for (unsigned int j=0; j<nnodes; ++j){
+      K[i*nnodes+j] = MatrixKernel.evaluate(rootsX[i], rootsY[j]);
+    }
+  }
+  std::cout<< "Apply M2L in usual sense: ";
+  time.tic();
+
+  for (unsigned int i=0; i<nnodes; ++i) {
+    F[i] = FReal(0.);
+    for (unsigned int j=0; j<nnodes; ++j){
+
+      F[i] += K[i*nnodes+j] * W[j];
+
+    }
+  }
+
+  time.tac();
+  std::cout << "took " << time.elapsed() << "sec." << std::endl;
+
+  // PB: Verify storage improvement works (indexing etc...)
+  // 1) store circulant matrix
+  const unsigned int rc = (2*ORDER-1)*(2*ORDER-1)*(2*ORDER-1);
+  FReal C[rc];
+
+  //  std::cout << "2*order-1=" << 2*ORDER-1 << std::endl;
+  unsigned int li,lj;
+  unsigned int mi,mj;
+  unsigned int ni,nj;
+  unsigned int idi, idj, ido=0;
+
+  for(unsigned int l=0; l<2*ORDER-1; ++l)
+    for(unsigned int m=0; m<2*ORDER-1; ++m)
+      for(unsigned int n=0; n<2*ORDER-1; ++n){
+
+        // compute position in nnodes vector of roots
+        if(l<ORDER-1) lj=ORDER-1; else lj=0;
+        if(m<ORDER-1) mj=ORDER-1; else mj=0;
+        if(n<ORDER-1) nj=ORDER-1; else nj=0;
+        li=(l-(ORDER-1))+lj; mi=(m-(ORDER-1))+mj; ni=(n-(ORDER-1))+nj;
+
+        idi=li*ORDER*ORDER + mi*ORDER + ni;
+        idj=lj*ORDER*ORDER + mj*ORDER + nj;
+
+        C[ido]
+          = MatrixKernel.evaluate(rootsX[idi], rootsY[idj]);
+        ido++;
+      }
+//  // Display C (gathers every values of K that need to be stored,
+//  // corresponds to the first line of the padded matrix (reverse order?))
+//  std::cout<<"C="<<std::endl;
+//  for (unsigned int n=0; n<rankpad; ++n)
+//    std::cout<< C[ido] << ", ";
+//  std::cout<<std::endl;
+
+  typedef FUnifTensor<ORDER> TensorType;
+  unsigned int node_diff[nnodes*nnodes];
+  TensorType::setNodeIdsDiff(node_diff);
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // K is a block Toeplitz matrix
+  // i.e. a blockwise Toeplitz matrix where the block also have the Toeplitz structure.
+  // e.g. for ORDER=3: K=[K_{1,1} K_{1,2} K_{1,3},  where K_{i,j}=[k11 k12 k13,
+  //                      K_{2,1} K_{1,1} K_{1,2},                 k21 k11 k12,
+  //                      K_{3,1} K_{2,1} K_{1,1}];                k31 k21 k11];
+  // K is of size order^3 x order^3
+  // (i.e. order^2 x order^2 Toeplitz blocks of size order x order),
+  // K is very close to be Toeplitz itself and even circulant.
+  // In order to actually embed K into a circulant matrix C one just
+  // needs to insert (ORDER-1) extra lines/columns (to each block).
+
+//  std::cout<< "K=" <<std::endl;
+//  for (unsigned int i=0; i<nnodes; ++i){
+//    for (unsigned int j=0; j<nnodes; ++j){
+//      std::cout<< K[i*nnodes+j]<<", ";
+//    }
+//    std::cout<<std::endl;
+//  }
+//  std::cout<<std::endl;
+
+//  // Check matrix node_diff
+//  std::cout<< "node_diff=" <<std::endl;
+//  for (unsigned int i=0; i<nnodes; ++i){
+//    for (unsigned int j=0; j<nnodes; ++j){
+//      std::cout<< node_diff[i*nnodes+j] <<", ";
+//    }
+//    std::cout<<std::endl;
+//  }
+//  std::cout<<std::endl;
+
+//  // Expected ido for the (2*ORDER-1)^3x(2*ORDER-1)^3 circulant matrix
+//  for (unsigned int i=0; i<rc; ++i){
+//    for (unsigned int j=0; j<rc; ++j){
+//      if(i>j) std::cout<< i-j-1 << ", ";
+//      else std::cout<< rc+i-j-1 << ", ";
+//    } std::cout<<std::endl;
+//  } std::cout<<std::endl;
+
+//  // kernel evaluated at previous ido returns a circulant matrix
+//  for (unsigned int i=0; i<rc/2; ++i){
+//    for (unsigned int j=0; j<rc/2; ++j){
+//      if(i>j) std::cout<< C[i-j-1] << ", ";
+//      else std::cout<< C[rc+i-j-1] << ", ";
+//    } std::cout<<std::endl;
+//  } std::cout<<std::endl;
+
+  // In 1D the Zero Padding consists in
+  // inserting ORDER-1 zeros in the multipole exp
+  // in order to apply the (ORDER+ORDER-1)x(ORDER+ORDER-1)
+  // circulant matrix to it.
+  // Let us extend it to the 3D case:
+  FReal MultExp[nnodes]; FReal PaddedMultExp[rc];
+  for (unsigned int i=0; i<nnodes; ++i) MultExp[i]=W[i];
+  FReal LocalExp[nnodes]; FReal PaddedLocalExp[rc];
+  FBlas::setzero(nnodes,LocalExp);
+  FBlas::setzero(rc,PaddedLocalExp);
+
+  std::cout<< "Expected LocalExp: "<<std::endl;
+  for (unsigned int i=0; i<nnodes; ++i)
+    std::cout<< F[i] << ", ";
+  std::cout<<std::endl;
+
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Application of circulant Toeplitz system in PHYSICAL SPACE
+  std::cout<< "Apply circulant M2L in Physical space: ";
+  time.tic();
+  for (unsigned int i=0; i<nnodes; ++i){
+
+    // Pad Multipole Expansion with respect to current row
+    FBlas::setzero(rc,PaddedMultExp);
+    for (unsigned int j=0; j<nnodes; ++j)
+      PaddedMultExp[node_diff[i*nnodes+j]]=MultExp[j];
+
+//    std::cout<< "Padded MultExp for row i=" << i << ": "<<std::endl;
+//    for (unsigned int p=0; p<rc; ++p)
+//      std::cout<< PaddedMultExp[p] << ", ";
+//    std::cout<<std::endl;
+
+    // Application of M2L in PHYSICAL SPACE
+    for (unsigned int pj=0; pj<rc; ++pj)
+      LocalExp[i]+=C[pj]*PaddedMultExp[pj];
+
+  }// end i
+  time.tac();
+  std::cout << "took " << time.elapsed() << "sec." << std::endl;
+
+  std::cout<< "LocalExp via product in PHYSICAL SPACE: "<<std::endl;
+  for (unsigned int p=0; p<nnodes; ++p)
+    std::cout<< LocalExp[p] << ", ";
+  std::cout<<std::endl;
+  std::cout<<std::endl;
+
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Efficient application of the Toeplitz system in FOURIER SPACE
+  FComplexe FPMultExp[rc];
+  FComplexe FPLocalExp[rc];
+  FReal PLocalExp[rc];
+
+  //for (unsigned int n=0; n<rc; ++n) FPLocalExp[n]=FComplexe(0.0,0.0);
+  FBlas::c_setzero(rc,reinterpret_cast<FReal*>(FPLocalExp));
+
+  FBlas::setzero(rc,PLocalExp);
+
+  // Init DFT
+  //FDft Dft(rc); // direct version
+  FFft Dft(rc); // fast version
+
+  // Get first COLUMN of K and Store in T
+  FReal T[rc];
+  // use permutations
+  unsigned int perm[rc];
+  for(unsigned int p=0; p<rc; ++p){
+    if(p>0) perm[p]=p-1;
+    else perm[p]=rc+p-1;
+//    std::cout << "perm["<< p << "]="<< perm[p] << std::endl;
+  }
+
+  for (unsigned int i=0; i<rc; ++i){
+    // keep this lines commented to see what permutation accounts for:
+    //  for (unsigned int j=0; j<rc; ++j){
+//      if(i>0) T[i]=C[i-0-1];
+//      else T[i]=C[rc+i-0-1];
+      T[i]=C[perm[i]];
+  //  }
+  }
+
+//  std::cout<< "First column of C[rc x rc]: "<<std::endl;
+//  for (unsigned int p=0; p<rc; ++p)
+//    std::cout<< T[p] << ", ";
+//  std::cout<<std::endl;
+
+  // Apply DFT to T
+  FComplexe FT[rc];
+  //  for (unsigned int n=0; n<rc; ++n) FT[n]=FComplexe(0.0,0.0);
+  FBlas::c_setzero(rc,reinterpret_cast<FReal*>(FT));
+
+  // if first COLUMN (T) of C is used
+  Dft.applyDFT(T,FT);
+//  // if first ROW of C is used
+//  Dft.applyDFT(C,FT);
+
+  // Pad physical MultExp
+  FBlas::setzero(rc,PaddedMultExp); //part of padding
+  for (unsigned int j=0; j<nnodes; ++j){
+    // if first COLUMN (T) of C is used
+    PaddedMultExp[node_diff[j*nnodes]]=MultExp[j];
+//    // if first ROW of C is used
+//    PaddedMultExp[node_diff[j]]=MultExp[j];
+  }
+
+//    std::cout<< "Padded MultExp for row i=" << i << ": "<<std::endl;
+//    for (unsigned int p=0; p<rc; ++p)
+//      std::cout<< PaddedMultExp[p] << ", ";
+//    std::cout<<std::endl;
+
+
+  // Set transformed MultExp to 0
+  //  for (unsigned int n=0; n<rc; ++n) FPMultExp[n]=FComplexe(0.0,0.0);
+  FBlas::c_setzero(rc,reinterpret_cast<FReal*>(FPMultExp));
+
+  // Transform PaddedMultExp
+  Dft.applyDFT(PaddedMultExp,FPMultExp);
+
+  std::cout<< "Apply M2L in Fourier space: ";
+  time.tic();
+
+  // Application of M2L in FOURIER SPACE
+  // > Use FMkl::c_had for hadamard product
+  // if mkl is used as blas (TODO otherwise use FBlas::c_had())
+//  FMkl::c_had(rc,reinterpret_cast<FReal*>(FT),
+//            reinterpret_cast<FReal*>(FPMultExp),
+//            reinterpret_cast<FReal*>(FPLocalExp));
+  // > or perform entrywise product manually
+  for (unsigned int pj=0; pj<rc; ++pj){
+    FPLocalExp[pj]=FT[pj];
+    FPLocalExp[pj]*=FPMultExp[pj];
+  }
+
+  time.tac();
+  std::cout << "took " << time.elapsed() << "sec." << std::endl;
+
+//    std::cout<< "Transfo Padded LocalExp: "<<std::endl;
+//    for (unsigned int p=0; p<rc; ++p)
+//      std::cout<< FPLocalExp[p] << ", ";
+//    std::cout<<std::endl;
+
+  Dft.applyIDFT(FPLocalExp,PLocalExp);
+
+//  std::cout<< "Padded LocalExp: "<<std::endl;
+//  for (unsigned int p=0; p<rc; ++p)
+//    std::cout<< PLocalExp[p] << ", ";
+//  std::cout<<std::endl;
+
+  // Unpad
+  for (unsigned int j=0; j<nnodes; ++j){
+    // if first COLUMN (T) of C is used
+    LocalExp[j]=PLocalExp[node_diff[nnodes-j-1]];
+//    // if first ROW of C is used
+//    LocalExp[j]=PLocalExp[node_diff[j*nnodes]];
+  }
+
+//  std::cout<< "Mask to be applied to Padded LocalExp: "<<std::endl;
+//  for (unsigned int j=0; j<nnodes; ++j)
+//    std::cout<< node_diff[nnodes-j-1] << ", ";
+//  std::cout<<std::endl;
+
+  std::cout<< "LocalExp via product in FOURIER SPACE: "<<std::endl;
+  for (unsigned int p=0; p<nnodes; ++p)
+    std::cout<< LocalExp[p] << ", ";
+  std::cout<<std::endl;
+
+
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << "L2P (potential) ... " << std::flush;
+  time.tic();
+  // Interpolate p_i = \sum_m^L S(x_i,\bar x_m) * F_m
+  S.applyL2P(cx, width, F, X.getTargets());
+  std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;
+
+  std::cout << "L2P (forces) ... " << std::flush;
+  time.tic();
+  // Interpolate f_i = \sum_m^L P(x_i,\bar x_m) * F_m
+  S.applyL2PGradient(cx, width, F, X.getTargets());
+  std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;
+
+  ////////////////////////////////////////////////////////////////////
+  // direct computation
+  std::cout << "Compute interactions directly ..." << std::endl;
+  time.tic();
+
+  FReal* approx_f = new FReal [M * 3];
+  FReal*        f = new FReal [M * 3];
+  FBlas::setzero(M*3, f);
+
+  FReal* approx_p = new FReal[M];
+  FReal*        p = new FReal[M];
+  FBlas::setzero(M, p);
+
+  { // start direct computation
+    unsigned int counter = 0;
+
+    for(int idxPartX = 0 ; idxPartX < X.getSrc()->getNbParticles() ; ++idxPartX){
+      const FPoint x = FPoint(X.getSrc()->getPositions()[0][idxPartX],
+                              X.getSrc()->getPositions()[1][idxPartX],
+                              X.getSrc()->getPositions()[2][idxPartX]);
+      const FReal  wx = X.getSrc()->getPhysicalValues()[idxPartX];
+
+      for(int idxPartY = 0 ; idxPartY < Y.getSrc()->getNbParticles() ; ++idxPartY){
+        const FPoint y = FPoint(Y.getSrc()->getPositions()[0][idxPartY],
+                                Y.getSrc()->getPositions()[1][idxPartY],
+                                Y.getSrc()->getPositions()[2][idxPartY]);
+        const FReal  wy = Y.getSrc()->getPhysicalValues()[idxPartY];
+
+        const FReal one_over_r = MatrixKernel.evaluate(x, y);
+        // potential
+        p[counter] += one_over_r * wy;
+        // force
+        FPoint force(y - x);
+        force *= one_over_r*one_over_r*one_over_r;
+        f[counter*3 + 0] += force.getX() * wx * wy;
+        f[counter*3 + 1] += force.getY() * wx * wy;
+        f[counter*3 + 2] += force.getZ() * wx * wy;
+      }
+
+      counter++;
+    }
+  } // end direct computation
+
+
+  time.tac();
+  std::cout << "Done in " << time.elapsed() << "sec." << std::endl;
+
+
+  ////////////////////////////////////////////////////////////////////
+  unsigned int counter = 0;
+  for(int idxPartX = 0 ; idxPartX < X.getSrc()->getNbParticles() ; ++idxPartX){
+    approx_p[counter] = X.getSrc()->getPotentials()[idxPartX];
+    const FPoint force = FPoint(X.getSrc()->getForcesX()[idxPartX],
+                                X.getSrc()->getForcesY()[idxPartX],
+                                X.getSrc()->getForcesZ()[idxPartX]);
+    approx_f[counter*3 + 0] = force.getX();
+    approx_f[counter*3 + 1] = force.getY();
+    approx_f[counter*3 + 2] = force.getZ();
+
+    counter++;
+  }
+
+  std::cout << "\nPotential error:" << std::endl;
+  std::cout << "Relative error   = " << FMath::FAccurater( p, approx_p, M) << std::endl;
+
+  std::cout << "\nForce error:" << std::endl;
+  std::cout << "Relative L2 error   = " << FMath::FAccurater( f, approx_f, M*3) << std::endl;
+  std::cout << std::endl;
+
+  // free memory
+  delete [] approx_p;
+  delete [] p;
+  delete [] approx_f;
+  delete [] f;
+
+
+  return 0;
+}
diff --git a/UTests/utestChebyshevDirect.cpp b/UTests/utestChebyshevDirect.cpp
index c71fc1c1b9ed4b708739d66d93a323a7433cf4d3..0d87c0d5729b06f8fd0d793c0e5d1c822de35ad0 100755
--- a/UTests/utestChebyshevDirect.cpp
+++ b/UTests/utestChebyshevDirect.cpp
@@ -35,7 +35,7 @@
 
 
 #include "../Src/Kernels/Chebyshev/FChebCell.hpp"
-#include "../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../Src/Kernels/Chebyshev/FChebKernel.hpp"
 #include "../Src/Kernels/Chebyshev/FChebSymKernel.hpp"
 
@@ -203,7 +203,7 @@ class TestChebyshevDirect : public FUTester<TestChebyshevDirect> {
         const FReal epsilon = FReal(1e-5);
         typedef FP2PParticleContainerIndexed ContainerClass;
         typedef FSimpleLeaf<ContainerClass> LeafClass;
-		typedef FChebMatrixKernelR MatrixKernelClass;
+		typedef FInterpMatrixKernelR MatrixKernelClass;
 		typedef FChebCell<ORDER> CellClass;
         typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
         typedef FChebKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
@@ -218,7 +218,7 @@ class TestChebyshevDirect : public FUTester<TestChebyshevDirect> {
         const FReal epsilon = FReal(1e-5);
         typedef FP2PParticleContainerIndexed ContainerClass;
         typedef FSimpleLeaf<ContainerClass> LeafClass;
-		typedef FChebMatrixKernelR MatrixKernelClass;
+		typedef FInterpMatrixKernelR MatrixKernelClass;
 		typedef FChebCell<ORDER> CellClass;
         typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
         typedef FChebSymKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
diff --git a/UTests/utestChebyshevDirectPeriodic.cpp b/UTests/utestChebyshevDirectPeriodic.cpp
index a7e821489a20284cae4d34406cd482059e25a182..944db45165a5826f4cbcd291f271b4f59db432b1 100755
--- a/UTests/utestChebyshevDirectPeriodic.cpp
+++ b/UTests/utestChebyshevDirectPeriodic.cpp
@@ -32,7 +32,7 @@
 
 
 #include "../Src/Kernels/Chebyshev/FChebCell.hpp"
-#include "../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../Src/Kernels/Chebyshev/FChebKernel.hpp"
 #include "../Src/Kernels/Chebyshev/FChebSymKernel.hpp"
 
@@ -225,7 +225,7 @@ class TestChebyshevDirect : public FUTester<TestChebyshevDirect> {
         const FReal epsilon = FReal(1e-5);
         typedef FP2PParticleContainerIndexed ContainerClass;
         typedef FSimpleLeaf<ContainerClass> LeafClass;
-		typedef FChebMatrixKernelR MatrixKernelClass;
+		typedef FInterpMatrixKernelR MatrixKernelClass;
 		typedef FChebCell<ORDER> CellClass;
         typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
         typedef FChebKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
@@ -240,7 +240,7 @@ class TestChebyshevDirect : public FUTester<TestChebyshevDirect> {
         const FReal epsilon = FReal(1e-5);
         typedef FP2PParticleContainerIndexed ContainerClass;
         typedef FSimpleLeaf<ContainerClass> LeafClass;
-		typedef FChebMatrixKernelR MatrixKernelClass;
+		typedef FInterpMatrixKernelR MatrixKernelClass;
 		typedef FChebCell<ORDER> CellClass;
         typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
         typedef FChebSymKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
diff --git a/UTests/utestChebyshevMultiRhs.cpp b/UTests/utestChebyshevMultiRhs.cpp
index d86fd0c0d6fe2c6de2ca23b14529f53a73f937d1..b2634d28db2c507ac52b03d71740b01ba4422fab 100644
--- a/UTests/utestChebyshevMultiRhs.cpp
+++ b/UTests/utestChebyshevMultiRhs.cpp
@@ -35,7 +35,7 @@
 
 
 #include "../Src/Kernels/Chebyshev/FChebCell.hpp"
-#include "../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
+#include "../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "../Src/Kernels/Chebyshev/FChebKernel.hpp"
 #include "../Src/Kernels/Chebyshev/FChebSymKernel.hpp"
 
@@ -226,7 +226,7 @@ class TestChebyshevDirect : public FUTester<TestChebyshevDirect> {
         const FReal epsilon = FReal(1e-5);
         typedef FP2PParticleContainerIndexed ContainerClass;
         typedef FSimpleLeaf<ContainerClass> LeafClass;
-        typedef FChebMatrixKernelR MatrixKernelClass;
+        typedef FInterpMatrixKernelR MatrixKernelClass;
         typedef FChebCell<ORDER, NbRhs> CellClass;
         typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
         typedef FChebKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER, NbRhs> KernelClass;
@@ -242,7 +242,7 @@ class TestChebyshevDirect : public FUTester<TestChebyshevDirect> {
         const FReal epsilon = FReal(1e-5);
         typedef FP2PParticleContainerIndexed ContainerClass;
         typedef FSimpleLeaf<ContainerClass> LeafClass;
-        typedef FChebMatrixKernelR MatrixKernelClass;
+        typedef FInterpMatrixKernelR MatrixKernelClass;
         typedef FChebCell<ORDER, NbRhs> CellClass;
         typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
         typedef FChebSymKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER, NbRhs> KernelClass;