diff --git a/Src/Kernels/Chebyshev/FChebFlopsSymKernel.hpp b/Src/Kernels/Chebyshev/FChebFlopsSymKernel.hpp
index 58dc8d5459c47ab8a8af8bae08609eefbec83aaa..cf61e82f910b58058dbe9589f834631de798ff64 100755
--- a/Src/Kernels/Chebyshev/FChebFlopsSymKernel.hpp
+++ b/Src/Kernels/Chebyshev/FChebFlopsSymKernel.hpp
@@ -25,7 +25,7 @@
 #include "../../Components/FAbstractKernels.hpp"
 
 #include "./FChebInterpolator.hpp"
-#include "./FChebSymmetries.hpp"
+#include "../Interpolation/FInterpSymmetries.hpp"
 
 class FTreeCoordinate;
 
@@ -322,7 +322,7 @@ struct FChebFlopsSymKernel<CellClass, ContainerClass, MatrixKernelClass, ORDER>
 		}
 			
 		// set permutation vector and indices
-		const FChebSymmetries<ORDER> Symmetries;
+		const FInterpSymmetries<ORDER> Symmetries;
 		for (int i=-3; i<=3; ++i)
 			for (int j=-3; j<=3; ++j)
 				for (int k=-3; k<=3; ++k)
diff --git a/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp b/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
index bd205555f55635dd4e85c142cdd4bb9667d1b36a..d3f46e29d109a5e43d27e792c251bf0262a475d3 100755
--- a/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
+++ b/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
@@ -21,7 +21,7 @@
 #include "../../Utils/FBlas.hpp"
 
 #include "./FChebTensor.hpp"
-#include "./FChebSymmetries.hpp"
+#include "../Interpolation/FInterpSymmetries.hpp"
 #include "./FChebM2LHandler.hpp"
 
 /**
@@ -539,7 +539,7 @@ public:
 		}
 			
 		// set permutation vector and indices
-		const FChebSymmetries<ORDER> Symmetries;
+		const FInterpSymmetries<ORDER> Symmetries;
 		for (int i=-3; i<=3; ++i)
 			for (int j=-3; j<=3; ++j)
 				for (int k=-3; k<=3; ++k) {
@@ -620,7 +620,7 @@ public:
 		
 
 		// set permutation vector and indices
-		const FChebSymmetries<ORDER> Symmetries;
+		const FInterpSymmetries<ORDER> Symmetries;
 		for (int i=-3; i<=3; ++i)
 			for (int j=-3; j<=3; ++j)
 				for (int k=-3; k<=3; ++k) {
diff --git a/Src/Kernels/Chebyshev/FChebSymmetries.hpp b/Src/Kernels/Interpolation/FInterpSymmetries.hpp
similarity index 96%
rename from Src/Kernels/Chebyshev/FChebSymmetries.hpp
rename to Src/Kernels/Interpolation/FInterpSymmetries.hpp
index 699514f3c6bbe9fdc310e184dcda8c0dbefd43bd..24693aead9a3da01feca2316603b16220b951cc0 100755
--- a/Src/Kernels/Chebyshev/FChebSymmetries.hpp
+++ b/Src/Kernels/Interpolation/FInterpSymmetries.hpp
@@ -13,8 +13,8 @@
 // "http://www.cecill.info". 
 // "http://www.gnu.org/licenses".
 // ===================================================================================
-#ifndef FCHEBSYMMETRIES_HPP
-#define FCHEBSYMMETRIES_HPP
+#ifndef FINTERPSYMMETRIES_HPP
+#define FINTERPSYMMETRIES_HPP
 
 #include <climits>
 
@@ -25,12 +25,12 @@
  */
 
 /**
- * @class FChebSymmetries
+ * @class FInterpSymmetries
  *
- * The class @p FChebSymmetries exploits all symmetries
+ * The class @p FInterpSymmetries exploits all symmetries
  */
 template <int ORDER>
-class FChebSymmetries
+class FInterpSymmetries
 {
 	enum {nnodes = ORDER*ORDER*ORDER};
 
@@ -49,11 +49,11 @@ class FChebSymmetries
 		return  (sk | sj | si);
 	}
 
-
+  
 	public:
 
 	/** Constructor */
-	FChebSymmetries()
+	FInterpSymmetries()
 	{
 		// permutations for 8 quadrants
 		unsigned int quads[8][nnodes];
diff --git a/Src/Kernels/Uniform/FUnifInterpolator.hpp b/Src/Kernels/Uniform/FUnifInterpolator.hpp
index 4f7acccbf064c7643a97c12e3436148f56835b3a..c4c4f669bfc0ca586553440e728122fee625094a 100755
--- a/Src/Kernels/Uniform/FUnifInterpolator.hpp
+++ b/Src/Kernels/Uniform/FUnifInterpolator.hpp
@@ -45,8 +45,6 @@ class FUnifInterpolator : FNoCopyable
   typedef FUnifRoots< ORDER>  BasisType;
   typedef FUnifTensor<ORDER> TensorType;
 
-  //  FReal T_of_roots[ORDER][ORDER];
-  //  FReal T[ORDER * (ORDER-1)];
   unsigned int node_ids[nnodes][3];
   FReal* ChildParentInterpolator[8];
 
@@ -56,12 +54,13 @@ class FUnifInterpolator : FNoCopyable
   ////////////////////////////////////////////////////////////////////
 
 
-
+  // PB: use improved version of M2M/L2L
   /**
    * Initialize the child - parent - interpolator, it is basically the matrix
    * S which is precomputed and reused for all M2M and L2L operations, ie for
    * all non leaf inter/anterpolations.
    */
+/*
   void initM2MandL2L()
   {
     FPoint ParentRoots[nnodes], ChildRoots[nnodes];
@@ -86,6 +85,7 @@ class FUnifInterpolator : FNoCopyable
       assembleInterpolator(nnodes, ChildRoots, ChildParentInterpolator[child]);
     }
   }
+*/
 
   /**
    * Initialize the child - parent - interpolator, it is basically the matrix
@@ -141,22 +141,10 @@ public:
    */
   explicit FUnifInterpolator()
   {
-    //    // initialize chebyshev polynomials of root nodes: T_o(x_j)
-    //    for (unsigned int o=1; o<ORDER; ++o)
-    //      for (unsigned int j=0; j<ORDER; ++j)
-    //        T_of_roots[o][j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
-    //
-    //    // initialize chebyshev polynomials of root nodes: T_o(x_j)
-    //    for (unsigned int o=1; o<ORDER; ++o)
-    //      for (unsigned int j=0; j<ORDER; ++j)
-    //        T[(o-1)*ORDER + j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
-
-
     // initialize root node ids
     TensorType::setNodeIds(node_ids);
 
-    // initialize interpolation operator for non M2M and L2L (non leaf
-    // operations)
+    // initialize interpolation operator for M2M and L2L (non leaf operations)
     //this -> initM2MandL2L();     // non tensor-product interpolation
     this -> initTensorM2MandL2L(); // tensor-product interpolation
   }
@@ -278,7 +266,7 @@ public:
                      const FReal *const localExpansion,
                      ContainerClass *const localParticles) const;
 
-
+  // PB: ORDER^6 version of applyM2M/L2L
   /*
     void applyM2M(const unsigned int ChildIndex,
     const FReal *const ChildExpansion,
@@ -299,9 +287,7 @@ public:
     }
   */
 
-  // PB: improvement of applyM2M/L2L can be preserved (TOFACTO)
-  // since relative position of child and parent interpolation is remains unchanged
-
+  // PB: improved version of applyM2M/L2L also applies to Lagrange interpolation
   void applyM2M(const unsigned int ChildIndex,
                 const FReal *const ChildExpansion,
                 FReal *const ParentExpansion) const
@@ -506,10 +492,10 @@ inline void FUnifInterpolator<ORDER>::applyL2PGradient(const FPoint& center,
 
     // evaluate Lagrange polynomials of source particle
     for (unsigned int o=0; o<ORDER; ++o) {
-      L_of_x[o][0] = BasisType::L(o, localPosition.getX()); // 3 * ORDER*(ORDER-1) flops PB: TODO confirm
+      L_of_x[o][0] = BasisType::L(o, localPosition.getX()); // 3 * ORDER*(ORDER-1) flops 
       L_of_x[o][1] = BasisType::L(o, localPosition.getY()); // 3 * ORDER*(ORDER-1) flops
       L_of_x[o][2] = BasisType::L(o, localPosition.getZ()); // 3 * ORDER*(ORDER-1) flops
-      dL_of_x[o][0] = BasisType::dL(o, localPosition.getX()); // TODO verify 3 * ORDER*(ORDER-1) flops PB: TODO confirm
+      dL_of_x[o][0] = BasisType::dL(o, localPosition.getX()); // TODO verify 3 * ORDER*(ORDER-1) flops
       dL_of_x[o][1] = BasisType::dL(o, localPosition.getY()); // TODO verify 3 * ORDER*(ORDER-1) flops
       dL_of_x[o][2] = BasisType::dL(o, localPosition.getZ()); // TODO verify 3 * ORDER*(ORDER-1) flops
     }
diff --git a/Src/Kernels/Uniform/FUnifKernel.hpp b/Src/Kernels/Uniform/FUnifKernel.hpp
index 4bd37ed90025df313ee5f8bdeb02e42974fc27bf..2b4f2f1c6bdd936f2b7b9e231690ec02a02c5f9c 100644
--- a/Src/Kernels/Uniform/FUnifKernel.hpp
+++ b/Src/Kernels/Uniform/FUnifKernel.hpp
@@ -96,7 +96,7 @@ public:
   {
     for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
       // 1) apply Sy
-      FBlas::scal(AbstractBaseClass::nnodes*2, FReal(0.), ParentCell->getMultipole(idxRhs));
+      FBlas::scal(AbstractBaseClass::nnodes, FReal(0.), ParentCell->getMultipole(idxRhs));
       for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex){
         if (ChildCells[ChildIndex]){
           AbstractBaseClass::Interpolator->applyM2M(ChildIndex, ChildCells[ChildIndex]->getMultipole(idxRhs),
@@ -170,7 +170,6 @@ public:
            const int /*TreeLevel*/)
   {
     for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
-
       // 1) Apply Inverse Discete Fourier Transform
       M2LHandler->unapplyZeroPaddingAndDFT(ParentCell->getTransformedLocal(idxRhs),
                                            const_cast<CellClass*>(ParentCell)->getLocal(idxRhs));
diff --git a/Src/Kernels/Uniform/FUnifM2LHandler.hpp b/Src/Kernels/Uniform/FUnifM2LHandler.hpp
index 8a6c2fee98bfbca66077d1e28c412dbcf07769cc..20a2f4a8e186dd1d4f8117a0f7309477b4d9df8e 100644
--- a/Src/Kernels/Uniform/FUnifM2LHandler.hpp
+++ b/Src/Kernels/Uniform/FUnifM2LHandler.hpp
@@ -64,7 +64,7 @@ class FUnifM2LHandler : FNoCopyable
   unsigned int opt_rc; 
 
   typedef FUnifTensor<ORDER> TensorType;
-  unsigned int node_diff[nnodes*nnodes]; // PB: used in applyC to get id=i-j
+  unsigned int node_diff[nnodes*nnodes];
 
   //  FDft Dft; // Direct Discrete Fourier Transformator
   FFft Dft; // Fast Discrete Fourier Transformator
@@ -222,18 +222,12 @@ public:
     FReal Py[rc];
     FBlas::setzero(rc,Py);
 
-    FComplexe tmpFY[rc]; // not mandatory?
-
     // Apply Zero Padding
     for (unsigned int i=0; i<nnodes; ++i)
       Py[node_diff[i*nnodes]]=y[i];
 
     // Apply forward Discrete Fourier Transform
-    Dft.applyDFT(Py,tmpFY);
-
-    // not mandatory?
-    for (unsigned int j=0; j<rc; ++j) // could be opt_rc
-      FY[j]+=tmpFY[j];
+    Dft.applyDFT(Py,FY);
 
   }
 
diff --git a/Src/Kernels/Uniform/FUnifRoots.hpp b/Src/Kernels/Uniform/FUnifRoots.hpp
index b3897e8efe5e9976b8577c8139993059ecd4a2ea..15f1b9f57249df8de7e9633a4bea2b56fb3112e3 100644
--- a/Src/Kernels/Uniform/FUnifRoots.hpp
+++ b/Src/Kernels/Uniform/FUnifRoots.hpp
@@ -63,14 +63,14 @@ struct FUnifRoots : FNoCopyable
       x = (x < FReal(-1.) ? FReal(-1.) : x);
     }
 
-    FReal tmpL=FReal(1.);
+    FReal L=FReal(1.);
     for(unsigned int m=0;m<order;++m){
       if(m!=n)
-        tmpL *= (x-FUnifRoots<order>::roots[m])/(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[m]);
+        L *= (x-FUnifRoots<order>::roots[m])/(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[m]);
       
     }
 
-    return FReal(tmpL);
+    return FReal(L);
   }
 
 
@@ -90,26 +90,22 @@ struct FUnifRoots : FNoCopyable
       x = (x < FReal(-1.) ? FReal(-1.) : x);
     }
 
-    FReal tmpdL;
-    FReal dL=FReal(0.);
-
+    // optimized variant
+    FReal NdL=FReal(0.);// init numerator
+    FReal DdL=FReal(1.);// init denominator
+    FReal tmpNdL;
     for(unsigned int p=0;p<order;++p){
       if(p!=n){
-        tmpdL=1./(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[p]);
-
-      for(unsigned int m=0;m<order;++m){
-        if(m!=n && m!=p)
-          tmpdL *= (x-FUnifRoots<order>::roots[m])/(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[m]);
-
-      }// m
-
-      dL+=tmpdL;
-      
+        tmpNdL=FReal(1.);
+        for(unsigned int m=0;m<order;++m)
+          if(m!=n && m!=p)
+            tmpNdL*=x-FUnifRoots<order>::roots[m];
+        NdL+=tmpNdL;
+        DdL*=FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[p];
       }//endif
-
     }// p
 
-    return FReal(dL);
+    return FReal(NdL/DdL);
 
   }
 };
diff --git a/Src/Kernels/Uniform/FUnifSymKernel.hpp b/Src/Kernels/Uniform/FUnifSymKernel.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..509585a6325c38dad226e9116781eb9a89d1b9c3
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifSymKernel.hpp
@@ -0,0 +1,471 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FUNIFSYMKERNEL_HPP
+#define FUNIFSYMKERNEL_HPP
+
+#include "../../Utils/FGlobal.hpp"
+#include "../../Utils/FTrace.hpp"
+#include "../../Utils/FSmartPointer.hpp"
+
+// Originally in M2LHandler but transferred to the kernel for the symmetric version
+#include "../../Utils/FDft.hpp" // PB: for FFT
+#include "../../Utils/FComplexe.hpp"
+#include "./FUnifTensor.hpp" // PB: for node_diff
+//
+
+#include "./FAbstractUnifKernel.hpp"
+#include "./FUnifInterpolator.hpp"
+
+#include "./FUnifSymM2LHandler.hpp"
+
+class FTreeCoordinate;
+
+
+// for verbosity only!!!
+//#define COUNT_BLOCKED_INTERACTIONS
+
+// if timings should be logged
+#define LOG_TIMINGS
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * @class FUnifSymKernel
+ * @brief
+ * Please read the license
+ *
+ * This kernels implement the Lagrange interpolation based FMM operators
+ * exploiting the symmetries in the far-field. It implements all interfaces
+ * (P2P, P2M, M2M, M2L, L2L, L2P) which are required by the FFmmAlgorithm and
+ * FFmmAlgorithmThread.
+ *
+ * @tparam CellClass Type of cell
+ * @tparam ContainerClass Type of container to store particles
+ * @tparam MatrixKernelClass Type of matrix kernel function
+ * @tparam ORDER interpolation order
+ */
+template < class CellClass,	class ContainerClass,	class MatrixKernelClass, int ORDER, int NVALS = 1>
+class FUnifSymKernel
+  : public FAbstractUnifKernel<CellClass, ContainerClass, MatrixKernelClass, ORDER, NVALS>
+{
+  typedef FAbstractUnifKernel<CellClass, ContainerClass, MatrixKernelClass, ORDER, NVALS>	AbstractBaseClass;
+  typedef FUnifSymM2LHandler<ORDER, MatrixKernelClass::Type> SymM2LHandlerClass;
+  enum {nnodes = AbstractBaseClass::nnodes};
+
+  /// Needed for handling all symmetries
+  const FSmartPointer<SymM2LHandlerClass,FSmartPointerMemory> SymM2LHandler;
+
+  // permuted local and multipole expansions
+  FReal** Loc;
+  FReal** Mul;
+  unsigned int* countExp;
+
+  // transformed expansions
+  FComplexe** TLoc;
+  FComplexe** TMul;
+
+  static const unsigned int rc = (2*ORDER-1)*(2*ORDER-1)*(2*ORDER-1);
+  static const unsigned int opt_rc = rc/2+1;
+
+  typedef FUnifTensor<ORDER> TensorType;
+  unsigned int node_diff[nnodes*nnodes];
+
+  //  FDft Dft; // Direct Discrete Fourier Transformator
+  FFft Dft; // Fast Discrete Fourier Transformator
+
+  /**
+   * Allocate memory for storing locally permuted mulipole and local expansions
+   */
+  void allocateMemoryForPermutedExpansions()
+  {
+    assert(Loc==NULL && Mul==NULL && countExp==NULL);
+    Loc = new FReal* [343];
+    Mul = new FReal* [343];
+    TLoc = new FComplexe* [343];
+    TMul = new FComplexe* [343];
+    countExp = new unsigned int [343];
+
+    // set all 343 to NULL
+    for (unsigned int idx=0; idx<343; ++idx) {
+      Mul[idx] = Loc[idx] = NULL;
+      TMul[idx] = TLoc[idx] = NULL;
+    }
+
+    // init only 16 of 343 possible translations due to symmetries
+    for (int i=2; i<=3; ++i)
+      for (int j=0; j<=i; ++j)
+        for (int k=0; k<=j; ++k) {
+          const unsigned int idx = (i+3)*7*7 + (j+3)*7 + (k+3);
+          assert(Mul[idx]==NULL || Loc[idx]==NULL);
+          Mul[idx] = new FReal [24 * nnodes];
+          Loc[idx] = new FReal [24 * nnodes];
+          TMul[idx] = new FComplexe [24 * rc];
+          TLoc[idx] = new FComplexe [24 * rc];
+        }
+  }
+
+
+#ifdef LOG_TIMINGS
+  FTic time;
+  FReal t_m2l_1, t_m2l_2, t_m2l_3;
+#endif
+
+public:
+  /**
+   * The constructor initializes all constant attributes and it reads the
+   * precomputed and compressed M2L operators from a binary file (an
+   * runtime_error is thrown if the required file is not valid).
+   */
+  FUnifSymKernel(const int inTreeHeight,
+                 const FPoint& inBoxCenter,
+                 const FReal inBoxWidth)
+    : AbstractBaseClass(inTreeHeight, inBoxCenter, inBoxWidth),
+      SymM2LHandler(new SymM2LHandlerClass(AbstractBaseClass::MatrixKernel.getPtr(), inBoxWidth, inTreeHeight)),
+      Loc(NULL), Mul(NULL), countExp(NULL),
+      TLoc(NULL), TMul(NULL),
+      Dft(rc) // initialize Discrete Fourier Transformator,
+  {
+    this->allocateMemoryForPermutedExpansions();
+
+    // initialize root node ids
+    TensorType::setNodeIdsDiff(node_diff);
+
+#ifdef LOG_TIMINGS
+    t_m2l_1 = FReal(0.);
+    t_m2l_2 = FReal(0.);
+    t_m2l_3 = FReal(0.);
+#endif
+  }
+
+
+
+  /** Copy constructor */
+  FUnifSymKernel(const FUnifSymKernel& other)
+  : AbstractBaseClass(other),
+    SymM2LHandler(other.SymM2LHandler),
+    Loc(NULL), Mul(NULL), countExp(NULL),
+    TLoc(NULL), TMul(NULL)
+  {
+    this->allocateMemoryForPermutedExpansions();
+  }
+
+
+
+  /** Destructor */
+  ~FUnifSymKernel()
+  {
+    for (unsigned int t=0; t<343; ++t) {
+      if (Loc[t]!=NULL) delete [] Loc[t];
+      if (Mul[t]!=NULL) delete [] Mul[t];
+      if (TLoc[t]!=NULL) delete [] TLoc[t];
+      if (TMul[t]!=NULL) delete [] TMul[t];
+    }
+    if (Loc!=NULL)      delete [] Loc;
+    if (Mul!=NULL)      delete [] Mul;
+    if (countExp!=NULL) delete [] countExp;
+    if (TLoc!=NULL)      delete [] TLoc;
+    if (TMul!=NULL)      delete [] TMul;
+
+#ifdef LOG_TIMINGS
+    std::cout << "- Permutation+Pad+Transfo took " << t_m2l_1 << "s"
+              << "\n- Apply M2L in Fourier space took " << t_m2l_2 << "s"
+              << "\n- Unpermutation+Untransfo+Unpad took " << t_m2l_3 << "s"
+              << std::endl;
+#endif
+  }
+
+
+  const SymM2LHandlerClass *const getPtrToSymM2LHandler() const
+  {	return SymM2LHandler.getPtr();	}
+
+
+
+  void P2M(CellClass* const LeafCell,
+           const ContainerClass* const SourceParticles)
+  {
+    const FPoint LeafCellCenter(AbstractBaseClass::getLeafCellCenter(LeafCell->getCoordinate()));
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      // 1) apply Sy
+      AbstractBaseClass::Interpolator->applyP2M(LeafCellCenter, AbstractBaseClass::BoxWidthLeaf,
+                                                LeafCell->getMultipole(idxRhs), SourceParticles);
+//      // 2) apply Discrete Fourier Transform
+//      SymM2LHandler->applyZeroPaddingAndDFT(LeafCell->getMultipole(idxRhs),
+//                                         LeafCell->getTransformedMultipole(idxRhs));
+    }
+  }
+
+
+
+  void M2M(CellClass* const FRestrict ParentCell,
+           const CellClass*const FRestrict *const FRestrict ChildCells,
+           const int /*TreeLevel*/)
+  {
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+      // apply Sy
+      FBlas::scal(nnodes, FReal(0.), ParentCell->getMultipole(idxRhs));
+      for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex){
+        if (ChildCells[ChildIndex]){
+          AbstractBaseClass::Interpolator->applyM2M(ChildIndex, ChildCells[ChildIndex]->getMultipole(idxRhs), ParentCell->getMultipole(idxRhs));
+        }
+      }
+//      // 2) Apply Discete Fourier Transform
+//      SymM2LHandler->applyZeroPaddingAndDFT(ParentCell->getMultipole(idxRhs),
+//                                         ParentCell->getTransformedMultipole(idxRhs));
+    }
+  }
+
+
+
+  void M2L(CellClass* const FRestrict TargetCell,
+           const CellClass* SourceCells[343],
+           const int /*NumSourceCells*/,
+           const int TreeLevel)
+  {
+#ifdef LOG_TIMINGS
+    time.tic();
+#endif
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+
+      // permute and copy multipole expansion
+      memset(countExp, 0, sizeof(int) * 343);
+      for (unsigned int idx=0; idx<343; ++idx) {
+        if (SourceCells[idx]) {
+          const unsigned int pidx = SymM2LHandler->pindices[idx];
+          const unsigned int count = (countExp[pidx])++;
+          FReal *const mul = Mul[pidx] + count*nnodes;
+          const unsigned int *const pvec = SymM2LHandler->pvectors[idx];
+          const FReal *const MultiExp = SourceCells[idx]->getMultipole(idxRhs);
+
+          // explicit loop unrolling
+          for (unsigned int n=0; n<nnodes/4 * 4; n+=4) {
+            mul[pvec[n  ]] = MultiExp[n];
+            mul[pvec[n+1]] = MultiExp[n+1];
+            mul[pvec[n+2]] = MultiExp[n+2];
+            mul[pvec[n+3]] = MultiExp[n+3];
+          }
+          for (unsigned int n=nnodes/4 * 4; n<nnodes; ++n){
+            mul[pvec[n]] = MultiExp[n];
+          }
+
+          // transform permuted expansion
+          FComplexe *const tmul = TMul[pidx] + count*rc;
+
+          ///////////////////////////////////////////
+          FReal pmul[rc];
+          FBlas::setzero(rc,pmul);
+
+          // Apply Zero Padding
+          for (unsigned int i=0; i<nnodes; ++i)
+            pmul[node_diff[i*nnodes]]=mul[i];
+
+          // Apply forward Discrete Fourier Transform
+          Dft.applyDFT(pmul,tmul);
+          ///////////////////////////////////////////
+
+
+        }
+      }
+
+#ifdef LOG_TIMINGS
+      t_m2l_1 += time.tacAndElapsed();
+#endif
+
+#ifdef COUNT_BLOCKED_INTERACTIONS ////////////////////////////////////
+      unsigned int count_lidx = 0;
+      unsigned int count_interactions = 0;
+      for (unsigned int idx=0; idx<343; ++idx)
+        count_interactions += countExp[idx];
+      if (count_interactions==189) {
+        for (unsigned int idx=0; idx<343; ++idx) {
+          if (countExp[idx])
+            std::cout << "gidx = " << idx << " gives lidx = " << count_lidx++ << " and has "
+                      << countExp[idx] << " interactions" << std::endl;
+        }
+        std::cout << std::endl;
+      }
+#endif ///////////////////////////////////////////////////////////////
+
+
+#ifdef LOG_TIMINGS
+      time.tic();
+#endif
+
+      // multiply (mat-mat-mul)
+      const FReal scale = AbstractBaseClass::MatrixKernel->getScaleFactor(AbstractBaseClass::BoxWidth, TreeLevel);
+
+      FComplexe tmpTLoc; 
+
+      for (unsigned int pidx=0; pidx<343; ++pidx) {
+        const unsigned int count = countExp[pidx];
+        if (count) {
+
+          FComplexe *const tmpK=const_cast<FComplexe*>(SymM2LHandler->getK(TreeLevel, pidx));
+
+          for (unsigned int i=0; i<count; ++i){
+          //unsigned int i=count;
+            FComplexe *const ttmul = TMul[pidx] + i*rc;
+            FComplexe *const ttloc = TLoc[pidx] + i*rc;
+
+            // Perform entrywise product manually
+            for (unsigned int j=0; j<opt_rc; ++j){
+              tmpTLoc=tmpK[j];
+              tmpTLoc*=ttmul[j];
+              tmpTLoc*=scale;
+              ttloc[j]=tmpTLoc;
+            }
+          }
+
+//          // rank * count * (2*nnodes-1) flops
+//          FBlas::gemtm(nnodes, rank, count, FReal(1.),
+//                       const_cast<FReal*>(SymM2LHandler->getK(TreeLevel, pidx))+rank*nnodes,
+//                       nnodes, Mul[pidx], nnodes, Compressed, rank);
+//          // nnodes *count * (2*rank-1) flops
+//          FBlas::gemm( nnodes, rank, count, scale,
+//                       const_cast<FReal*>(SymM2LHandler->getK(TreeLevel, pidx)),
+//                       nnodes, Compressed, rank, Loc[pidx], nnodes);
+        }
+      }
+
+
+
+#ifdef LOG_TIMINGS
+      t_m2l_2 += time.tacAndElapsed();
+#endif
+
+#ifdef LOG_TIMINGS
+      time.tic();
+#endif
+
+      // permute and add contribution to local expansions
+      FReal *const LocalExpansion = TargetCell->getLocal(idxRhs);
+      memset(countExp, 0, sizeof(int) * 343);
+      for (unsigned int idx=0; idx<343; ++idx) {
+        if (SourceCells[idx]) {
+          const unsigned int pidx = SymM2LHandler->pindices[idx];
+          const unsigned int count = (countExp[pidx])++;
+
+          FReal *const loc = Loc[pidx] + count*nnodes;
+          const FComplexe *const tloc = TLoc[pidx] + count*rc;
+
+          ///////////////////////////////////////////
+          FReal ploc[rc];
+          FBlas::setzero(rc,ploc);
+          // Apply forward Discrete Fourier Transform
+          Dft.applyIDFT(tloc,ploc);
+
+          // Unapply Zero Padding
+          for (unsigned int j=0; j<nnodes; ++j)
+            loc[j]=ploc[node_diff[nnodes-j-1]];
+          //loc[j]+=ploc[node_diff[nnodes-j-1]];
+          ///////////////////////////////////////////
+
+          const unsigned int *const pvec = SymM2LHandler->pvectors[idx];
+
+          /*
+          // no loop unrolling
+          for (unsigned int n=0; n<nnodes; ++n)
+          LocalExpansion[n] += loc[pvec[n]];
+          */
+
+          // explicit loop unrolling
+          for (unsigned int n=0; n<nnodes/4 * 4; n+=4) {
+            LocalExpansion[n  ] += loc[pvec[n  ]];
+            LocalExpansion[n+1] += loc[pvec[n+1]];
+            LocalExpansion[n+2] += loc[pvec[n+2]];
+            LocalExpansion[n+3] += loc[pvec[n+3]];
+          }
+          for (unsigned int n=nnodes/4 * 4; n<nnodes; ++n)
+            LocalExpansion[n] += loc[pvec[n]];
+
+        }
+      }
+
+#ifdef LOG_TIMINGS
+      t_m2l_3 += time.tacAndElapsed();
+#endif
+    }
+
+  }
+
+
+
+  void L2L(const CellClass* const FRestrict ParentCell,
+           CellClass* FRestrict *const FRestrict ChildCells,
+           const int /*TreeLevel*/)
+  {
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+//      // 1) Apply Inverse Discete Fourier Transform
+//      SymM2LHandler->unapplyZeroPaddingAndDFT(ParentCell->getTransformedLocal(idxRhs),
+//                                           const_cast<CellClass*>(ParentCell)->getLocal(idxRhs));
+      // 2) apply Sx
+      for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex){
+        if (ChildCells[ChildIndex]){
+          AbstractBaseClass::Interpolator->applyL2L(ChildIndex, ParentCell->getLocal(idxRhs), ChildCells[ChildIndex]->getLocal(idxRhs));
+        }
+      }
+    }
+  }
+
+
+
+  void L2P(const CellClass* const LeafCell,
+           ContainerClass* const TargetParticles)
+  {
+    const FPoint LeafCellCenter(AbstractBaseClass::getLeafCellCenter(LeafCell->getCoordinate()));
+
+    for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
+
+//      // 1)  Apply Inverse Discete Fourier Transform
+//      SymM2LHandler->unapplyZeroPaddingAndDFT(LeafCell->getTransformedLocal(idxRhs),
+//                                           const_cast<CellClass*>(LeafCell)->getLocal(idxRhs));
+//
+      // 2.a) apply Sx
+      AbstractBaseClass::Interpolator->applyL2P(LeafCellCenter, AbstractBaseClass::BoxWidthLeaf,
+                                                LeafCell->getLocal(idxRhs), TargetParticles);
+
+      // 2.b) apply Px (grad Sx)
+      AbstractBaseClass::Interpolator->applyL2PGradient(LeafCellCenter, AbstractBaseClass::BoxWidthLeaf,
+                                                        LeafCell->getLocal(idxRhs), TargetParticles);
+    }
+  }
+
+  void P2P(const FTreeCoordinate& /* LeafCellCoordinate */, // needed for periodic boundary conditions
+           ContainerClass* const FRestrict TargetParticles,
+           const ContainerClass* const FRestrict /*SourceParticles*/,
+           ContainerClass* const NeighborSourceParticles[27],
+           const int /* size */)
+  {
+    DirectInteractionComputer<MatrixKernelClass::Identifier, NVALS>::P2P(TargetParticles,NeighborSourceParticles);
+  }
+
+
+  void P2PRemote(const FTreeCoordinate& /*inPosition*/,
+                 ContainerClass* const FRestrict inTargets, const ContainerClass* const FRestrict /*inSources*/,
+                 ContainerClass* const inNeighbors[27], const int /*inSize*/){
+    DirectInteractionComputer<MatrixKernelClass::Identifier, NVALS>::P2PRemote(inTargets,inNeighbors,27);
+  }
+
+};
+
+
+
+
+
+
+
+
+#endif //FUNIFSYMKERNELS_HPP
+
+// [--END--]
diff --git a/Src/Kernels/Uniform/FUnifSymM2LHandler.hpp b/Src/Kernels/Uniform/FUnifSymM2LHandler.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..a7e4843b651eadebe7ae35815815213ffee0f982
--- /dev/null
+++ b/Src/Kernels/Uniform/FUnifSymM2LHandler.hpp
@@ -0,0 +1,428 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.  
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info". 
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FUNIFSYMM2LHANDLER_HPP
+#define FUNIFSYMM2LHANDLER_HPP
+
+#include <climits>
+
+#include "../../Utils/FBlas.hpp"
+#include "../../Utils/FDft.hpp"
+
+#include "../../Utils/FComplexe.hpp"
+
+#include "./FUnifTensor.hpp"
+#include "../Interpolation/FInterpSymmetries.hpp"
+#include "./FUnifM2LHandler.hpp"
+
+/**
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * Please read the license
+ */
+
+
+/*!  Precomputes the 16 far-field interactions (due to symmetries in their
+  arrangement all 316 far-field interactions can be represented by
+  permutations of the 16 we compute in this function).
+ */
+template <int ORDER, typename MatrixKernelClass>
+static void precompute(const MatrixKernelClass *const MatrixKernel, const FReal CellWidth,
+                       FComplexe* FC[343])
+{
+  //	std::cout << "\nComputing 16 far-field interactions (l=" << ORDER << ", eps=" << Epsilon
+  //						<< ") for cells of width w = " << CellWidth << std::endl;
+
+	static const unsigned int nnodes = ORDER*ORDER*ORDER;
+
+	// interpolation points of source (Y) and target (X) cell
+	FPoint X[nnodes], Y[nnodes];
+	// set roots of target cell (X)
+	FUnifTensor<ORDER>::setRoots(FPoint(0.,0.,0.), CellWidth, X);
+	// temporary matrices
+	FReal *_C;
+	FComplexe *_FC;
+
+  // reduce storage from nnodes^2=order^6 to (2order-1)^3
+  const unsigned int rc = (2*ORDER-1)*(2*ORDER-1)*(2*ORDER-1);
+	_C = new FReal [rc];
+	_FC = new FComplexe [rc]; // TODO: do it in the non-sym version!!!
+
+  // init Discrete Fourier Transformator
+//	FDft Dft(rc);
+	FFft Dft(rc);
+
+  // reduce storage if real valued kernel
+  const unsigned int opt_rc = rc/2+1;
+
+  // get first column of K via permutation
+  unsigned int perm[rc];
+  for(unsigned int p=0; p<rc; ++p){
+    if(p<rc-1) perm[p]=p+1;
+    else perm[p]=p+1-rc;
+  }
+  unsigned int li,lj, mi,mj, ni,nj;
+  unsigned int idi, idj, ido;
+
+	// initialize timer
+	FTic time;
+	double overall_time(0.);
+	double elapsed_time(0.);
+
+	unsigned int counter = 0;
+	for (int i=2; i<=3; ++i) {
+		for (int j=0; j<=i; ++j) {
+			for (int k=0; k<=j; ++k) {
+
+				// set roots of source cell (Y)
+				const FPoint cy(CellWidth*FReal(i), CellWidth*FReal(j), CellWidth*FReal(k));
+				FUnifTensor<ORDER>::setRoots(cy, CellWidth, Y);
+
+				// start timer
+				time.tic();
+
+        // evaluate m2l operator
+        ido=0;
+        for(unsigned int l=0; l<2*ORDER-1; ++l)
+          for(unsigned int m=0; m<2*ORDER-1; ++m)
+            for(unsigned int n=0; n<2*ORDER-1; ++n){   
+          
+              // l=0:(2*ORDER-1) => li-lj=-(ORDER-1):(ORDER-1)
+              // Convention:
+              // lj=ORDER-1 & li=0:ORDER-1 => li-lj=1-ORDER:0
+              // lj=1 & li=0:ORDER-1 => li-lj=1:ORDER-1
+              if(l<ORDER-1) lj=ORDER-1; else lj=0;
+              if(m<ORDER-1) mj=ORDER-1; else mj=0;
+              if(n<ORDER-1) nj=ORDER-1; else nj=0;
+              li=(l-(ORDER-1))+lj; mi=(m-(ORDER-1))+mj; ni=(n-(ORDER-1))+nj;
+              // deduce corresponding index of K[nnodes x nnodes] 
+              idi=li*ORDER*ORDER + mi*ORDER + ni;
+              idj=lj*ORDER*ORDER + mj*ORDER + nj;
+                
+              // store value at current position in C
+              // use permutation if DFT is used because 
+              // the storage of the first column is required
+              // i.e. C[0] C[rc-1] C[rc-2] .. C[1] < WRONG!
+              // i.e. C[rc-1] C[0] C[1] .. C[rc-2] < RIGHT!
+//                _C[counter*rc + ido]
+              _C[perm[ido]]
+                = MatrixKernel->evaluate(X[idi], Y[idj]);
+              ido++;
+            }
+
+        // Apply Discrete Fourier Transformation
+        Dft.applyDFT(_C,_FC);
+
+        // determine new index
+				const unsigned int idx = (i+3)*7*7 + (j+3)*7 + (k+3);
+
+				// store
+				{
+					// allocate
+					assert(FC[idx]==NULL);
+					FC[idx] = new FComplexe[opt_rc];
+          FBlas::c_copy(opt_rc, reinterpret_cast<FReal*>(_FC), 
+                        reinterpret_cast<FReal*>(FC[idx]));
+				}
+
+				elapsed_time = time.tacAndElapsed(); 
+				overall_time += elapsed_time;	
+
+				counter++;
+			}
+		}
+	}
+
+		std::cout << "The approximation of the " << counter
+              << " far-field interactions (sizeM2L= " 
+              << counter*opt_rc*sizeof(FComplexe) << " B"
+              << ") took " << overall_time << "s\n" << std::endl;
+
+    // Free _C & _FC
+    delete [] _C;
+    delete [] _FC;
+}
+
+
+
+
+
+
+
+
+
+/*!  \class FUnifSymM2LHandler
+
+	\brief Deals with all the symmetries in the arrangement of the far-field interactions
+
+	Stores permutation indices and permutation vectors to reduce 316 (7^3-3^3)
+  different far-field interactions to 16 only. We use the number 343 (7^3)
+  because it allows us to use to associate the far-field interactions based on
+  the index \f$t = 7^2(i+3) + 7(j+3) + (k+3)\f$ where \f$(i,j,k)\f$ denotes
+  the relative position of the source cell to the target cell. */
+template <int ORDER, KERNEL_FUNCTION_TYPE TYPE> class FUnifSymM2LHandler;
+
+/*! Specialization for homogeneous kernel functions */
+template <int ORDER>
+class FUnifSymM2LHandler<ORDER, HOMOGENEOUS>
+{
+  static const unsigned int nnodes = ORDER*ORDER*ORDER;
+
+	// M2L operators
+	FComplexe*    K[343];
+
+public:
+	
+	// permutation vectors and permutated indices
+	unsigned int pvectors[343][nnodes];
+	unsigned int pindices[343];
+
+
+	/** Constructor: with 16 small SVDs */
+	template <typename MatrixKernelClass>
+	FUnifSymM2LHandler(const MatrixKernelClass *const MatrixKernel,
+                     const FReal, const unsigned int)
+	{
+		// init all 343 item to zero, because effectively only 16 exist
+		for (unsigned int t=0; t<343; ++t)
+			K[t] = NULL;
+			
+		// set permutation vector and indices
+		const FInterpSymmetries<ORDER> Symmetries;
+		for (int i=-3; i<=3; ++i)
+			for (int j=-3; j<=3; ++j)
+				for (int k=-3; k<=3; ++k) {
+					const unsigned int idx = ((i+3) * 7 + (j+3)) * 7 + (k+3);
+					pindices[idx] = 0;
+					if (abs(i)>1 || abs(j)>1 || abs(k)>1)
+						pindices[idx] = Symmetries.getPermutationArrayAndIndex(i,j,k, pvectors[idx]);
+				}
+
+		// precompute 16 M2L operators
+		const FReal ReferenceCellWidth = FReal(2.);
+		precompute<ORDER>(MatrixKernel, ReferenceCellWidth, K);
+	}
+
+
+
+	/** Destructor */
+	~FUnifSymM2LHandler()
+	{
+		for (unsigned int t=0; t<343; ++t) if (K[t]!=NULL) delete [] K[t];
+	}
+
+
+	/*! return the t-th approximated far-field interactions*/
+	const FComplexe *const getK(const unsigned int, const unsigned int t) const
+	{	return K[t]; }
+
+};
+
+
+
+
+
+
+/*! Specialization for non-homogeneous kernel functions */
+template <int ORDER>
+class FUnifSymM2LHandler<ORDER, NON_HOMOGENEOUS>
+{
+  static const unsigned int nnodes = ORDER*ORDER*ORDER;
+
+	// Height of octree; needed only in the case of non-homogeneous kernel functions
+	const unsigned int TreeHeight;
+
+	// M2L operators for all levels in the octree
+	FComplexe***    K;
+
+public:
+	
+	// permutation vectors and permutated indices
+	unsigned int pvectors[343][nnodes];
+	unsigned int pindices[343];
+
+
+	/** Constructor: with 16 small SVDs */
+	template <typename MatrixKernelClass>
+	FUnifSymM2LHandler(const MatrixKernelClass *const MatrixKernel,
+                     const FReal RootCellWidth, const unsigned int inTreeHeight)
+		: TreeHeight(inTreeHeight)
+	{
+		// init all 343 item to zero, because effectively only 16 exist
+		K       = new FComplexe** [TreeHeight];
+		K[0]       = NULL; K[1]       = NULL;
+		for (unsigned int l=2; l<TreeHeight; ++l) {
+			K[l]       = new FComplexe* [343];
+			for (unsigned int t=0; t<343; ++t)
+				K[l][t]       = NULL;
+		}
+		
+
+		// set permutation vector and indices
+		const FInterpSymmetries<ORDER> Symmetries;
+		for (int i=-3; i<=3; ++i)
+			for (int j=-3; j<=3; ++j)
+				for (int k=-3; k<=3; ++k) {
+					const unsigned int idx = ((i+3) * 7 + (j+3)) * 7 + (k+3);
+					pindices[idx] = 0;
+					if (abs(i)>1 || abs(j)>1 || abs(k)>1)
+						pindices[idx] = Symmetries.getPermutationArrayAndIndex(i,j,k, pvectors[idx]);
+				}
+
+		// precompute 16 M2L operators at all levels having far-field interactions
+		FReal CellWidth = RootCellWidth / FReal(2.); // at level 1
+		CellWidth /= FReal(2.);                      // at level 2
+		for (unsigned int l=2; l<TreeHeight; ++l) {
+			precompute<ORDER>(MatrixKernel, CellWidth, K[l]);
+			CellWidth /= FReal(2.);                    // at level l+1 
+		}
+	}
+
+
+
+	/** Destructor */
+	~FUnifSymM2LHandler()
+	{
+		for (unsigned int l=0; l<TreeHeight; ++l) {
+			if (K[l]!=NULL) {
+				for (unsigned int t=0; t<343; ++t) if (K[l][t]!=NULL) delete [] K[l][t];
+				delete [] K[l];
+			}
+		}
+		delete [] K;
+	}
+
+	/*! return the t-th approximated far-field interactions*/
+	const FComplexe *const getK(const unsigned int l, const unsigned int t) const
+	{	return K[l][t]; }
+
+};
+
+
+
+
+
+
+
+
+//#include <fstream>
+//#include <sstream>
+//
+//
+///**
+// * Computes, compresses and stores the 16 M2L kernels in a binary file.
+// */
+//template <int ORDER, typename MatrixKernelClass>
+//static void ComputeAndCompressAndStoreInBinaryFile(const MatrixKernelClass *const MatrixKernel, const FReal Epsilon)
+//{
+//	static const unsigned int nnodes = ORDER*ORDER*ORDER;
+//
+//	// compute and compress ////////////
+//	FReal* K[343];
+//	int LowRank[343];
+//	for (unsigned int idx=0; idx<343; ++idx) { K[idx] = NULL; LowRank[idx] = 0;	}
+//	precompute<ORDER>(MatrixKernel, FReal(2.), Epsilon, K, LowRank);
+//
+//	// write to binary file ////////////
+//	FTic time; time.tic();
+//	// start computing process
+//	const char precision = (typeid(FReal)==typeid(double) ? 'd' : 'f');
+//	std::stringstream sstream;
+//	sstream << "sym2l_" << precision << "_o" << ORDER << "_e" << Epsilon << ".bin";
+//	const std::string filename(sstream.str());
+//	std::ofstream stream(filename.c_str(),
+//											 std::ios::out | std::ios::binary | std::ios::trunc);
+//	if (stream.good()) {
+//		stream.seekp(0);
+//		for (unsigned int idx=0; idx<343; ++idx)
+//			if (K[idx]!=NULL) {
+//				// 1) write index
+//				stream.write(reinterpret_cast<char*>(&idx), sizeof(int));
+//				// 2) write low rank (int)
+//				int rank = LowRank[idx];
+//				stream.write(reinterpret_cast<char*>(&rank), sizeof(int));
+//				// 3) write U and V (both: rank*nnodes * FReal)
+//				FReal *const U = K[idx];
+//				FReal *const V = K[idx] + rank*nnodes;
+//				stream.write(reinterpret_cast<char*>(U), sizeof(FReal)*rank*nnodes);
+//				stream.write(reinterpret_cast<char*>(V), sizeof(FReal)*rank*nnodes);
+//			}
+//	} else throw std::runtime_error("File could not be opened to write");
+//	stream.close();
+//	// write info
+//	//	std::cout << "Compressed M2L operators stored in binary file " << filename
+//	//					<< " in " << time.tacAndElapsed() << "sec."	<< std::endl;
+//
+//	// free memory /////////////////////
+//	for (unsigned int t=0; t<343; ++t) if (K[t]!=NULL) delete [] K[t];
+//}
+//
+//
+///**
+// * Reads the 16 compressed M2L kernels from the binary files and writes them
+// * in K and the respective low-rank in LowRank.
+// */
+//template <int ORDER>
+//void ReadFromBinaryFile(const FReal Epsilon, FReal* K[343], int LowRank[343])
+//{
+//	// compile time constants
+//	const unsigned int nnodes = ORDER*ORDER*ORDER;
+//	
+//	// find filename
+//	const char precision = (typeid(FReal)==typeid(double) ? 'd' : 'f');
+//	std::stringstream sstream;
+//	sstream << "sym2l_" << precision << "_o" << ORDER << "_e" << Epsilon << ".bin";
+//	const std::string filename(sstream.str());
+//
+//	// read binary file
+//	std::ifstream istream(filename.c_str(),
+//												std::ios::in | std::ios::binary | std::ios::ate);
+//	const std::ifstream::pos_type size = istream.tellg();
+//	if (size<=0) throw std::runtime_error("The requested binary file does not yet exist. Exit.");
+//	
+//	if (istream.good()) {
+//		istream.seekg(0);
+//		// 1) read index (int)
+//		int _idx;
+//		istream.read(reinterpret_cast<char*>(&_idx), sizeof(int));
+//		// loop to find 16 compressed m2l operators
+//		for (int idx=0; idx<343; ++idx) {
+//			K[idx] = NULL;
+//			LowRank[idx] = 0;
+//			// if it exists
+//			if (idx == _idx) {
+//				// 2) read low rank (int)
+//				int rank;
+//				istream.read(reinterpret_cast<char*>(&rank), sizeof(int));
+//				LowRank[idx] = rank;
+//				// 3) read U and V (both: rank*nnodes * FReal)
+//				K[idx] = new FReal [2*rank*nnodes];
+//				FReal *const U = K[idx];
+//				FReal *const V = K[idx] + rank*nnodes;
+//				istream.read(reinterpret_cast<char*>(U), sizeof(FReal)*rank*nnodes);
+//				istream.read(reinterpret_cast<char*>(V), sizeof(FReal)*rank*nnodes);
+//
+//				// 1) read next index
+//				istream.read(reinterpret_cast<char*>(&_idx), sizeof(int));
+//			}
+//		}
+//	}	else throw std::runtime_error("File could not be opened to read");
+//	istream.close();
+//}
+
+
+
+
+
+#endif
diff --git a/Src/Utils/FDft.hpp b/Src/Utils/FDft.hpp
index fa61ade230f1aa6cfad0b00f92312ee9f0ef2615..7a6b3f62f48675746cc3580c69935f10e73cb0d3 100644
--- a/Src/Utils/FDft.hpp
+++ b/Src/Utils/FDft.hpp
@@ -172,15 +172,17 @@ class FFft
 
 private:
   unsigned int nsteps_;
+  unsigned int nsteps_opt_;
 
 public:
 
   FFft(const unsigned int nsteps)
-  : nsteps_(nsteps)
+  : nsteps_(nsteps),
+    nsteps_opt_(nsteps/2+1) // SPECIFIC TO FTT FOR REAL VALUES
   {
     // allocate arrays
     fftR_ = (FReal*) fftw_malloc(sizeof(FReal) * nsteps_);
-    fftC_ = (FComplexe*) fftw_malloc(sizeof(FComplexe) * nsteps_);
+    fftC_ = (FComplexe*) fftw_malloc(sizeof(FComplexe) * nsteps_opt_);
 
     // fftw plans
     plan_c2r_ =
@@ -213,7 +215,7 @@ public:
     // read sampled data
 //    std::cout<< "copy(";
 //    time.tic();
-    FBlas::c_setzero(nsteps_,reinterpret_cast<FReal*>(fftC_));
+    FBlas::c_setzero(nsteps_opt_,reinterpret_cast<FReal*>(fftC_));
     FBlas::copy(nsteps_, sampledData,fftR_);
 //    std::cout << time.tacAndElapsed() << ")";
 
@@ -226,11 +228,10 @@ public:
     // write transformed data
 //    std::cout<< " - copy(";
 //    time.tic();
-//    FBlas::c_copy(nsteps_,reinterpret_cast<FReal*>(fftC_),
-//                  reinterpret_cast<FReal*>(transformedData));
-
-    for(unsigned int s=0; s<nsteps_; ++s)
-      transformedData[s]=fftC_[s];
+    FBlas::c_copy(nsteps_opt_,reinterpret_cast<FReal*>(fftC_),
+                  reinterpret_cast<FReal*>(transformedData));
+//    for(unsigned int s=0; s<nsteps_opt_; ++s)
+//      transformedData[s]=fftC_[s];
 
 //    std::cout << time.tacAndElapsed() << ") ";
 
@@ -242,7 +243,7 @@ public:
   {
     // read transformed data
     FBlas::setzero(nsteps_,fftR_);
-    FBlas::c_copy(nsteps_,reinterpret_cast<const FReal*>(transformedData),
+    FBlas::c_copy(nsteps_opt_,reinterpret_cast<const FReal*>(transformedData),
                   reinterpret_cast<FReal*>(fftC_));
 
     // perform ifft
diff --git a/Tests/Kernels/testUnifAlgorithm.cpp b/Tests/Kernels/testUnifAlgorithm.cpp
index 22379a0c9f31cfd88231fdb94746d04445ba79f6..bf04ddaad767d41b6aff15fe219cab32e49b8e1e 100644
--- a/Tests/Kernels/testUnifAlgorithm.cpp
+++ b/Tests/Kernels/testUnifAlgorithm.cpp
@@ -29,7 +29,8 @@
 
 #include "../../Src/Kernels/Uniform/FUnifCell.hpp"
 #include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
-#include "../../Src/Kernels/Uniform/FUnifKernel.hpp"
+//#include "../../Src/Kernels/Uniform/FUnifKernel.hpp"
+#include "../../Src/Kernels/Uniform/FUnifSymKernel.hpp"
 
 #include "../../Src/Components/FSimpleLeaf.hpp"
 #include "../../Src/Kernels/P2P/FP2PParticleContainerIndexed.hpp"
@@ -51,7 +52,7 @@
 int main(int argc, char* argv[])
 {
   const char* const filename       = FParameters::getStr(argc,argv,"-f", "../Data/test20k.fma");
-  const unsigned int TreeHeight    = FParameters::getValue(argc, argv, "-h", 5);
+  const unsigned int TreeHeight    = FParameters::getValue(argc, argv, "-h", 3);
   const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2);
   const unsigned int NbThreads     = FParameters::getValue(argc, argv, "-t", 1);
 
@@ -121,7 +122,7 @@ int main(int argc, char* argv[])
   {	// begin Lagrange kernel
 
     // accuracy
-    const unsigned int ORDER = 7;
+    const unsigned int ORDER = 5;
     // typedefs
     typedef FP2PParticleContainerIndexed ContainerClass;
     typedef FSimpleLeaf< ContainerClass >  LeafClass;
@@ -129,7 +130,8 @@ int main(int argc, char* argv[])
     typedef FInterpMatrixKernelR MatrixKernelClass;
     typedef FUnifCell<ORDER> CellClass;
     typedef FOctree<CellClass,ContainerClass,LeafClass> OctreeClass;
-    typedef FUnifKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
+    //typedef FUnifKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
+    typedef FUnifSymKernel<CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
     typedef FFmmAlgorithm<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
     //  typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
 
diff --git a/Tests/Utils/testChebSymmetries.cpp b/Tests/Utils/testChebSymmetries.cpp
index f506ca106be7e76490a98673881f5ee6216f275c..94886d04115c666130f5fe5227ea62470c861982 100755
--- a/Tests/Utils/testChebSymmetries.cpp
+++ b/Tests/Utils/testChebSymmetries.cpp
@@ -31,7 +31,7 @@
 
 #include "../../Src/Kernels/Chebyshev/FChebTensor.hpp"
 #include "../../Src/Kernels/Interpolation/FInterpMatrixKernel.hpp"
-#include "../../Src/Kernels/Chebyshev/FChebSymmetries.hpp"
+#include "../../Src/Kernels/Interpolation/FInterpSymmetries.hpp"
 
 
 
@@ -108,7 +108,7 @@ int main(int argc, char* argv[])
 	FReal maxdiff(0.);
 
 	// permuter
-	FChebSymmetries<order> permuter;
+	FInterpSymmetries<order> permuter;
 
 	// permutation vector
 	unsigned int perm[nnodes];