diff --git a/Examples/DirectComputation.cpp b/Examples/DirectComputation.cpp
index 76790c674ae16f85d7f04e78352c20bc21fb682c..8e6395d331225b7f62fd32d4ee7760ebeb07fcb5 100644
--- a/Examples/DirectComputation.cpp
+++ b/Examples/DirectComputation.cpp
@@ -134,13 +134,13 @@ int main(int argc, char ** argv){
                 //
                 for(int idxOther = 0; idxOther < nbParticles ; ++idxOther){
                     if( idxOther != idxTarget ){
-                        FP2P::NonMutualParticles(
-                                    particles[idxOther].getPosition().getX(), particles[idxOther].getPosition().getY(),
-                                    particles[idxOther].getPosition().getZ(),particles[idxOther].getPhysicalValue(),
-                                    particles[idxTarget].getPosition().getX(), particles[idxTarget].getPosition().getY(),
-                                    particles[idxTarget].getPosition().getZ(),particles[idxTarget].getPhysicalValue(),
-                                    &particles[idxTarget].setForces()[0],&particles[idxTarget].setForces()[1],
-                                &particles[idxTarget].setForces()[2],particles[idxTarget].setPotential(),&MatrixKernel);
+                        FP2P::NonMutualParticles(particles[idxTarget].getPosition().getX(), particles[idxTarget].getPosition().getY(),
+                                                 particles[idxTarget].getPosition().getZ(),particles[idxTarget].getPhysicalValue(),
+                                                 &particles[idxTarget].setForces()[0],&particles[idxTarget].setForces()[1],
+                                                 &particles[idxTarget].setForces()[2],particles[idxTarget].setPotential(),
+                                                 particles[idxOther].getPosition().getX(), particles[idxOther].getPosition().getY(),
+                                                 particles[idxOther].getPosition().getZ(),particles[idxOther].getPhysicalValue(),
+                                                 &MatrixKernel);
                     }
                 }
             } // end for
diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
index 5f3ef5f9ce435daa70170196e6d8985d10576f3a..1211917d949dda3a3ae443a4e548df01d63edeb0 100644
--- a/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
@@ -40,15 +40,16 @@ enum KERNEL_FUNCTION_TYPE {HOMOGENEOUS, NON_HOMOGENEOUS};
  * It can either be scalar (NCMP=1) or tensorial (NCMP>1) depending on the
  * dimension of the equation considered. NCMP denotes the number of components
  * that are actually stored (e.g. 6 for a \f$3\times3\f$ symmetric tensor).
- * Notes on application scheme:
- * Let there be a kernel \f$K\f$ such that \f$X_i=K_{ij}Y_j\f$
- * with \f$X\f$ the lhs of size NLHS and \f$Y\f$ the rhs of size NRHS.
+ *
+ * Notes on the application scheme:
+ * Let there be a kernel \f$K\f$ such that \f$Potential_i(X)X=K_{ij}(X,Y)PhysicalValue_j(Y)\f$
+ * with \f$Potential\f$ the lhs of size NLHS and \f$PhysicalValues\f$ the rhs of size NRHS.
  * The table applyTab provides the indices in the reduced storage table
  * corresponding to the application scheme depicted earlier.
  *
  * PB: BEWARE! Homogeneous matrix kernels do not support cell width extension
  * yet. Is it possible to find a reference width and a scale factor such that
- * only 1 set of M2L ops can be used for all levels??
+ * only 1 set of M2L opt can be used for all levels??
  *
  */
 template <class FReal>
@@ -76,52 +77,61 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal>
     FInterpMatrixKernelR() {}
 
     // copy ctor
-    FInterpMatrixKernelR(const FInterpMatrixKernelR& /*other*/)
-    {}
+    FInterpMatrixKernelR(const FInterpMatrixKernelR& /*other*/) {}
+
+    static const char* getID() { return "ONE_OVER_R"; }
 
-  static const char* getID() { return "ONE_OVER_R"; }
+    static void printInfo() { std::cout << "K(x,y)=1/r with r=|x-y|" << std::endl; }
 
     // returns position in reduced storage
     int getPosition(const unsigned int) const
     {return 0;}
 
+    // returns coefficient of mutual interaction
+    // 1 for symmetric kernels
+    // -1 for antisymmetric kernels
+    // somethings else if other property of symmetry
+    FReal getMutualCoefficient() const{ return FReal(1.); }
+
     // evaluate interaction
     template <class ValueClass>
-    ValueClass evaluate(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                   const ValueClass& x2, const ValueClass& y2, const ValueClass& z2) const
+    ValueClass evaluate(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                        const ValueClass& xs, const ValueClass& ys, const ValueClass& zs) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
+        // diff = t-s
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
         return FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
     }
 
     // evaluate interaction (blockwise)
     template <class ValueClass>
-    void evaluateBlock(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                       const ValueClass& x2, const ValueClass& y2, const ValueClass& z2, ValueClass* block) const
+    void evaluateBlock(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                       const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
+                       ValueClass* block) const
     {
-        block[0] = this->evaluate(x1,y1,z1,x2,y2,z2);
+        block[0] = this->evaluate(xt,yt,zt,xs,ys,zs);
     }
 
     // evaluate interaction and derivative (blockwise)
     template <class ValueClass>
-    void evaluateBlockAndDerivative(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                                    const ValueClass& x2, const ValueClass& y2, const ValueClass& z2,
+    void evaluateBlockAndDerivative(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt,
+                                    const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
                                     ValueClass block[1], ValueClass blockDerivative[3]) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
         const ValueClass one_over_r = FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
 
         const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
 
         block[0] = one_over_r;
 
-        blockDerivative[0] = one_over_r3 * diffx;
-        blockDerivative[1] = one_over_r3 * diffy;
-        blockDerivative[2] = one_over_r3 * diffz;
+        blockDerivative[0] = - one_over_r3 * diffx;
+        blockDerivative[1] = - one_over_r3 * diffy;
+        blockDerivative[2] = - one_over_r3 * diffz;
     }
 
     FReal getScaleFactor(const FReal RootCellWidth, const int TreeLevel) const
@@ -135,15 +145,15 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal>
         return FReal(2.) / CellWidth;
     }
 
-    FReal evaluate(const FPoint<FReal>& p1, const FPoint<FReal>& p2) const {
-        return evaluate<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ());
+    FReal evaluate(const FPoint<FReal>& pt, const FPoint<FReal>& ps) const {
+        return evaluate<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ());
     }
-    void evaluateBlock(const FPoint<FReal>& p1, const FPoint<FReal>& p2, FReal* block) const{
-        evaluateBlock<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block);
+    void evaluateBlock(const FPoint<FReal>& pt, const FPoint<FReal>& ps, FReal* block) const{
+        evaluateBlock<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block);
     }
-    void evaluateBlockAndDerivative(const FPoint<FReal>& p1, const FPoint<FReal>& p2,
+    void evaluateBlockAndDerivative(const FPoint<FReal>& pt, const FPoint<FReal>& ps,
                                     FReal block[1], FReal blockDerivative[3]) const {
-        evaluateBlockAndDerivative<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block, blockDerivative);
+        evaluateBlockAndDerivative<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block, blockDerivative);
     }
 };
 
@@ -158,24 +168,27 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
     static const unsigned int NLHS = 1; //< dim of loc exp
     FReal LX,LY,LZ ;
 
-    FInterpMatrixKernelRH() : LX(1.0),LY(1.0),LZ(1.0)
-    {	 }
+    FInterpMatrixKernelRH() 
+    : LX(1.0),LY(1.0),LZ(1.0)
+    { }
 
     // copy ctor
     FInterpMatrixKernelRH(const FInterpMatrixKernelRH& other)
-        :FInterpMatrixKernelR<FReal>(other), LX(other.LX), LY(other.LY), LZ(other.LZ)
+    : FInterpMatrixKernelR<FReal>(other), LX(other.LX), LY(other.LY), LZ(other.LZ)
     {}
 
-  static const char* getID() { return "ONE_OVER_RH"; }
+    static const char* getID() { return "ONE_OVER_RH"; }
+
+    static void printInfo() { std::cout << "K(x,y)=1/rh with rh=sqrt(L_i*(x_i-y_i)^2)" << std::endl; }
 
     // evaluate interaction
     template <class ValueClass>
-    ValueClass evaluate(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                   const ValueClass& x2, const ValueClass& y2, const ValueClass& z2) const
+    ValueClass evaluate(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                        const ValueClass& xs, const ValueClass& ys, const ValueClass& zs) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
         return FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx +
                                        FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy +
                                        FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz);
@@ -185,23 +198,29 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
     // returns position in reduced storage
     int getPosition(const unsigned int) const
     {return 0;}
+    // returns coefficient of mutual interaction
+    // 1 for symmetric kernels
+    // -1 for antisymmetric kernels
+    // somethings else if other property of symmetry
+    FReal getMutualCoefficient() const{ return FReal(1.); }
 
     template <class ValueClass>
-    void evaluateBlock(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                       const ValueClass& x2, const ValueClass& y2, const ValueClass& z2, ValueClass* block) const
+    void evaluateBlock(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                       const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
+                       ValueClass* block) const
     {
-        block[0]=this->evaluate(x1,y1,z1,x2,y2,z2);
+        block[0]=this->evaluate(xt,yt,zt,xs,ys,zs);
     }
 
     // evaluate interaction and derivative (blockwise)
     template <class ValueClass>
-    void evaluateBlockAndDerivative(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                                    const ValueClass& x2, const ValueClass& y2, const ValueClass& z2,
+    void evaluateBlockAndDerivative(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt,
+                                    const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
                                     ValueClass block[1], ValueClass blockDerivative[3]) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
         const ValueClass one_over_rL = FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx +
                                                           FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy +
                                                           FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz);
@@ -226,15 +245,15 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
         return FReal(2.) / CellWidth;
     }
 
-    FReal evaluate(const FPoint<FReal>& p1, const FPoint<FReal>& p2) const{
-        return evaluate<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ());
+    FReal evaluate(const FPoint<FReal>& pt, const FPoint<FReal>& ps) const{
+        return evaluate<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ());
     }
-    void evaluateBlock(const FPoint<FReal>& p1, const FPoint<FReal>& p2, FReal* block) const{
-        evaluateBlock<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block);
+    void evaluateBlock(const FPoint<FReal>& pt, const FPoint<FReal>& ps, FReal* block) const{
+        evaluateBlock<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block);
     }
-    void evaluateBlockAndDerivative(const FPoint<FReal>& p1, const FPoint<FReal>& p2,
+    void evaluateBlockAndDerivative(const FPoint<FReal>& pt, const FPoint<FReal>& ps,
                                     FReal block[1], FReal blockDerivative[3]) const {
-        evaluateBlockAndDerivative<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block, blockDerivative);
+        evaluateBlockAndDerivative<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block, blockDerivative);
     }
 };
 
@@ -253,48 +272,52 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal>
     FInterpMatrixKernelRR() {}
 
     // copy ctor
-    FInterpMatrixKernelRR(const FInterpMatrixKernelRR& /*other*/)
-    {}
+    FInterpMatrixKernelRR(const FInterpMatrixKernelRR& /*other*/) {}
 
-  static const char* getID() { return "ONE_OVER_R_SQUARED"; }
+    static const char* getID() { return "ONE_OVER_R_SQUARED"; }
+
+    static void printInfo() { std::cout << "K(x,y)=1/r^2 with r=|x-y|" << std::endl; }
 
     // returns position in reduced storage
     int getPosition(const unsigned int) const
     {return 0;}
 
+    // returns coefficient of mutual interaction
+    // 1 for symmetric kernels
+    // -1 for antisymmetric kernels
+    // somethings else if other property of symmetry
+    FReal getMutualCoefficient() const{ return FReal(1.); }
+
     // evaluate interaction
     template <class ValueClass>
-    ValueClass evaluate(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                   const ValueClass& x2, const ValueClass& y2, const ValueClass& z2) const
+    ValueClass evaluate(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                        const ValueClass& xs, const ValueClass& ys, const ValueClass& zs) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        return FMath::One<ValueClass>() / FReal(diffx*diffx +
-                                 diffy*diffy +
-                                 diffz*diffz);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        return FMath::One<ValueClass>() / FReal(diffx*diffx+diffy*diffy+diffz*diffz);
     }
 
     // evaluate interaction (blockwise)
     template <class ValueClass>
-    void evaluateBlock(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                       const ValueClass& x2, const ValueClass& y2, const ValueClass& z2, ValueClass* block) const
+    void evaluateBlock(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                       const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
+                       ValueClass* block) const
     {
-        block[0]=this->evaluate(x1,y1,z1,x2,y2,z2);
+        block[0]=this->evaluate(xt,yt,zt,xs,ys,zs);
     }
 
     // evaluate interaction and derivative (blockwise)
     template <class ValueClass>
-    void evaluateBlockAndDerivative(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                                    const ValueClass& x2, const ValueClass& y2, const ValueClass& z2,
+    void evaluateBlockAndDerivative(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt,
+                                    const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
                                     ValueClass block[1], ValueClass blockDerivative[3]) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        const ValueClass r2 = (diffx*diffx +
-                               diffy*diffy +
-                               diffz*diffz);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
         const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2);
         const ValueClass one_over_r4 = one_over_r2*one_over_r2;
 
@@ -318,15 +341,15 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal>
         return FReal(4.) / (CellWidth*CellWidth);
     }
 
-    FReal evaluate(const FPoint<FReal>& p1, const FPoint<FReal>& p2) const{
-        return evaluate<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ());
+    FReal evaluate(const FPoint<FReal>& pt, const FPoint<FReal>& ps) const{
+        return evaluate<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ());
     }
-    void evaluateBlock(const FPoint<FReal>& p1, const FPoint<FReal>& p2, FReal* block) const{
-        evaluateBlock<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block);
+    void evaluateBlock(const FPoint<FReal>& pt, const FPoint<FReal>& ps, FReal* block) const{
+        evaluateBlock<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block);
     }
-    void evaluateBlockAndDerivative(const FPoint<FReal>& p1, const FPoint<FReal>& p2,
+    void evaluateBlockAndDerivative(const FPoint<FReal>& pt, const FPoint<FReal>& ps,
                                     FReal block[1], FReal blockDerivative[3]) const {
-        evaluateBlockAndDerivative<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block, blockDerivative);
+        evaluateBlockAndDerivative<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block, blockDerivative);
     }
 };
 
@@ -346,26 +369,31 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
     FInterpMatrixKernelLJ() {}
 
     // copy ctor
-    FInterpMatrixKernelLJ(const FInterpMatrixKernelLJ& /*other*/)
-    {}
+    FInterpMatrixKernelLJ(const FInterpMatrixKernelLJ& /*other*/) {}
 
-  static const char* getID() { return "LENNARD_JONES_POTENTIAL"; }
+    static const char* getID() { return "LENNARD_JONES_POTENTIAL"; }
+
+    static void printInfo() { std::cout << "K(x,y)=1/r with r=|x-y|" << std::endl; }
 
     // returns position in reduced storage
     int getPosition(const unsigned int) const
     {return 0;}
 
+    // returns coefficient of mutual interaction
+    // 1 for symmetric kernels
+    // -1 for antisymmetric kernels
+    // somethings else if other property of symmetry
+    FReal getMutualCoefficient() const{ return FReal(1.); }
+
     // evaluate interaction
     template <class ValueClass>
-    ValueClass evaluate(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                   const ValueClass& x2, const ValueClass& y2, const ValueClass& z2) const
+    ValueClass evaluate(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                        const ValueClass& xs, const ValueClass& ys, const ValueClass& zs) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        const ValueClass r = FMath::Sqrt(diffx*diffx +
-                                   diffy*diffy +
-                                   diffz*diffz);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz);
         const ValueClass r3 = r*r*r;
         const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3);
         //return one_over_r6 * one_over_r6;
@@ -375,24 +403,23 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
 
     // evaluate interaction (blockwise)
     template <class ValueClass>
-    void evaluateBlock(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                       const ValueClass& x2, const ValueClass& y2, const ValueClass& z2, ValueClass* block) const
+    void evaluateBlock(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                       const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
+                       ValueClass* block) const
     {
-        block[0]=this->evaluate(x1,y1,z1,x2,y2,z2);
+        block[0]=this->evaluate(xt,yt,zt,xs,ys,zs);
     }
 
     // evaluate interaction and derivative (blockwise)
     template <class ValueClass>
-    void evaluateBlockAndDerivative(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                                    const ValueClass& x2, const ValueClass& y2, const ValueClass& z2,
+    void evaluateBlockAndDerivative(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt,
+                                    const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
                                     ValueClass block[1], ValueClass blockDerivative[3]) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        const ValueClass r = FMath::Sqrt(diffx*diffx +
-                                    diffy*diffy +
-                                    diffz*diffz);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz);
         const ValueClass r2 = r*r;
         const ValueClass r3 = r2*r;
         const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3);
@@ -421,15 +448,15 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
 
 
 
-    FReal evaluate(const FPoint<FReal>& p1, const FPoint<FReal>& p2) const{
-        return evaluate<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ());
+    FReal evaluate(const FPoint<FReal>& pt, const FPoint<FReal>& ps) const{
+        return evaluate<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ());
     }
-    void evaluateBlock(const FPoint<FReal>& p1, const FPoint<FReal>& p2, FReal* block) const{
-        evaluateBlock<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block);
+    void evaluateBlock(const FPoint<FReal>& pt, const FPoint<FReal>& ps, FReal* block) const{
+        evaluateBlock<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block);
     }
-    void evaluateBlockAndDerivative(const FPoint<FReal>& p1, const FPoint<FReal>& p2,
+    void evaluateBlockAndDerivative(const FPoint<FReal>& pt, const FPoint<FReal>& ps,
                                     FReal block[1], FReal blockDerivative[3]) const {
-        evaluateBlockAndDerivative<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block, blockDerivative);
+        evaluateBlockAndDerivative<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block, blockDerivative);
     }
 };
 
@@ -449,54 +476,59 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal>
 
     FInterpMatrixKernelAPLUSRR(const FReal inCoreWidth = .25)
     : CoreWidth(inCoreWidth)
-     {}
+    {}
 
     // copy ctor
     FInterpMatrixKernelAPLUSRR(const FInterpMatrixKernelAPLUSRR& other)
     : CoreWidth(other.CoreWidth)
     {}
 
-  static const char* getID() { return "ONE_OVER_A_PLUS_RR"; }
+    static const char* getID() { return "ONE_OVER_A_PLUS_RR"; }
+
+    static void printInfo() { std::cout << "K(x,y)=1/r with r=|x-y|" << std::endl; }
 
     // returns position in reduced storage
     int getPosition(const unsigned int) const
     {return 0;}
 
+    // returns coefficient of mutual interaction
+    // 1 for symmetric kernels
+    // -1 for antisymmetric kernels
+    // somethings else if other property of symmetry
+    FReal getMutualCoefficient() const{ return FReal(1.); }
+
     // evaluate interaction
     template <class ValueClass>
-    ValueClass evaluate(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                   const ValueClass& x2, const ValueClass& y2, const ValueClass& z2) const
+    ValueClass evaluate(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt,
+                        const ValueClass& xs, const ValueClass& ys, const ValueClass& zs) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        return FMath::One<ValueClass>() / (FMath::ConvertTo<ValueClass,FReal>(CoreWidth) + // WHY FReal??
-                                           diffx*diffx +
-                                           diffy*diffy +
-                                           diffz*diffz);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
+        return FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth));
     }
 
     // evaluate interaction (blockwise)
     template <class ValueClass>
-    void evaluateBlock(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                       const ValueClass& x2, const ValueClass& y2, const ValueClass& z2, ValueClass* block) const
+    void evaluateBlock(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                       const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
+                       ValueClass* block) const
     {
-        block[0]=this->evaluate(x1,y1,z1,x2,y2,z2);
+        block[0]=this->evaluate(xt,yt,zt,xs,ys,zs);
     }
 
     // evaluate interaction and derivative (blockwise)
     template <class ValueClass>
-    void evaluateBlockAndDerivative(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                                    const ValueClass& x2, const ValueClass& y2, const ValueClass& z2,
+    void evaluateBlockAndDerivative(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt,
+                                    const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
                                     ValueClass block[1], ValueClass blockDerivative[3]) const
     {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        const ValueClass r2 = (diffx*diffx +
-                               diffy*diffy +
-                               diffz*diffz);
-        const ValueClass one_over_a_plus_r2 = FMath::One<ValueClass>() / (FMath::ConvertTo<ValueClass,FReal>(CoreWidth)+r2);
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
+        const ValueClass one_over_a_plus_r2 = FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth));
         const ValueClass one_over_a_plus_r2_squared = one_over_a_plus_r2*one_over_a_plus_r2;
 
         block[0] = one_over_a_plus_r2;
@@ -521,240 +553,22 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal>
         return FReal(1.0);    
     }
 
-    FReal evaluate(const FPoint<FReal>& p1, const FPoint<FReal>& p2) const{
-        return evaluate<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ());
+    FReal evaluate(const FPoint<FReal>& pt, const FPoint<FReal>& ps) const{
+        return evaluate<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ());
     }
-    void evaluateBlock(const FPoint<FReal>& p1, const FPoint<FReal>& p2, FReal* block) const{
-        evaluateBlock<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block);
+    void evaluateBlock(const FPoint<FReal>& pt, const FPoint<FReal>& ps, FReal* block) const{
+        evaluateBlock<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block);
     }
-    void evaluateBlockAndDerivative(const FPoint<FReal>& p1, const FPoint<FReal>& p2,
+    void evaluateBlockAndDerivative(const FPoint<FReal>& pt, const FPoint<FReal>& ps,
                                     FReal block[1], FReal blockDerivative[3]) const {
-        evaluateBlockAndDerivative<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block, blockDerivative);
+        evaluateBlockAndDerivative<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block, blockDerivative);
     }
 };
 
 
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-//
-// Tensorial Matrix Kernels (NCMP>1)
-//
-// The definition of the potential p and force f are extended to the case
-// of tensorial interaction kernels:
-// p_i(x) = K_{ip}(x,y)w_p(y), \forall i=1..NPOT, p=1..NPV
-// f_{ik}= w_p(x)K_{ip,k}(x,y)w_p(y) "
-//
-// Since the interpolation scheme is such that
-// p_i(x) \approx S^m(x) L^{m}_{ip}
-// f_{ik}= w_p(x) \nabla_k S^m(x) L^{m}_{ip}
-// with
-// L^{m}_{ip} = K^{mn}_{ip} S^n(y) w_p(y) (local expansion)
-// M^{m}_{p} = S^n(y) w_p(y) (multipole expansion)
-// then the multipole exp have NPV components and the local exp NPOT*NPV.
-//
-// NB1: Only the computation of forces requires that the sum over p is 
-// performed at L2P step. It could be done at M2L step for the potential.
-//
-// NB2: An efficient application of the matrix kernel is highly kernel 
-// dependent, we recommand overriding the P2M/M2L/L2P function of the kernel 
-// you are using in order to have optimal performances + set your own NRHS/NLHS.
-//
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-
-/// R_{,ij}
-// PB: IMPORTANT! This matrix kernel does not present the symmetries 
-// required by ChebSym kernel => only suited for Unif interpolation
-template <class FReal>
-struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
-{
-    static const KERNEL_FUNCTION_TYPE Type = NON_HOMOGENEOUS;
-    static const unsigned int NK   = 3*3; //< total number of components
-    static const unsigned int NCMP = 6;   //< number of components
-    static const unsigned int NPV  = 3;   //< dim of physical values
-    static const unsigned int NPOT = 3;   //< dim of potentials
-    static const unsigned int NRHS = NPV; //< dim of mult exp
-    static const unsigned int NLHS = NPOT*NRHS; //< dim of loc exp
-
-    // store indices (i,j) corresponding to sym idx
-    static const unsigned int indexTab[/*2*NCMP=12*/];
-
-    // store positions in sym tensor (when looping over NRHSxNLHS)
-    static const unsigned int applyTab[/*NK=9*/];
-
-    // indices to be set at construction if component-wise evaluation is performed
-    const unsigned int _i,_j;
-
-    // Material Parameters
-    const FReal _CoreWidth2; // if >0 then kernel is NON homogeneous
-
-    FInterpMatrixKernel_R_IJ(const FReal CoreWidth = 0.0, const unsigned int d = 0)
-        : _i(indexTab[d]), _j(indexTab[d+NCMP]), _CoreWidth2(CoreWidth*CoreWidth)
-    {}
-
-    // copy ctor
-    FInterpMatrixKernel_R_IJ(const FInterpMatrixKernel_R_IJ& other)
-        : _i(other._i), _j(other._j), _CoreWidth2(other._CoreWidth2)
-    {}
-
-  static const char* getID() { return "R_IJ"; }
-
-    // returns position in reduced storage from position in full 3x3 matrix
-    unsigned  int getPosition(const unsigned int n) const
-    {return applyTab[n];}
-
-    // returns Core Width squared
-    FReal getCoreWidth2() const
-    {return _CoreWidth2;}
-
-    // evaluate interaction
-    template <class ValueClass>
-    FReal evaluate(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                   const ValueClass& x2, const ValueClass& y2, const ValueClass& z2) const
-    {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(diffx*diffx +
-                                                diffy*diffy +
-                                                diffz*diffz + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
-        const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
-        ValueClass ri,rj;
-
-        if(_i==0) ri=diffx;
-        else if(_i==1) ri=diffy;
-        else if(_i==2) ri=diffz;
-        else throw std::runtime_error("Update i!");
-
-        if(_j==0) rj=diffx;
-        else if(_j==1) rj=diffy;
-        else if(_j==2) rj=diffz;
-        else throw std::runtime_error("Update j!");
-
-        if(_i==_j)
-            return one_over_r - ri * ri * one_over_r3;
-        else
-            return - ri * rj * one_over_r3;
-
-    }
-
-    // evaluate interaction (blockwise)
-    template <class ValueClass>
-    void evaluateBlock(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                       const ValueClass& x2, const ValueClass& y2, const ValueClass& z2, ValueClass* block) const
-    {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(diffx*diffx +
-                                                diffy*diffy +
-                                                diffz*diffz + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
-        const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
-        const ValueClass r[3] = {diffx,diffy,diffz};
-
-        for(unsigned int d=0;d<NCMP;++d){
-            unsigned int i = indexTab[d];
-            unsigned int j = indexTab[d+NCMP];
-
-            if(i==j)
-                block[d] = one_over_r - r[i] * r[i] * one_over_r3;
-            else
-                block[d] = - r[i] * r[j] * one_over_r3;
-        }
-    }
-
-    // evaluate interaction and derivative (blockwise)
-    template <class ValueClass>
-    void evaluateBlockAndDerivative(const ValueClass& x1, const ValueClass& y1, const ValueClass& z1,
-                                    const ValueClass& x2, const ValueClass& y2, const ValueClass& z2,
-                                    ValueClass block[NCMP], ValueClass blockDerivative[NCMP][3]) const
-    {
-        const ValueClass diffx = (x1-x2);
-        const ValueClass diffy = (y1-y2);
-        const ValueClass diffz = (z1-z2);
-        const ValueClass r2[3] = {diffx*diffx,diffy*diffy,diffz*diffz};
-        const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2[0] + r2[1] + r2[2]);
-        const ValueClass one_over_r = FMath::Sqrt(one_over_r2);
-        const ValueClass one_over_r3 = one_over_r2*one_over_r;
-
-        const ValueClass r[3] = {diffx,diffy,diffz};
-
-        const ValueClass Three = FMath::ConvertTo<ValueClass,FReal>(3.);
-        const ValueClass MinusOne = -FMath::One<ValueClass>();
-
-        for(unsigned int d=0;d<NCMP;++d){
-            unsigned int i = indexTab[d];
-            unsigned int j = indexTab[d+NCMP];
-
-            // evaluate kernel
-            if(i==j)
-                block[d] = one_over_r - r2[i] * one_over_r3;
-            else
-                block[d] = - r[i] * r[j] * one_over_r3;
-
-            // evaluate derivative
-            for(unsigned int k = 0 ; k < 3 ; ++k){
-              if(i==j){
-                if(j==k) //i=j=k
-                  blockDerivative[d][k] = Three * ( MinusOne + r2[i] * one_over_r2 ) * r[i] * one_over_r3;
-                else //i=j!=k
-                  blockDerivative[d][k] = ( MinusOne + Three * r2[i] * one_over_r2 ) * r[k] * one_over_r3;
-              }
-              else{ //(i!=j)
-                if(i==k) //i=k!=j
-                  blockDerivative[d][k] = ( MinusOne + Three * r2[i] * one_over_r2 ) * r[j] * one_over_r3;
-                else if(j==k) //i!=k=j
-                  blockDerivative[d][k] = ( MinusOne + Three * r2[j] * one_over_r2 ) * r[i] * one_over_r3;
-                else //i!=k!=j
-                  blockDerivative[d][k] = Three * r[i] * r[j] * r[k] * one_over_r2 * one_over_r3;
-              }
-            }// k
-
-        }// NCMP
-    }
-
-    FReal getScaleFactor(const FReal RootCellWidth, const int TreeLevel) const
-    {
-        const FReal CellWidth(RootCellWidth / FReal(FMath::pow(2, TreeLevel)));
-        return getScaleFactor(CellWidth);
-    }
-
-    // R_{,ij} is homogeneous to [L]/[L]^{-2}=[L]^{-1}
-    // => scales like ONE_OVER_R
-    FReal getScaleFactor(const FReal CellWidth) const
-    {
-        return FReal(2.) / CellWidth;
-    }
-
-
-
-    FReal evaluate(const FPoint<FReal>& p1, const FPoint<FReal>& p2) const{
-        return evaluate<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ());
-    }
-    void evaluateBlock(const FPoint<FReal>& p1, const FPoint<FReal>& p2, FReal* block) const{
-        evaluateBlock<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block);
-    }
-    void evaluateBlockAndDerivative(const FPoint<FReal>& p1, const FPoint<FReal>& p2,
-                                    FReal block[NCMP], FReal blockDerivative[NCMP][3]) const {
-        evaluateBlockAndDerivative<FReal>(p1.getX(), p1.getY(), p1.getZ(), p2.getX(), p2.getY(), p2.getZ(), block, blockDerivative);
-    }
-};
-
-/// R_IJ
-template <class FReal>
-const unsigned int FInterpMatrixKernel_R_IJ<FReal>::indexTab[]={0,0,0,1,1,2,
-                                                         0,1,2,1,2,2};
-
-template <class FReal>
-const unsigned int FInterpMatrixKernel_R_IJ<FReal>::applyTab[]={0,1,2,
-                                                         1,3,4,
-                                                         2,4,5};
-
-
 
 /*!  Functor which provides the interface to assemble a matrix based on the
-  number of rows and cols and on the coordinates x and y and the type of the
+  number of rows and cols and on the coordinates s and t and the type of the
   generating matrix-kernel function.
 */
 template <class FReal, typename MatrixKernelClass>
@@ -762,45 +576,45 @@ class EntryComputer
 {
     const MatrixKernelClass *const MatrixKernel;
 
-    const unsigned int nx, ny;
-    const FPoint<FReal> *const px, *const py;
+    const unsigned int nt, ns;
+    const FPoint<FReal> *const pt, *const ps;
 
     const FReal *const weights;
 
 public:
     explicit EntryComputer(const MatrixKernelClass *const inMatrixKernel,
-                           const unsigned int _nx, const FPoint<FReal> *const _px,
-                           const unsigned int _ny, const FPoint<FReal> *const _py,
+                           const unsigned int _nt, const FPoint<FReal> *const _ps,
+                           const unsigned int _ns, const FPoint<FReal> *const _pt,
                            const FReal *const _weights = NULL)
-        : MatrixKernel(inMatrixKernel),	nx(_nx), ny(_ny), px(_px), py(_py), weights(_weights) {}
+        : MatrixKernel(inMatrixKernel),	nt(_nt), ns(_ns), pt(_pt), ps(_ps), weights(_weights) {}
 
-    void operator()(const unsigned int xbeg, const unsigned int xend,
-                    const unsigned int ybeg, const unsigned int yend,
+    void operator()(const unsigned int tbeg, const unsigned int tend,
+                    const unsigned int sbeg, const unsigned int send,
                     FReal *const data) const
     {
         unsigned int idx = 0;
         if (weights) {
-            for (unsigned int j=ybeg; j<yend; ++j)
-                for (unsigned int i=xbeg; i<xend; ++i)
-                    data[idx++] = weights[i] * weights[j] * MatrixKernel->evaluate(px[i], py[j]);
+            for (unsigned int j=tbeg; j<tend; ++j)
+                for (unsigned int i=sbeg; i<send; ++i)
+                    data[idx++] = weights[i] * weights[j] * MatrixKernel->evaluate(pt[i], ps[j]);
         } else {
-            for (unsigned int j=ybeg; j<yend; ++j)
-                for (unsigned int i=xbeg; i<xend; ++i)
-                    data[idx++] = MatrixKernel->evaluate(px[i], py[j]);
+            for (unsigned int j=tbeg; j<tend; ++j)
+                for (unsigned int i=sbeg; i<send; ++i)
+                    data[idx++] = MatrixKernel->evaluate(pt[i], ps[j]);
         }
 
         /*
     // apply weighting matrices
     if (weights) {
-    if ((xend-xbeg) == (yend-ybeg) && (xend-xbeg) == nx)
-    for (unsigned int n=0; n<nx; ++n) {
-    FBlas::scal(nx, weights[n], data + n,  nx); // scale rows
-    FBlas::scal(nx, weights[n], data + n * nx); // scale cols
-    }
-    else if ((xend-xbeg) == 1 && (yend-ybeg) == ny)
-    for (unsigned int j=0; j<ny; ++j)	data[j] *= weights[j];
-    else if ((yend-ybeg) == 1 && (xend-xbeg) == nx)
-    for (unsigned int i=0; i<nx; ++i)	data[i] *= weights[i];
+    if ((tend-tbeg) == (send-sbeg) && (tend-tbeg) == nt)
+    for (unsigned int n=0; n<nt; ++n) {
+    FBlas::scal(nt, weights[n], data + n,  nt); // scale rows
+    FBlas::scal(nt, weights[n], data + n * nt); // scale cols
+    }
+    else if ((tend-tbeg) == 1 && (send-sbeg) == ns)
+    for (unsigned int j=0; j<ns; ++j)	data[j] *= weights[j];
+    else if ((send-sbeg) == 1 && (tend-tbeg) == nt)
+    for (unsigned int i=0; i<nt; ++i)	data[i] *= weights[i];
     }
     */
 
diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e05977bfa8a209b4a7b31ddfc8ab9a947d205808
--- /dev/null
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp
@@ -0,0 +1,296 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Berenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FINTERPMATRIXKERNEL_TENSORIAL_INTERACTIONS_HPP
+#define FINTERPMATRIXKERNEL_TENSORIAL_INTERACTIONS_HPP
+
+#include <stdexcept>
+
+#include "Utils/FPoint.hpp"
+#include "Utils/FNoCopyable.hpp"
+#include "Utils/FMath.hpp"
+#include "Utils/FGlobal.hpp"
+
+#include "FInterpMatrixKernel.hpp"
+
+
+/**
+ *
+ * @author Pierre Blanchard (pierre.blanchard@inria.fr)
+ * @class FInterpMatrixKernels_TensorialInteractions
+ * Please read the license
+ *
+ * This class provides the evaluators and scaling functions of the matrix
+ * kernels. A matrix kernel should be understood in the sense of a kernel
+ * of interaction (or the fundamental solution of a given equation).
+ * It can either be scalar (NCMP=1) or tensorial (NCMP>1) depending on the
+ * dimension of the equation considered. NCMP denotes the number of components
+ * that are actually stored (e.g. 6 for a \f$3\times3\f$ symmetric tensor).
+ * 
+ * Notes on the application scheme:
+ * Let there be a kernel \f$K\f$ such that \f$P_i(X)X=K_{ij}(X,Y)W_j(Y)\f$
+ * with \f$P\f$ the lhs of size NLHS and \f$W\f$ the rhs of size NRHS.
+ * The table applyTab provides the indices in the reduced storage table
+ * corresponding to the application scheme depicted earlier.
+ *
+ * PB: BEWARE! Homogeneous matrix kernels do not support cell width extension
+ * yet. Is it possible to find a reference width and a scale factor such that
+ * only 1 set of M2L opt can be used for all levels??
+ *
+ */
+
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+//
+// Tensorial Matrix Kernels (NCMP>1)
+//
+// The definition of the potential p and force f are extended to the case
+// of tensorial interaction kernels:
+// p_i(x) = K_{ip}(x,y)w_p(y), \forall i=1..NPOT, p=1..NPV
+// f_{ik}= w_p(x)K_{ip,k}(x,y)w_p(y) "
+//
+// Since the interpolation scheme is such that
+// p_i(x) \approx S^m(x) L^{m}_{ip}
+// f_{ik}= w_p(x) \nabla_k S^m(x) L^{m}_{ip}
+// with
+// L^{m}_{ip} = K^{mn}_{ip} S^n(y) w_p(y) (local expansion)
+// M^{m}_{p} = S^n(y) w_p(y) (multipole expansion)
+// then the multipole exp have NPV components and the local exp NPOT*NPV.
+//
+// NB1: Only the computation of forces requires that the sum over p is 
+// performed at L2P step. It could be done at M2L step for the potential.
+//
+// NB2: An efficient application of the matrix kernel is highly kernel 
+// dependent, we recommand overriding the P2M/M2L/L2P function of the kernel 
+// you are using in order to have opsimal performances + set your own NRHS/NLHS.
+//
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// R_{,ij}
+// PB: IMPORTANT! This matrix kernel does not present the symmetries 
+// required by ChebSym kernel => only suited for Unif interpolation
+template <class FReal>
+struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
+{
+    static const KERNEL_FUNCTION_TYPE Type = NON_HOMOGENEOUS;
+    static const unsigned int NK   = 3*3; //< total number of components
+    static const unsigned int NCMP = 6;   //< number of independent components
+    static const unsigned int NPV  = 3;   //< dim of physical values
+    static const unsigned int NPOT = 3;   //< dim of potentials
+    static const unsigned int NRHS = NPV; //< dim of mult exp
+    static const unsigned int NLHS = NPOT*NPV; //< dim of loc exp (no csummation over j during M2L)
+
+    // store indices (i,j) corresponding to sym idx
+    static const unsigned int indexTab[/*2*NCMP=12*/];
+
+    // store positions in sym tensor (when looping over NRHSxNLHS)
+    static const unsigned int applyTab[/*NK=9*/];
+
+    // indices to be set at construction if component-wise evaluation is performed
+    const unsigned int _i,_j;
+
+    // Material Parameters
+    const FReal _CoreWidth2; // if >0 then kernel is NON homogeneous
+
+    FInterpMatrixKernel_R_IJ(const FReal CoreWidth = 0.0, const unsigned int d = 0)
+        : _i(indexTab[d]), _j(indexTab[d+NCMP]), _CoreWidth2(CoreWidth*CoreWidth)
+    {}
+
+    // copy ctor
+    FInterpMatrixKernel_R_IJ(const FInterpMatrixKernel_R_IJ& other)
+        : _i(other._i), _j(other._j), _CoreWidth2(other._CoreWidth2)
+    {}
+
+    static const char* getID() { return "R_IJ"; }
+
+    static void printInfo() { 
+        std::cout << "K_ij(x,y)=r_{,ij}=d^2/dx_idx_j(r), with r=|x-y|_a=sqrt(a^2 + (x_i-y_i)^2)." << std::endl; 
+        std::cout << "Compute potentials p_i(x)=sum_{y,j}(K_ij(x,y)w_j(y))" << std::endl; 
+        std::cout << "        forces f_{ik}(x)=-d/dx_k(sum_{y,j}(w_j(x)K_ij(x,y)w_j(y)))" << std::endl; 
+        std::cout << std::endl;
+    }
+
+    // returns position in reduced storage from position in full 3x3 matrix
+    unsigned  int getPosition(const unsigned int n) const
+    {return applyTab[n];}
+
+    // returns Core Width squared
+    FReal getCoreWidth2() const
+    {return _CoreWidth2;}
+
+    // returns coefficient of mutual interaction
+    // 1 for symmetric kernels
+    // -1 for antisymmetric kernels
+    // somethings else if other property of symmetry
+    FReal getMutualCoefficient() const{ return FReal(1.); }
+
+    // evaluate interaction
+    template <class ValueClass>
+    FReal evaluate(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                   const ValueClass& xs, const ValueClass& ys, const ValueClass& zs) const
+    {
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz;
+        const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
+        ValueClass ri,rj;
+
+        if(_i==0) ri=diffx;
+        else if(_i==1) ri=diffy;
+        else if(_i==2) ri=diffz;
+        else throw std::runtime_error("Update i!");
+
+        if(_j==0) rj=diffx;
+        else if(_j==1) rj=diffy;
+        else if(_j==2) rj=diffz;
+        else throw std::runtime_error("Update j!");
+
+        if(_i==_j)
+            return one_over_r - ri * ri * one_over_r3;
+        else
+            return - ri * rj * one_over_r3;
+
+    }
+
+    // evaluate interaction (blockwise)
+    template <class ValueClass>
+    void evaluateBlock(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt, 
+                       const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
+                       ValueClass* block) const
+    {
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz;
+        const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
+
+        const ValueClass r[3] = {diffx,diffy,diffz};
+
+        for(unsigned int d=0;d<NCMP;++d){
+            unsigned int i = indexTab[d];
+            unsigned int j = indexTab[d+NCMP];
+
+            if(i==j)
+                block[d] = one_over_r - r[i] * r[i] * one_over_r3;
+            else
+                block[d] = - r[i] * r[j] * one_over_r3;
+        }
+    }
+
+    // evaluate interaction and derivative (blockwise)
+    // [TODO] Fix! Add corewidth!
+    template <class ValueClass>
+    void evaluateBlockAndDerivative(const ValueClass& xt, const ValueClass& yt, const ValueClass& zt,
+                                    const ValueClass& xs, const ValueClass& ys, const ValueClass& zs,
+                                    ValueClass block[NCMP], ValueClass blockDerivative[NCMP][3]) const
+    {
+        const ValueClass diffx = (xt-xs);
+        const ValueClass diffy = (yt-ys);
+        const ValueClass diffz = (zt-zs);
+        const ValueClass r2[3] = {diffx*diffx,diffy*diffy,diffz*diffz};
+        const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2[0] + r2[1] + r2[2] + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r  = FMath::Sqrt(one_over_r2);
+        const ValueClass one_over_r3 = one_over_r2*one_over_r;
+
+        const ValueClass r[3] = {diffx,diffy,diffz};
+
+        const ValueClass Three = FMath::ConvertTo<ValueClass,FReal>(3.);
+        const ValueClass MinusOne = - FMath::One<ValueClass>();
+
+        for(unsigned int d=0;d<NCMP;++d){
+            unsigned int i = indexTab[d];
+            unsigned int j = indexTab[d+NCMP];
+
+            // evaluate kernel
+            if(i==j)
+                block[d] = one_over_r - r2[i] * one_over_r3;
+            else
+                block[d] = - r[i] * r[j] * one_over_r3;
+
+            // evaluate derivative
+            for(unsigned int k = 0 ; k < 3 ; ++k){
+              if(i==j){
+                if(j==k) //i=j=k
+                  blockDerivative[d][k] = Three * ( MinusOne + r2[i] * one_over_r2 ) * r[i] * one_over_r3;
+                else //i=j!=k
+                  blockDerivative[d][k] = ( MinusOne + Three * r2[i] * one_over_r2 ) * r[k] * one_over_r3;
+              }
+              else{ //(i!=j)
+                if(i==k) //i=k!=j
+                  blockDerivative[d][k] = ( MinusOne + Three * r2[i] * one_over_r2 ) * r[j] * one_over_r3;
+                else if(j==k) //i!=k=j
+                  blockDerivative[d][k] = ( MinusOne + Three * r2[j] * one_over_r2 ) * r[i] * one_over_r3;
+                else //i!=k!=j
+                  blockDerivative[d][k] = Three * r[i] * r[j] * r[k] * one_over_r2 * one_over_r3;
+              }
+            }// k
+
+        }// NCMP
+    }
+
+    FReal getScaleFactor(const FReal RootCellWidth, const int TreeLevel) const
+    {
+        const FReal CellWidth(RootCellWidth / FReal(FMath::pow(2, TreeLevel)));
+        return getScaleFactor(CellWidth);
+    }
+
+    // R_{,ij} is homogeneous to [L]/[L]^{-2}=[L]^{-1}
+    // => scales like ONE_OVER_R
+    FReal getScaleFactor(const FReal CellWidth) const
+    {
+        return FReal(2.) / CellWidth;
+    }
+
+
+
+    FReal evaluate(const FPoint<FReal>& pt, const FPoint<FReal>& ps) const{
+        return evaluate<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ());
+    }
+    void evaluateBlock(const FPoint<FReal>& pt, const FPoint<FReal>& ps, FReal* block) const{
+        evaluateBlock<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block);
+    }
+    void evaluateBlockAndDerivative(const FPoint<FReal>& pt, const FPoint<FReal>& ps,
+                                    FReal block[NCMP], FReal blockDerivative[NCMP][3]) const {
+        evaluateBlockAndDerivative<FReal>(pt.getX(), pt.getY(), pt.getZ(), ps.getX(), ps.getY(), ps.getZ(), block, blockDerivative);
+    }
+};
+
+/// R_IJ
+template <class FReal>
+const unsigned int FInterpMatrixKernel_R_IJ<FReal>::indexTab[]={0,0,0,1,1,2,
+                                                                0,1,2,1,2,2};
+
+template <class FReal>
+const unsigned int FInterpMatrixKernel_R_IJ<FReal>::applyTab[]={0,1,2,
+                                                                1,3,4,
+                                                                2,4,5};
+
+
+
+
+
+
+
+
+#endif // FINTERPMATRIXKERNEL_TENSORIAL_INTERACTIONS_HPP
+
+// [--END--]
+
diff --git a/Src/Kernels/P2P/FP2P.hpp b/Src/Kernels/P2P/FP2P.hpp
index 1577568babecf4a7a30dcfd3bc86ad9e5650a82b..9f5e380047fac01d4c3990797e12bc42141da69e 100644
--- a/Src/Kernels/P2P/FP2P.hpp
+++ b/Src/Kernels/P2P/FP2P.hpp
@@ -48,10 +48,10 @@ namespace FP2P {
    * @param MatrixKernel pointer to an interaction kernel evaluator
    */
 template <class FReal, typename MatrixKernelClass>
-inline void MutualParticles(const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal sourcePhysicalValue,
-                            FReal* sourceForceX, FReal* sourceForceY, FReal* sourceForceZ, FReal* sourcePotential,
-                            const FReal targetX,const FReal targetY,const FReal targetZ, const FReal targetPhysicalValue,
+inline void MutualParticles(const FReal targetX,const FReal targetY,const FReal targetZ, const FReal targetPhysicalValue,
                             FReal* targetForceX, FReal* targetForceY, FReal* targetForceZ, FReal* targetPotential,
+                            const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal sourcePhysicalValue,
+                            FReal* sourceForceX, FReal* sourceForceY, FReal* sourceForceZ, FReal* sourcePotential,
                             const MatrixKernelClass *const MatrixKernel){
 
     // Compute kernel of interaction...
@@ -59,7 +59,9 @@ inline void MutualParticles(const FReal sourceX,const FReal sourceY,const FReal
     const FPoint<FReal> targetPoint(targetX,targetY,targetZ);
     FReal Kxy[1];
     FReal dKxy[3];
-    MatrixKernel->evaluateBlockAndDerivative(sourcePoint,targetPoint,Kxy,dKxy);
+    MatrixKernel->evaluateBlockAndDerivative(targetPoint,sourcePoint,Kxy,dKxy);
+    const FReal mutual_coeff = MatrixKernel->getMutualCoefficient(); // 1 if symmetric; -1 if antisymmetric
+
     FReal coef = (targetPhysicalValue * sourcePhysicalValue);
 
     (*targetForceX) += dKxy[0] * coef;
@@ -70,7 +72,7 @@ inline void MutualParticles(const FReal sourceX,const FReal sourceY,const FReal
     (*sourceForceX) -= dKxy[0] * coef;
     (*sourceForceY) -= dKxy[1] * coef;
     (*sourceForceZ) -= dKxy[2] * coef;
-    (*sourcePotential) += ( Kxy[0] * targetPhysicalValue );
+    (*sourcePotential) += ( mutual_coeff * Kxy[0] * targetPhysicalValue );
 }
 
 /**
@@ -89,9 +91,9 @@ inline void MutualParticles(const FReal sourceX,const FReal sourceY,const FReal
    * \f$ F(x) = \frac{ \Delta_x * q_1 * q_2 }{ r^2 } \f$
    */
 template <class FReal, typename MatrixKernelClass>
-inline void NonMutualParticles(const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal sourcePhysicalValue,
-                               const FReal targetX,const FReal targetY,const FReal targetZ, const FReal targetPhysicalValue,
+inline void NonMutualParticles(const FReal targetX,const FReal targetY,const FReal targetZ, const FReal targetPhysicalValue,
                                FReal* targetForceX, FReal* targetForceY, FReal* targetForceZ, FReal* targetPotential,
+                               const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal sourcePhysicalValue,
                                const MatrixKernelClass *const MatrixKernel){
 
     // Compute kernel of interaction...
@@ -99,7 +101,8 @@ inline void NonMutualParticles(const FReal sourceX,const FReal sourceY,const FRe
     const FPoint<FReal> targetPoint(targetX,targetY,targetZ);
     FReal Kxy[1];
     FReal dKxy[3];
-    MatrixKernel->evaluateBlockAndDerivative(sourcePoint,targetPoint,Kxy,dKxy);
+    MatrixKernel->evaluateBlockAndDerivative(targetPoint,sourcePoint,Kxy,dKxy);
+
     FReal coef = (targetPhysicalValue * sourcePhysicalValue);
 
     (*targetForceX) += dKxy[0] * coef;
@@ -108,282 +111,7 @@ inline void NonMutualParticles(const FReal sourceX,const FReal sourceY,const FRe
     (*targetPotential) += ( Kxy[0] * sourcePhysicalValue );
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Tensorial Matrix Kernels: K_IJ / p_i=\sum_j K_{ij} w_j
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/**
-   * @brief MutualParticlesKIJ
-   * @param sourceX
-   * @param sourceY
-   * @param sourceZ
-   * @param sourcePhysicalValue
-   * @param sourceForceX
-   * @param sourceForceY
-   * @param sourceForceZ
-   * @param sourcePotential
-   * @param targetX
-   * @param targetY
-   * @param targetZ
-   * @param targetPhysicalValue
-   * @param targetForceX
-   * @param targetForceY
-   * @param targetForceZ
-   * @param targetPotential
-   */
-template<class FReal, typename MatrixKernelClass>
-inline void MutualParticlesKIJ(const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal* sourcePhysicalValue,
-                               FReal* sourceForceX, FReal* sourceForceY, FReal* sourceForceZ, FReal* sourcePotential,
-                               const FReal targetX,const FReal targetY,const FReal targetZ, const FReal* targetPhysicalValue,
-                               FReal* targetForceX, FReal* targetForceY, FReal* targetForceZ, FReal* targetPotential,
-                               const MatrixKernelClass *const MatrixKernel){
-
-    // get information on tensorial aspect of matrix kernel
-    const int ncmp = MatrixKernelClass::NCMP;
-    const int applyTab[9] = {0,1,2,
-                             1,3,4,
-                             2,4,5};
-
-    // evaluate kernel and its partial derivatives
-    const FPoint<FReal> sourcePoint(sourceX,sourceY,sourceZ);
-    const FPoint<FReal> targetPoint(targetX,targetY,targetZ);
-    FReal Kxy[ncmp];
-    FReal dKxy[ncmp][3];
-    MatrixKernel->evaluateBlockAndDerivative(sourcePoint,targetPoint,Kxy,dKxy);
-
-    for(unsigned int i = 0 ; i < 3 ; ++i){
-        for(unsigned int j = 0 ; j < 3 ; ++j){
-
-            // update component to be applied
-            const int d = applyTab[i*3+j];
-
-            // forces prefactor
-            const FReal coef = -(targetPhysicalValue[j] * sourcePhysicalValue[j]);
-
-            targetForceX[i] += dKxy[d][0] * coef;
-            targetForceY[i] += dKxy[d][1] * coef;
-            targetForceZ[i] += dKxy[d][2] * coef;
-            targetPotential[i] += ( Kxy[d] * sourcePhysicalValue[j] );
-
-            sourceForceX[i] -= dKxy[d][0] * coef;
-            sourceForceY[i] -= dKxy[d][1] * coef;
-            sourceForceZ[i] -= dKxy[d][2] * coef;
-            sourcePotential[i] += ( Kxy[d] * targetPhysicalValue[j] );
-
-        }// j
-    }// i
-
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Tensorial Matrix Kernels: K_IJ
-// TODO: Implement SSE and AVX variants then move following FullMutualKIJ and FullRemoteKIJ to FP2P.hpp
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/**
- * @brief FullMutualKIJ
- */
-template <class FReal, class ContainerClass, typename MatrixKernelClass>
-inline void FullMutualKIJ(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                          const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-
-    // get information on tensorial aspect of matrix kernel
-    const int ncmp = MatrixKernelClass::NCMP;
-    const int applyTab[9] = {0,1,2,
-                             1,3,4,
-                             2,4,5};
-
-    const FSize nbParticlesTargets = inTargets->getNbParticles();
-    const FReal*const targetsX = inTargets->getPositions()[0];
-    const FReal*const targetsY = inTargets->getPositions()[1];
-    const FReal*const targetsZ = inTargets->getPositions()[2];
-
-    for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
-        for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-            if( inNeighbors[idxNeighbors] ){
-                const FSize nbParticlesSources = inNeighbors[idxNeighbors]->getNbParticles();
-                const FReal*const sourcesX = inNeighbors[idxNeighbors]->getPositions()[0];
-                const FReal*const sourcesY = inNeighbors[idxNeighbors]->getPositions()[1];
-                const FReal*const sourcesZ = inNeighbors[idxNeighbors]->getPositions()[2];
-
-                for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
-
-                    // evaluate kernel and its partial derivatives
-                    const FPoint<FReal> sourcePoint(sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource]);
-                    const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
-                    FReal Kxy[ncmp];
-                    FReal dKxy[ncmp][3];
-                    MatrixKernel->evaluateBlockAndDerivative(sourcePoint,targetPoint,Kxy,dKxy);
-
-                    for(unsigned int i = 0 ; i < 3 ; ++i){
-                        FReal*const targetsPotentials = inTargets->getPotentials(i);
-                        FReal*const targetsForcesX = inTargets->getForcesX(i);
-                        FReal*const targetsForcesY = inTargets->getForcesY(i);
-                        FReal*const targetsForcesZ = inTargets->getForcesZ(i);
-                        FReal*const sourcesPotentials = inNeighbors[idxNeighbors]->getPotentials(i);
-                        FReal*const sourcesForcesX = inNeighbors[idxNeighbors]->getForcesX(i);
-                        FReal*const sourcesForcesY = inNeighbors[idxNeighbors]->getForcesY(i);
-                        FReal*const sourcesForcesZ = inNeighbors[idxNeighbors]->getForcesZ(i);
-
-                        for(unsigned int j = 0 ; j < 3 ; ++j){
-                            const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues(j);
-                            const FReal*const sourcesPhysicalValues = inNeighbors[idxNeighbors]->getPhysicalValues(j);
-
-                            // update component to be applied
-                            const int d = applyTab[i*3+j];
-
-                            // forces prefactor
-                            FReal coef = -(targetsPhysicalValues[idxTarget] * sourcesPhysicalValues[idxSource]);
-
-                            targetsForcesX[idxTarget] += dKxy[d][0] * coef;
-                            targetsForcesY[idxTarget] += dKxy[d][1] * coef;
-                            targetsForcesZ[idxTarget] += dKxy[d][2] * coef;
-                            targetsPotentials[idxTarget] += ( Kxy[d] * sourcesPhysicalValues[idxSource] );
-
-                            sourcesForcesX[idxSource] -= dKxy[d][0] * coef;
-                            sourcesForcesY[idxSource] -= dKxy[d][1] * coef;
-                            sourcesForcesZ[idxSource] -= dKxy[d][2] * coef;
-                            sourcesPotentials[idxSource] += Kxy[d] * targetsPhysicalValues[idxTarget];
-
-                        }// j
-                    }// i
-                }
-            }
-        }
-    }
-}
-
-template <class FReal, class ContainerClass, typename MatrixKernelClass>
-inline void InnerKIJ(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-
-    // get information on tensorial aspect of matrix kernel
-    const int ncmp = MatrixKernelClass::NCMP;
-    const int applyTab[9] = {0,1,2,
-                             1,3,4,
-                             2,4,5};
-
-    const FSize nbParticlesTargets = inTargets->getNbParticles();
-    const FReal*const targetsX = inTargets->getPositions()[0];
-    const FReal*const targetsY = inTargets->getPositions()[1];
-    const FReal*const targetsZ = inTargets->getPositions()[2];
-
-    for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-        for(FSize idxSource = idxTarget + 1 ; idxSource < nbParticlesTargets ; ++idxSource){
-
-            // evaluate kernel and its partial derivatives
-            const FPoint<FReal> sourcePoint(targetsX[idxSource],targetsY[idxSource],targetsZ[idxSource]);
-            const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
-            FReal Kxy[ncmp];
-            FReal dKxy[ncmp][3];
-            MatrixKernel->evaluateBlockAndDerivative(sourcePoint,targetPoint,Kxy,dKxy);
-
-            for(unsigned int i = 0 ; i < 3 ; ++i){
-                FReal*const targetsPotentials = inTargets->getPotentials(i);
-                FReal*const targetsForcesX = inTargets->getForcesX(i);
-                FReal*const targetsForcesY = inTargets->getForcesY(i);
-                FReal*const targetsForcesZ = inTargets->getForcesZ(i);
-
-                for(unsigned int j = 0 ; j < 3 ; ++j){
-                    const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues(j);
-
-                    // update component to be applied
-                    const int d = applyTab[i*3+j];
-
-                    // forces prefactor
-                    const FReal coef = -(targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource]);
-
-                    targetsForcesX[idxTarget] += dKxy[d][0] * coef;
-                    targetsForcesY[idxTarget] += dKxy[d][1] * coef;
-                    targetsForcesZ[idxTarget] += dKxy[d][2] * coef;
-                    targetsPotentials[idxTarget] += ( Kxy[d] * targetsPhysicalValues[idxSource] );
-
-                    targetsForcesX[idxSource] -= dKxy[d][0] * coef;
-                    targetsForcesY[idxSource] -= dKxy[d][1] * coef;
-                    targetsForcesZ[idxSource] -= dKxy[d][2] * coef;
-                    targetsPotentials[idxSource] += Kxy[d] * targetsPhysicalValues[idxTarget];
-                }// j
-            }// i
-
-        }
-    }
-}
-
-/**
-   * @brief FullRemoteKIJ
-   */
-template <class FReal, class ContainerClass, typename MatrixKernelClass>
-inline void FullRemoteKIJ(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                          const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-
-    // get information on tensorial aspect of matrix kernel
-    const int ncmp = MatrixKernelClass::NCMP;
-    const int applyTab[9] = {0,1,2,
-                             1,3,4,
-                             2,4,5};
-
-    const FSize nbParticlesTargets = inTargets->getNbParticles();
-    const FReal*const targetsX = inTargets->getPositions()[0];
-    const FReal*const targetsY = inTargets->getPositions()[1];
-    const FReal*const targetsZ = inTargets->getPositions()[2];
-
-    for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
-        for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-            if( inNeighbors[idxNeighbors] ){
-                const FSize nbParticlesSources = inNeighbors[idxNeighbors]->getNbParticles();
-                const FReal*const sourcesX = inNeighbors[idxNeighbors]->getPositions()[0];
-                const FReal*const sourcesY = inNeighbors[idxNeighbors]->getPositions()[1];
-                const FReal*const sourcesZ = inNeighbors[idxNeighbors]->getPositions()[2];
-
-                for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
-
-                    // evaluate kernel and its partial derivatives
-                    const FPoint<FReal> sourcePoint(sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource]);
-                    const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
-                    FReal Kxy[ncmp];
-                    FReal dKxy[ncmp][3];
-                    MatrixKernel->evaluateBlockAndDerivative(sourcePoint,targetPoint,Kxy,dKxy);
-
-                    for(unsigned int i = 0 ; i < 3 ; ++i){
-                        FReal*const targetsPotentials = inTargets->getPotentials(i);
-                        FReal*const targetsForcesX = inTargets->getForcesX(i);
-                        FReal*const targetsForcesY = inTargets->getForcesY(i);
-                        FReal*const targetsForcesZ = inTargets->getForcesZ(i);
-
-                        for(unsigned int j = 0 ; j < 3 ; ++j){
-                            const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues(j);
-                            const FReal*const sourcesPhysicalValues = inNeighbors[idxNeighbors]->getPhysicalValues(j);
-
-                            // update component to be applied
-                            const int d = applyTab[i*3+j];
-
-                            // forces prefactor
-                            const FReal coef = -(targetsPhysicalValues[idxTarget] * sourcesPhysicalValues[idxSource]);
 
-                            targetsForcesX[idxTarget] += dKxy[d][0] * coef;
-                            targetsForcesY[idxTarget] += dKxy[d][1] * coef;
-                            targetsForcesZ[idxTarget] += dKxy[d][2] * coef;
-                            targetsPotentials[idxTarget] += ( Kxy[d] * sourcesPhysicalValues[idxSource] );
-
-                        }// j
-                    }// i
-
-                }
-            }
-        }
-    }
-}
 
 
 template <class FReal, class ContainerClass, class MatrixKernelClass, class ComputeClass, int NbFRealInComputeClass>
@@ -425,8 +153,11 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
                 for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                     ComputeClass Kxy[1];
                     ComputeClass dKxy[3];
-                    MatrixKernel->evaluateBlockAndDerivative(sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
-                                                             tx,ty,tz,Kxy,dKxy);
+                    MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz,
+                                                             sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
+                                                             Kxy,dKxy);
+                    const ComputeClass mutual_coeff = FMath::ConvertTo<ComputeClass, FReal>(MatrixKernel->getMutualCoefficient());; // 1 if symmetric; -1 if antisymmetric
+
                     const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]);
 
                     dKxy[0] *= coef;
@@ -441,7 +172,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
                     sourcesForcesX[idxSource] -= dKxy[0];
                     sourcesForcesY[idxSource] -= dKxy[1];
                     sourcesForcesZ[idxSource] -= dKxy[2];
-                    sourcesPotentials[idxSource] += Kxy[0] * tv;
+                    sourcesPotentials[idxSource] += mutual_coeff * Kxy[0] * tv;
                 }
 
                 targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
@@ -492,8 +223,11 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
             for(FSize idxSource = (idxTarget+NbFRealInComputeClass)/NbFRealInComputeClass ; idxSource < nbParticlesSources ; ++idxSource){
                 ComputeClass Kxy[1];
                 ComputeClass dKxy[3];
-                MatrixKernel->evaluateBlockAndDerivative(sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
-                                                         tx,ty,tz,Kxy,dKxy);
+                MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz,
+                                                         sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
+                                                         Kxy,dKxy);
+                const ComputeClass mutual_coeff = FMath::ConvertTo<ComputeClass, FReal>(MatrixKernel->getMutualCoefficient()); // 1 if symmetric; -1 if antisymmetric
+
                 const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]);
 
                 dKxy[0] *= coef;
@@ -503,12 +237,12 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
                 tfx += dKxy[0];
                 tfy += dKxy[1];
                 tfz += dKxy[2];
-		tpo = FMath::FMAdd(Kxy[0],sourcesPhysicalValues[idxSource],tpo);
+        		tpo = FMath::FMAdd(Kxy[0],sourcesPhysicalValues[idxSource],tpo);
 
                 sourcesForcesX[idxSource] -= dKxy[0];
                 sourcesForcesY[idxSource] -= dKxy[1];
                 sourcesForcesZ[idxSource] -= dKxy[2];
-		sourcesPotentials[idxSource] = FMath::FMAdd(Kxy[0],tv,sourcesPotentials[idxSource]);
+        		sourcesPotentials[idxSource] = FMath::FMAdd(mutual_coeff * Kxy[0],tv,sourcesPotentials[idxSource]);
             }
 
             targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
@@ -524,9 +258,11 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
             const FSize idxSource = idxTarget + idxS;
             FReal Kxy[1];
             FReal dKxy[3];
-            MatrixKernel->evaluateBlockAndDerivative(targetsX[idxSource],targetsY[idxSource],targetsZ[idxSource],
-                                                     targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget],
+            MatrixKernel->evaluateBlockAndDerivative(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget],
+                                                     targetsX[idxSource],targetsY[idxSource],targetsZ[idxSource],
                                                      Kxy,dKxy);
+            const FReal mutual_coeff = MatrixKernel->getMutualCoefficient(); // 1 if symmetric; -1 if antisymmetric
+
             const FReal coef = (targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource]);
 
             dKxy[0] *= coef;
@@ -541,7 +277,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
             targetsForcesX[idxSource] -= dKxy[0];
             targetsForcesY[idxSource] -= dKxy[1];
             targetsForcesZ[idxSource] -= dKxy[2];
-            targetsPotentials[idxSource] += Kxy[0] * targetsPhysicalValues[idxTarget];
+            targetsPotentials[idxSource] += mutual_coeff * Kxy[0] * targetsPhysicalValues[idxTarget];
         }
     }
 }
@@ -549,6 +285,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
 template <class FReal, class ContainerClass, class MatrixKernelClass, class ComputeClass, int NbFRealInComputeClass>
 static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
                               const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
+
     const FSize nbParticlesTargets = inTargets->getNbParticles();
     const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
     const FReal*const targetsX = inTargets->getPositions()[0];
@@ -580,8 +317,9 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                 for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                     ComputeClass Kxy[1];
                     ComputeClass dKxy[3];
-                    MatrixKernel->evaluateBlockAndDerivative(sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
-                                                             tx,ty,tz,Kxy,dKxy);
+                    MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz,
+                                                             sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
+                                                             Kxy,dKxy);
                     const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]);
 
                     dKxy[0] *= coef;
@@ -775,6 +513,8 @@ struct FP2PT<float>{
 };
 #endif
 
+#include "FP2PTensorialKij.hpp"
+
 #include "FP2PMultiRhs.hpp"
 
 #endif // FP2P_HPP
diff --git a/Src/Kernels/P2P/FP2PMultiRhs.hpp b/Src/Kernels/P2P/FP2PMultiRhs.hpp
index dc45ae59beabc993de6fb5fa1cd2617c3a6d105c..bbbfccf40f89dc57f57e4402b38199f42c39ad56 100644
--- a/Src/Kernels/P2P/FP2PMultiRhs.hpp
+++ b/Src/Kernels/P2P/FP2PMultiRhs.hpp
@@ -58,8 +58,10 @@ namespace FP2P {
                     const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
                     FReal Kxy[1];
                     FReal dKxy[3];
-                    MatrixKernel->evaluateBlockAndDerivative(sourcePoint.getX(),sourcePoint.getY(),sourcePoint.getZ(),
-                                                             targetPoint.getX(),targetPoint.getY(),targetPoint.getZ(),Kxy,dKxy);
+                    MatrixKernel->evaluateBlockAndDerivative(targetPoint.getX(),targetPoint.getY(),targetPoint.getZ(),
+                                                             sourcePoint.getX(),sourcePoint.getY(),sourcePoint.getZ(),
+                                                             Kxy,dKxy);
+                    const FReal mutual_coeff = MatrixKernel->getMutualCoefficient(); // 1 if symmetric; -1 if antisymmetric
 
                     for(int idxVals = 0 ; idxVals < NVALS ; ++idxVals){
                       
@@ -76,7 +78,7 @@ namespace FP2P {
                         sourcesForcesX[idxSourceValue] -= dKxy[0] * coef;
                         sourcesForcesY[idxSourceValue] -= dKxy[1] * coef;
                         sourcesForcesZ[idxSourceValue] -= dKxy[2] * coef;
-                        sourcesPotentials[idxSourceValue] += ( Kxy[0] * targetsPhysicalValues[idxTargetValue] );
+                        sourcesPotentials[idxSourceValue] += ( mutual_coeff * Kxy[0] * targetsPhysicalValues[idxTargetValue] );
 
                     } // NVALS
 
@@ -109,8 +111,10 @@ namespace FP2P {
             const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
             FReal Kxy[1];
             FReal dKxy[3];
-            MatrixKernel->evaluateBlockAndDerivative(sourcePoint.getX(),sourcePoint.getY(),sourcePoint.getZ(),
-                                                     targetPoint.getX(),targetPoint.getY(),targetPoint.getZ(),Kxy,dKxy);
+            MatrixKernel->evaluateBlockAndDerivative(targetPoint.getX(),targetPoint.getY(),targetPoint.getZ(),
+                                                     sourcePoint.getX(),sourcePoint.getY(),sourcePoint.getZ(),
+                                                     Kxy,dKxy);
+            const FReal mutual_coeff = MatrixKernel->getMutualCoefficient(); // 1 if symmetric; -1 if antisymmetric
 
             for(int idxVals = 0 ; idxVals < NVALS ; ++idxVals){
                       
@@ -127,7 +131,7 @@ namespace FP2P {
                 targetsForcesX[idxSourceValue] -= dKxy[0] * coef;
                 targetsForcesY[idxSourceValue] -= dKxy[1] * coef;
                 targetsForcesZ[idxSourceValue] -= dKxy[2] * coef;
-                targetsPotentials[idxSourceValue] += ( Kxy[0] * targetsPhysicalValues[idxTargetValue] );
+                targetsPotentials[idxSourceValue] += ( mutual_coeff * Kxy[0] * targetsPhysicalValues[idxTargetValue] );
 
             }// NVALS
 
@@ -172,8 +176,9 @@ inline void FullRemoteMultiRhs(ContainerClass* const FRestrict inTargets, const
                     const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
                     FReal Kxy[1];
                     FReal dKxy[3];
-                    MatrixKernel->evaluateBlockAndDerivative(sourcePoint.getX(),sourcePoint.getY(),sourcePoint.getZ(),
-                                                             targetPoint.getX(),targetPoint.getY(),targetPoint.getZ(),Kxy,dKxy);
+                    MatrixKernel->evaluateBlockAndDerivative(targetPoint.getX(),targetPoint.getY(),targetPoint.getZ(),
+                                                             sourcePoint.getX(),sourcePoint.getY(),sourcePoint.getZ(),
+                                                             Kxy,dKxy);
 
                     for(int idxVals = 0 ; idxVals < NVALS ; ++idxVals){
 
diff --git a/Src/Kernels/P2P/FP2PR.hpp b/Src/Kernels/P2P/FP2PR.hpp
index 8ee64307ac0d8b531657f094c8c411500f5b5b38..da340848007e1da76b0ad48829354ee045dfe9c0 100644
--- a/Src/Kernels/P2P/FP2PR.hpp
+++ b/Src/Kernels/P2P/FP2PR.hpp
@@ -25,14 +25,13 @@
  */
 namespace FP2PR{
 template <class FReal>
-inline void MutualParticles(const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal sourcePhysicalValue,
-                            FReal* sourceForceX, FReal* sourceForceY, FReal* sourceForceZ, FReal* sourcePotential,
-                            const FReal targetX,const FReal targetY,const FReal targetZ, const FReal targetPhysicalValue,
-                            FReal* targetForceX, FReal* targetForceY, FReal* targetForceZ, FReal* targetPotential
-                            ){
-    FReal dx = sourceX - targetX;
-    FReal dy = sourceY - targetY;
-    FReal dz = sourceZ - targetZ;
+inline void MutualParticles(const FReal targetX,const FReal targetY,const FReal targetZ, const FReal targetPhysicalValue,
+                            FReal* targetForceX, FReal* targetForceY, FReal* targetForceZ, FReal* targetPotential,
+                            const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal sourcePhysicalValue,
+                            FReal* sourceForceX, FReal* sourceForceY, FReal* sourceForceZ, FReal* sourcePotential){
+    FReal dx = targetX - sourceX;
+    FReal dy = targetY - sourceY;
+    FReal dz = targetZ - sourceZ;
 
     FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
     FReal inv_distance = FMath::Sqrt(inv_square_distance);
@@ -40,9 +39,9 @@ inline void MutualParticles(const FReal sourceX,const FReal sourceY,const FReal
     inv_square_distance *= inv_distance;
     inv_square_distance *= targetPhysicalValue * sourcePhysicalValue;
 
-    dx *= inv_square_distance;
-    dy *= inv_square_distance;
-    dz *= inv_square_distance;
+    dx *= - inv_square_distance;
+    dy *= - inv_square_distance;
+    dz *= - inv_square_distance;
 
     *targetForceX += dx;
     *targetForceY += dy;
@@ -56,12 +55,12 @@ inline void MutualParticles(const FReal sourceX,const FReal sourceY,const FReal
 }
 
 template <class FReal>
-inline void NonMutualParticles(const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal sourcePhysicalValue,
-                               const FReal targetX,const FReal targetY,const FReal targetZ, const FReal targetPhysicalValue,
-                               FReal* targetForceX, FReal* targetForceY, FReal* targetForceZ, FReal* targetPotential){
-    FReal dx = sourceX - targetX;
-    FReal dy = sourceY - targetY;
-    FReal dz = sourceZ - targetZ;
+inline void NonMutualParticles(const FReal targetX,const FReal targetY,const FReal targetZ, const FReal targetPhysicalValue,
+                               FReal* targetForceX, FReal* targetForceY, FReal* targetForceZ, FReal* targetPotential, 
+                               const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal sourcePhysicalValue){
+    FReal dx = targetX - sourceX;
+    FReal dy = targetY - sourceY;
+    FReal dz = targetZ - sourceZ;
 
     FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
     FReal inv_distance = FMath::Sqrt(inv_square_distance);
@@ -69,9 +68,10 @@ inline void NonMutualParticles(const FReal sourceX,const FReal sourceY,const FRe
     inv_square_distance *= inv_distance;
     inv_square_distance *= targetPhysicalValue * sourcePhysicalValue;
 
-    dx *= inv_square_distance;
-    dy *= inv_square_distance;
-    dz *= inv_square_distance;
+    // d/dx(1/|x-y|)=-(x-y)/r^3
+    dx *= - inv_square_distance;
+    dy *= - inv_square_distance;
+    dz *= - inv_square_distance;
 
     *targetForceX += dx;
     *targetForceY += dy;
@@ -119,9 +119,9 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
                 ComputeClass  tpo = FMath::Zero<ComputeClass>();
 
                 for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
-                    ComputeClass dx = sourcesX[idxSource] - tx;
-                    ComputeClass dy = sourcesY[idxSource] - ty;
-                    ComputeClass dz = sourcesZ[idxSource] - tz;
+                    ComputeClass dx = tx - sourcesX[idxSource];
+                    ComputeClass dy = ty - sourcesY[idxSource];
+                    ComputeClass dz = tz - sourcesZ[idxSource];
 
                     ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
                     const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
@@ -129,9 +129,9 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
                     inv_square_distance *= inv_distance;
                     inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
 
-                    dx *= inv_square_distance;
-                    dy *= inv_square_distance;
-                    dz *= inv_square_distance;
+                    dx *= - inv_square_distance;
+                    dy *= - inv_square_distance;
+                    dz *= - inv_square_distance;
 
                     tfx += dx;
                     tfy += dy;
@@ -193,18 +193,18 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
 
             for(FSize idxSource = (idxTarget+NbFRealInComputeClass)/NbFRealInComputeClass ; idxSource < nbParticlesSources ; ++idxSource){
 
-                ComputeClass dx = sourcesX[idxSource] - tx;
-                ComputeClass dy = sourcesY[idxSource] - ty;
-                ComputeClass dz = sourcesZ[idxSource] - tz;
+                ComputeClass dx = tx - sourcesX[idxSource];
+                ComputeClass dy = ty - sourcesY[idxSource];
+                ComputeClass dz = tz - sourcesZ[idxSource];
                 ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
                 const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
 
                 inv_square_distance *= inv_distance;
                 inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
 
-                dx *= inv_square_distance;
-                dy *= inv_square_distance;
-                dz *= inv_square_distance;
+                dx *= - inv_square_distance;
+                dy *= - inv_square_distance;
+                dz *= - inv_square_distance;
 
                 tfx += dx;
                 tfy += dy;
@@ -228,9 +228,9 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
         const FSize limitForTarget = NbFRealInComputeClass-(idxTarget%NbFRealInComputeClass);
         for(FSize idxS = 1 ; idxS < limitForTarget ; ++idxS){
             const FSize idxSource = idxTarget + idxS;
-            FReal dx = targetsX[idxSource] - targetsX[idxTarget];
-            FReal dy = targetsY[idxSource] - targetsY[idxTarget];
-            FReal dz = targetsZ[idxSource] - targetsZ[idxTarget];
+            FReal dx = targetsX[idxTarget] - targetsX[idxSource];
+            FReal dy = targetsY[idxTarget] - targetsY[idxSource];
+            FReal dz = targetsZ[idxTarget] - targetsZ[idxSource];
 
             FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
             const FReal inv_distance = FMath::Sqrt(inv_square_distance);
@@ -238,9 +238,9 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
             inv_square_distance *= inv_distance;
             inv_square_distance *= targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource];
 
-            dx *= inv_square_distance;
-            dy *= inv_square_distance;
-            dz *= inv_square_distance;
+            dx *= - inv_square_distance;
+            dy *= - inv_square_distance;
+            dz *= - inv_square_distance;
 
             targetsForcesX[idxTarget] += dx;
             targetsForcesY[idxTarget] += dy;
@@ -289,9 +289,9 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                 ComputeClass  tpo = FMath::Zero<ComputeClass>();
 
                 for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
-                    ComputeClass dx = sourcesX[idxSource] - tx;
-                    ComputeClass dy = sourcesY[idxSource] - ty;
-                    ComputeClass dz = sourcesZ[idxSource] - tz;
+                    ComputeClass dx = tx - sourcesX[idxSource];
+                    ComputeClass dy = ty - sourcesY[idxSource];
+                    ComputeClass dz = tz - sourcesZ[idxSource];
 
                     ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
                     const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
@@ -299,9 +299,9 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                     inv_square_distance *= inv_distance;
                     inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
 
-                    dx *= inv_square_distance;
-                    dy *= inv_square_distance;
-                    dz *= inv_square_distance;
+                    dx *= - inv_square_distance;
+                    dy *= - inv_square_distance;
+                    dz *= - inv_square_distance;
 
                     tfx += dx;
                     tfy += dy;
diff --git a/Src/Kernels/P2P/FP2PTensorialKij.hpp b/Src/Kernels/P2P/FP2PTensorialKij.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ffb59fcc3c73344b8a0fce07e623598831f31d56
--- /dev/null
+++ b/Src/Kernels/P2P/FP2PTensorialKij.hpp
@@ -0,0 +1,305 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+#ifndef FP2P_TENSORIALKIJ_HPP
+#define FP2P_TENSORIALKIJ_HPP
+
+namespace FP2P {
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Tensorial Matrix Kernels: K_IJ / p_i=\sum_j K_{ij} w_j
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+   * @brief MutualParticlesKIJ
+   * @param sourceX
+   * @param sourceY
+   * @param sourceZ
+   * @param sourcePhysicalValue
+   * @param sourceForceX
+   * @param sourceForceY
+   * @param sourceForceZ
+   * @param sourcePotential
+   * @param targetX
+   * @param targetY
+   * @param targetZ
+   * @param targetPhysicalValue
+   * @param targetForceX
+   * @param targetForceY
+   * @param targetForceZ
+   * @param targetPotential
+   */
+template<class FReal, typename MatrixKernelClass>
+inline void MutualParticlesKIJ(const FReal targetX,const FReal targetY,const FReal targetZ, const FReal* targetPhysicalValue,
+                               FReal* targetForceX, FReal* targetForceY, FReal* targetForceZ, FReal* targetPotential,
+                               const FReal sourceX,const FReal sourceY,const FReal sourceZ, const FReal* sourcePhysicalValue,
+                               FReal* sourceForceX, FReal* sourceForceY, FReal* sourceForceZ, FReal* sourcePotential,
+                               const MatrixKernelClass *const MatrixKernel){
+
+    // get information on tensorial aspect of matrix kernel
+    const int ncmp = MatrixKernelClass::NCMP;    
+    const int npv = MatrixKernelClass::NPV;    
+    const int npot = MatrixKernelClass::NPOT;    
+
+    // evaluate kernel and its partial derivatives
+    const FPoint<FReal> sourcePoint(sourceX,sourceY,sourceZ);
+    const FPoint<FReal> targetPoint(targetX,targetY,targetZ);
+    FReal Kxy[ncmp];
+    FReal dKxy[ncmp][3];
+    MatrixKernel->evaluateBlockAndDerivative(targetPoint,sourcePoint,Kxy,dKxy);
+    const FReal mutual_coeff = MatrixKernel->getMutualCoefficient(); // 1 if symmetric; -1 if antisymmetric
+
+    for(unsigned int i = 0 ; i < npot ; ++i){
+        for(unsigned int j = 0 ; j < npv ; ++j){
+
+            // update component to be applied
+            const int d = MatrixKernel->getPosition(i*npv+j);
+
+            // forces prefactor
+            const FReal coef = (targetPhysicalValue[j] * sourcePhysicalValue[j]);
+
+            targetForceX[i] += dKxy[d][0] * coef;
+            targetForceY[i] += dKxy[d][1] * coef;
+            targetForceZ[i] += dKxy[d][2] * coef;
+            targetPotential[i] += ( Kxy[d] * sourcePhysicalValue[j] );
+
+            sourceForceX[i] -= dKxy[d][0] * coef;
+            sourceForceY[i] -= dKxy[d][1] * coef;
+            sourceForceZ[i] -= dKxy[d][2] * coef;
+            sourcePotential[i] += ( mutual_coeff * Kxy[d] * targetPhysicalValue[j] );
+
+        }// j
+    }// i
+
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Tensorial Matrix Kernels: K_IJ
+// TODO: Implement SSE and AVX variants then move following FullMutualKIJ and FullRemoteKIJ to FP2P.hpp
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @brief FullMutualKIJ
+ */
+template <class FReal, class ContainerClass, typename MatrixKernelClass>
+inline void FullMutualKIJ(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
+                          const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
+
+    // get information on tensorial aspect of matrix kernel
+    const int ncmp = MatrixKernelClass::NCMP;
+    const int npv = MatrixKernelClass::NPV;    
+    const int npot = MatrixKernelClass::NPOT;
+
+    // get info on targets
+    const FSize nbParticlesTargets = inTargets->getNbParticles();
+    const FReal*const targetsX = inTargets->getPositions()[0];
+    const FReal*const targetsY = inTargets->getPositions()[1];
+    const FReal*const targetsZ = inTargets->getPositions()[2];
+
+    for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
+        for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
+            if( inNeighbors[idxNeighbors] ){
+                const FSize nbParticlesSources = inNeighbors[idxNeighbors]->getNbParticles();
+                const FReal*const sourcesX = inNeighbors[idxNeighbors]->getPositions()[0];
+                const FReal*const sourcesY = inNeighbors[idxNeighbors]->getPositions()[1];
+                const FReal*const sourcesZ = inNeighbors[idxNeighbors]->getPositions()[2];
+
+                for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
+
+                    // evaluate kernel and its partial derivatives
+                    const FPoint<FReal> sourcePoint(sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource]);
+                    const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
+                    FReal Kxy[ncmp];
+                    FReal dKxy[ncmp][3];
+                    MatrixKernel->evaluateBlockAndDerivative(targetPoint,sourcePoint,Kxy,dKxy);
+                    const FReal mutual_coeff = MatrixKernel->getMutualCoefficient(); // 1 if symmetric; -1 if antisymmetric
+
+                    for(unsigned int i = 0 ; i < npot ; ++i){
+                        FReal*const targetsPotentials = inTargets->getPotentials(i);
+                        FReal*const targetsForcesX = inTargets->getForcesX(i);
+                        FReal*const targetsForcesY = inTargets->getForcesY(i);
+                        FReal*const targetsForcesZ = inTargets->getForcesZ(i);
+                        FReal*const sourcesPotentials = inNeighbors[idxNeighbors]->getPotentials(i);
+                        FReal*const sourcesForcesX = inNeighbors[idxNeighbors]->getForcesX(i);
+                        FReal*const sourcesForcesY = inNeighbors[idxNeighbors]->getForcesY(i);
+                        FReal*const sourcesForcesZ = inNeighbors[idxNeighbors]->getForcesZ(i);
+
+                        for(unsigned int j = 0 ; j < npv ; ++j){
+                            const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues(j);
+                            const FReal*const sourcesPhysicalValues = inNeighbors[idxNeighbors]->getPhysicalValues(j);
+
+                            // update component to be applied
+                            const int d = MatrixKernel->getPosition(i*npv+j);
+
+                            // forces prefactor
+                            FReal coef = (targetsPhysicalValues[idxTarget] * sourcesPhysicalValues[idxSource]);
+
+                            targetsForcesX[idxTarget] += dKxy[d][0] * coef;
+                            targetsForcesY[idxTarget] += dKxy[d][1] * coef;
+                            targetsForcesZ[idxTarget] += dKxy[d][2] * coef;
+                            targetsPotentials[idxTarget] += ( Kxy[d] * sourcesPhysicalValues[idxSource] );
+
+                            sourcesForcesX[idxSource] -= dKxy[d][0] * coef;
+                            sourcesForcesY[idxSource] -= dKxy[d][1] * coef;
+                            sourcesForcesZ[idxSource] -= dKxy[d][2] * coef;
+                            sourcesPotentials[idxSource] += mutual_coeff * Kxy[d] * targetsPhysicalValues[idxTarget];
+
+                        }// j
+                    }// i
+                }
+            }
+        }
+    }
+}
+
+template <class FReal, class ContainerClass, typename MatrixKernelClass>
+inline void InnerKIJ(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
+
+    // get information on tensorial aspect of matrix kernel
+    const int ncmp = MatrixKernelClass::NCMP;
+    const int npv = MatrixKernelClass::NPV;    
+    const int npot = MatrixKernelClass::NPOT;
+
+    // get info on targets
+    const FSize nbParticlesTargets = inTargets->getNbParticles();
+    const FReal*const targetsX = inTargets->getPositions()[0];
+    const FReal*const targetsY = inTargets->getPositions()[1];
+    const FReal*const targetsZ = inTargets->getPositions()[2];
+
+    for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
+        for(FSize idxSource = idxTarget + 1 ; idxSource < nbParticlesTargets ; ++idxSource){
+
+            // evaluate kernel and its partial derivatives
+            const FPoint<FReal> sourcePoint(targetsX[idxSource],targetsY[idxSource],targetsZ[idxSource]);
+            const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
+            FReal Kxy[ncmp];
+            FReal dKxy[ncmp][3];
+            MatrixKernel->evaluateBlockAndDerivative(targetPoint,sourcePoint,Kxy,dKxy);
+            const FReal mutual_coeff = MatrixKernel->getMutualCoefficient(); // 1 if symmetric; -1 if antisymmetric
+
+            for(unsigned int i = 0 ; i < npot ; ++i){
+                FReal*const targetsPotentials = inTargets->getPotentials(i);
+                FReal*const targetsForcesX = inTargets->getForcesX(i);
+                FReal*const targetsForcesY = inTargets->getForcesY(i);
+                FReal*const targetsForcesZ = inTargets->getForcesZ(i);
+
+                for(unsigned int j = 0 ; j < npv ; ++j){
+                    const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues(j);
+
+                    // update component to be applied
+                    const int d = MatrixKernel->getPosition(i*npv+j);
+
+                    // forces prefactor
+                    const FReal coef = (targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource]);
+
+                    targetsForcesX[idxTarget] += dKxy[d][0] * coef;
+                    targetsForcesY[idxTarget] += dKxy[d][1] * coef;
+                    targetsForcesZ[idxTarget] += dKxy[d][2] * coef;
+                    targetsPotentials[idxTarget] += ( Kxy[d] * targetsPhysicalValues[idxSource] );
+
+                    targetsForcesX[idxSource] -= dKxy[d][0] * coef;
+                    targetsForcesY[idxSource] -= dKxy[d][1] * coef;
+                    targetsForcesZ[idxSource] -= dKxy[d][2] * coef;
+                    targetsPotentials[idxSource] += mutual_coeff * Kxy[d] * targetsPhysicalValues[idxTarget];
+                }// j
+            }// i
+
+        }
+    }
+}
+
+/**
+   * @brief FullRemoteKIJ
+   */
+template <class FReal, class ContainerClass, typename MatrixKernelClass>
+inline void FullRemoteKIJ(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
+                          const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
+
+    // get information on tensorial aspect of matrix kernel
+    const int ncmp = MatrixKernelClass::NCMP;
+    const int npv = MatrixKernelClass::NPV;    
+    const int npot = MatrixKernelClass::NPOT;
+
+    // get info on targets
+    const FSize nbParticlesTargets = inTargets->getNbParticles();
+    const FReal*const targetsX = inTargets->getPositions()[0];
+    const FReal*const targetsY = inTargets->getPositions()[1];
+    const FReal*const targetsZ = inTargets->getPositions()[2];
+
+    for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
+        for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
+            if( inNeighbors[idxNeighbors] ){
+                const FSize nbParticlesSources = inNeighbors[idxNeighbors]->getNbParticles();
+                const FReal*const sourcesX = inNeighbors[idxNeighbors]->getPositions()[0];
+                const FReal*const sourcesY = inNeighbors[idxNeighbors]->getPositions()[1];
+                const FReal*const sourcesZ = inNeighbors[idxNeighbors]->getPositions()[2];
+
+                for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
+
+                    // evaluate kernel and its partial derivatives
+                    const FPoint<FReal> sourcePoint(sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource]);
+                    const FPoint<FReal> targetPoint(targetsX[idxTarget],targetsY[idxTarget],targetsZ[idxTarget]);
+                    FReal Kxy[ncmp];
+                    FReal dKxy[ncmp][3];
+                    MatrixKernel->evaluateBlockAndDerivative(targetPoint,sourcePoint,Kxy,dKxy);
+
+                    for(unsigned int i = 0 ; i < npot ; ++i){
+                        FReal*const targetsPotentials = inTargets->getPotentials(i);
+                        FReal*const targetsForcesX = inTargets->getForcesX(i);
+                        FReal*const targetsForcesY = inTargets->getForcesY(i);
+                        FReal*const targetsForcesZ = inTargets->getForcesZ(i);
+
+                        for(unsigned int j = 0 ; j < npv ; ++j){
+                            const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues(j);
+                            const FReal*const sourcesPhysicalValues = inNeighbors[idxNeighbors]->getPhysicalValues(j);
+
+                            // update component to be applied
+                            const int d = MatrixKernel->getPosition(i*npv+j);
+
+                            // forces prefactor
+                            const FReal coef = (targetsPhysicalValues[idxTarget] * sourcesPhysicalValues[idxSource]);
+
+                            targetsForcesX[idxTarget] += dKxy[d][0] * coef;
+                            targetsForcesY[idxTarget] += dKxy[d][1] * coef;
+                            targetsForcesZ[idxTarget] += dKxy[d][2] * coef;
+                            targetsPotentials[idxTarget] += ( Kxy[d] * sourcesPhysicalValues[idxSource] );
+
+                        }// j
+                    }// i
+
+                }
+            }
+        }
+    }
+}
+
+
+
+} // End namespace
+
+
+#endif // FP2P_TENSORIALKIJ_HPP
diff --git a/Src/Kernels/Uniform/FUnifTensorialKernel.hpp b/Src/Kernels/Uniform/FUnifTensorialKernel.hpp
index 87fd28ec05cb55c618295ae033534b5af9075bf7..2a9e3c466e8c14482cbe22258188afee60998df3 100644
--- a/Src/Kernels/Uniform/FUnifTensorialKernel.hpp
+++ b/Src/Kernels/Uniform/FUnifTensorialKernel.hpp
@@ -253,15 +253,23 @@ public:
              ContainerClass* const FRestrict inTargets, const ContainerClass* const FRestrict inSources,
              ContainerClass* const inNeighbors[], const int neighborPositions[],
              const int inSize) override {
-        if(inTargets == inSources){
-            P2POuter(inPosition, inTargets, inNeighbors, neighborPositions, inSize);
-            DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PInner(inTargets,MatrixKernel);
+        // Standard FMM separation criterion, i.e. max 27 neighbor clusters per leaf
+        if(LeafLevelSeparationCriterion==1) {
+            if(inTargets == inSources){
+                P2POuter(inPosition, inTargets, inNeighbors, neighborPositions, inSize);
+                DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PInner(inTargets,MatrixKernel);
+            }
+            else{
+                const ContainerClass* const srcPtr[1] = {inSources};
+                DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PRemote(inTargets,srcPtr,1,MatrixKernel);
+                DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PRemote(inTargets,inNeighbors,inSize,MatrixKernel);
+            }
         }
-        else{
-            const ContainerClass* const srcPtr[1] = {inSources};
-            DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PRemote(inTargets,srcPtr,1,MatrixKernel);
-            DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PRemote(inTargets,inNeighbors,inSize,MatrixKernel);
+        // Nearfield interactions are only computed within the target leaf
+        else if(LeafLevelSeparationCriterion==0){
+            DirectInteractionComputer<FReal,MatrixKernelClass::NCMP, NVALS>::P2PRemote(inTargets,inNeighbors,inSize,MatrixKernel);
         }
+        // If criterion equals -1 then no P2P need to be performed.
     }
 
     void P2POuter(const FTreeCoordinate& /*inLeafPosition*/,
@@ -281,7 +289,13 @@ public:
                    ContainerClass* const FRestrict inTargets, const ContainerClass* const FRestrict /*inSources*/,
                    const ContainerClass* const inNeighbors[], const int /*neighborPositions*/[],
                    const int inSize) override {
-        DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PRemote(inTargets,inNeighbors,inSize,MatrixKernel);
+        // Standard FMM separation criterion, i.e. max 27 neighbor clusters per leaf
+        if(LeafLevelSeparationCriterion==1) 
+            DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PRemote(inTargets,inNeighbors,inSize,MatrixKernel);
+        // Nearfield interactions are only computed within the target leaf
+        if(LeafLevelSeparationCriterion==0) 
+            DirectInteractionComputer<FReal, MatrixKernelClass::NCMP, NVALS>::P2PRemote(inTargets,inNeighbors,0,MatrixKernel);
+        // If criterion equals -1 then no P2P need to be performed.        
     }
 
 };
diff --git a/Tests/Kernels/DirectAlgorithm.cpp b/Tests/Kernels/DirectAlgorithm.cpp
index 9680aaff3922061961bd95323480ec3c3919e6cb..07996bb0951d225bb68007c9e07c5c1abfa602b3 100644
--- a/Tests/Kernels/DirectAlgorithm.cpp
+++ b/Tests/Kernels/DirectAlgorithm.cpp
@@ -226,12 +226,14 @@ int main(int argc, char ** argv){
 				for(int idxOther = 0; idxOther < nbParticles ; ++idxOther){
 					if( idxOther != idxTarget ){
 						FP2P::NonMutualParticles(
-								particles[idxOther].position.getX(), particles[idxOther].position.getY(),
-								particles[idxOther].position.getZ(),particles[idxOther].physicalValue,
 								particlesDirect[idxTarget].position.getX(), particlesDirect[idxTarget].position.getY(),
-								particlesDirect[idxTarget].position.getZ(),particlesDirect[idxTarget].physicalValue,
-								&particlesDirect[idxTarget].forces[0],&particlesDirect[idxTarget].forces[1],
-								&particlesDirect[idxTarget].forces[2],&particlesDirect[idxTarget].potential, &MatrixKernel);
+                                particlesDirect[idxTarget].position.getZ(),particlesDirect[idxTarget].physicalValue,
+                                &particlesDirect[idxTarget].forces[0],&particlesDirect[idxTarget].forces[1],
+                                &particlesDirect[idxTarget].forces[2],&particlesDirect[idxTarget].potential, 
+                                particles[idxOther].position.getX(), particles[idxOther].position.getY(),
+								particles[idxOther].position.getZ(),particles[idxOther].physicalValue,
+                                &MatrixKernel
+                        );
 					}
 				}
 				//
@@ -251,9 +253,10 @@ int main(int argc, char ** argv){
                                     MDParticle<FReal> source = particles[idxSource];
 									source.position += offset;
 									FP2P::NonMutualParticles(
-											source.position.getX(), source.position.getY(),source.position.getZ(),source.physicalValue,
 											particlesDirect[idxTarget].position.getX(), particlesDirect[idxTarget].position.getY(),particlesDirect[idxTarget].position.getZ(),particlesDirect[idxTarget].physicalValue,
-											&particlesDirect[idxTarget].forces[0],&particlesDirect[idxTarget].forces[1],&particlesDirect[idxTarget].forces[2],&particlesDirect[idxTarget].potential, &MatrixKernel
+                                            &particlesDirect[idxTarget].forces[0],&particlesDirect[idxTarget].forces[1],&particlesDirect[idxTarget].forces[2],&particlesDirect[idxTarget].potential,
+                                            source.position.getX(), source.position.getY(),source.position.getZ(),source.physicalValue,
+                                            &MatrixKernel
 									);
 								}
 							}
diff --git a/Tests/Kernels/testChebTensorialAlgorithm.cpp b/Tests/Kernels/testChebTensorialAlgorithm.cpp
index 5dc9cccf8cff0ee95f940ba5c361ac3c94e2828a..c742ed0e6c2a5d5dbe6c344e6bd6b7bcbcdfc611 100644
--- a/Tests/Kernels/testChebTensorialAlgorithm.cpp
+++ b/Tests/Kernels/testChebTensorialAlgorithm.cpp
@@ -27,7 +27,7 @@
 #include "Files/FFmaGenericLoader.hpp"
 
 
-#include "Kernels/Interpolation/FInterpMatrixKernel.hpp"
+#include "Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp"
 #include "Kernels/Chebyshev/FChebCell.hpp"
 #include "Kernels/Chebyshev/FChebTensorialKernel.hpp"
 
@@ -72,25 +72,30 @@ int main(int argc, char* argv[])
     // init timer
     FTic time;
 
-
-  // typedefs
-  typedef FInterpMatrixKernel_R_IJ<FReal> MatrixKernelClass;
-
-  const unsigned int NPV  = MatrixKernelClass::NPV;
-  const unsigned int NPOT = MatrixKernelClass::NPOT;
-  const unsigned int NRHS = MatrixKernelClass::NRHS;
-  const unsigned int NLHS = MatrixKernelClass::NLHS;
-
-  const double CoreWidth = 0.1;
-  const MatrixKernelClass MatrixKernel(CoreWidth);
-
-  // init particles position and physical value
-  struct TestParticle{
-    FPoint<FReal> position;
-    FReal forces[3][NPOT];
-    FReal physicalValue[NPV];
-    FReal potential[NPOT];
-  };
+    // typedefs and infos
+    typedef FInterpMatrixKernel_R_IJ<FReal> MatrixKernelClass;
+    MatrixKernelClass::printInfo();
+
+    // useful features of matrix kernel
+    const unsigned int NPV  = MatrixKernelClass::NPV;
+    const unsigned int NPOT = MatrixKernelClass::NPOT;
+    const unsigned int NRHS = MatrixKernelClass::NRHS;
+    const unsigned int NLHS = MatrixKernelClass::NLHS;
+
+    const FReal CoreWidth = 0.;
+    std::cout << "Core width: a=" << CoreWidth << std::endl;
+    std::cout << std::endl;
+
+    // Build matrix kernel
+    const MatrixKernelClass MatrixKernel(CoreWidth);
+
+    // init particles position and physical value
+    struct TestParticle{
+        FPoint<FReal> position;
+        FReal forces[3][NPOT];
+        FReal physicalValue[NPV];
+        FReal potential[NPOT];
+    };
 
   // open particle file
   FFmaGenericLoader<FReal> loader(filename);
diff --git a/Tests/Kernels/testSphericalBench.cpp b/Tests/Kernels/testSphericalBench.cpp
index 0f6c17f9dc3667ec636b1f77e103672a6e2703d0..5330b53653f309fff87799f093335b184cbe122a 100644
--- a/Tests/Kernels/testSphericalBench.cpp
+++ b/Tests/Kernels/testSphericalBench.cpp
@@ -102,9 +102,9 @@ void doATest(const FSize NbParticles, const int minP, const int maxP, const int
                                           particles[idxTarget].position.getZ(),particles[idxTarget].physicalValue,
                                           &particles[idxTarget].forces[0],&particles[idxTarget].forces[1],
                                           &particles[idxTarget].forces[2],&particles[idxTarget].potential,
-                                    particles[idxOther].position.getX(), particles[idxOther].position.getY(),
-                                    particles[idxOther].position.getZ(),particles[idxOther].physicalValue,
-                                    &particles[idxOther].forces[0],&particles[idxOther].forces[1],
+                                          particles[idxOther].position.getX(), particles[idxOther].position.getY(),
+                                          particles[idxOther].position.getZ(),particles[idxOther].physicalValue,
+                                          &particles[idxOther].forces[0],&particles[idxOther].forces[1],
                                           &particles[idxOther].forces[2],&particles[idxOther].potential);
                 }
             }
@@ -311,9 +311,9 @@ int main(int argc, char ** argv){
                                       centeredParticle.position.getZ(),centeredParticle.physicalValue,
                                       &centeredParticle.forces[0],&centeredParticle.forces[1],
                                       &centeredParticle.forces[2],&centeredParticle.potential,
-                                otherParticle.position.getX(), otherParticle.position.getY(),
-                                otherParticle.position.getZ(),otherParticle.physicalValue,
-                                &otherParticle.forces[0],&otherParticle.forces[1],
+                                      otherParticle.position.getX(), otherParticle.position.getY(),
+                                      otherParticle.position.getZ(),otherParticle.physicalValue,
+                                      &otherParticle.forces[0],&otherParticle.forces[1],
                                       &otherParticle.forces[2],&otherParticle.potential);
 
                 { // Check that each particle has been summed with all other
diff --git a/Tests/Kernels/testSphericalDlpolyAlgorithm.cpp b/Tests/Kernels/testSphericalDlpolyAlgorithm.cpp
index 4fdd46c2f3e3ef50b47b21e38097f441da890226..bf6b996198fc5b8841d0497f325e6073ae04573d 100644
--- a/Tests/Kernels/testSphericalDlpolyAlgorithm.cpp
+++ b/Tests/Kernels/testSphericalDlpolyAlgorithm.cpp
@@ -310,12 +310,13 @@ int main(int argc, char ** argv){
 			for(FSize idxOther = 0; idxOther < loader->getNumberOfParticles() ; ++idxOther){
 				if( idxOther != idxTarget ){
 					FP2P::NonMutualParticles(
-							particles[idxOther].position.getX(), particles[idxOther].position.getY(),
-							particles[idxOther].position.getZ(),particles[idxOther].physicalValue,
 							part.position.getX(), part.position.getY(),
-							part.position.getZ(),part.physicalValue,
-							&part.forces[0],&part.forces[1],
-							&part.forces[2],&part.potential,&MatrixKernel);
+                            part.position.getZ(),part.physicalValue,
+                            &part.forces[0],&part.forces[1],
+                            &part.forces[2],&part.potential,
+                            particles[idxOther].position.getX(), particles[idxOther].position.getY(),
+							particles[idxOther].position.getZ(),particles[idxOther].physicalValue,
+                            &MatrixKernel);
 				}
 			}
 			//
@@ -341,10 +342,10 @@ int main(int argc, char ** argv){
 								source.position += offset;
 								//								std::cout << "Part "<<idxSource<< " " <<source.position.getX()<< " " << source.position.getY()<< " " <<source.position.getZ()<< " " <<source.physicalValue <<std::endl ;
 								FP2P::NonMutualParticles(
-										source.position.getX(), source.position.getY(),source.position.getZ(),source.physicalValue,
 										part.position.getX(), part.position.getY(),part.position.getZ(),part.physicalValue,
-										&part.forces[0],&part.forces[1],&part.forces[2],&part.potential,&MatrixKernel
-								);
+                                        &part.forces[0],&part.forces[1],&part.forces[2],&part.potential,
+                                        source.position.getX(), source.position.getY(),source.position.getZ(),source.physicalValue,
+										&MatrixKernel);
 							}
 							//						std::cout <<std::endl<<std::endl<<std::endl;
 						}
diff --git a/Tests/Kernels/testUnifTensorialAlgorithm.cpp b/Tests/Kernels/testUnifTensorialAlgorithm.cpp
index 99e5e678fbd95511fe5c052150bde8956a600957..e72ae7c01c930470211f52e7c6d0c50239ee19ba 100644
--- a/Tests/Kernels/testUnifTensorialAlgorithm.cpp
+++ b/Tests/Kernels/testUnifTensorialAlgorithm.cpp
@@ -30,7 +30,7 @@
 #include "Files/FFmaGenericLoader.hpp"
 
 
-#include "Kernels/Interpolation/FInterpMatrixKernel.hpp"
+#include "Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp"
 #include "Kernels/Uniform/FUnifCell.hpp"
 #include "Kernels/Uniform/FUnifTensorialKernel.hpp"
 
@@ -46,7 +46,10 @@
 #include "Core/FFmmAlgorithm.hpp"
 #include "Core/FFmmAlgorithmThread.hpp"
 
-#include "../../Src/Utils/FParameterNames.hpp"
+#include "Utils/FParameterNames.hpp"
+
+// For std::array<> (i.e. for Tensorial kernels purpose)
+#include <array>
 
 /**
  * This program runs the FMM Algorithm with the Uniform kernel and compares the results with a direct computation.
@@ -77,8 +80,9 @@ int main(int argc, char* argv[])
      FTic time;
 
 
-    // typedefs
+    // typedefs and infos
     typedef FInterpMatrixKernel_R_IJ<FReal> MatrixKernelClass;
+    MatrixKernelClass::printInfo();
 
     // useful features of matrix kernel
     const unsigned int NPV  = MatrixKernelClass::NPV;
@@ -86,7 +90,11 @@ int main(int argc, char* argv[])
     const unsigned int NRHS = MatrixKernelClass::NRHS;
     const unsigned int NLHS = MatrixKernelClass::NLHS;
 
-    const FReal CoreWidth = 0.1;
+    const FReal CoreWidth = 0.;
+    std::cout << "Core width: a=" << CoreWidth << std::endl;
+    std::cout << std::endl;
+
+    // Build matrix kernel
     const MatrixKernelClass MatrixKernel(CoreWidth);
 
     // init particles position and physical value
@@ -172,12 +180,14 @@ int main(int argc, char* argv[])
         if(MatrixKernelClass::Type==HOMOGENEOUS && BoxWidthExtension>0.)
             throw std::runtime_error("Extension of box width is not yet supported for homogeneous kernels! Work-around: artificially set Type to NON_HOMOGENEOUS.");
 
-        typedef FP2PParticleContainerIndexed<FReal,NRHS,NLHS> ContainerClass;
+        const unsigned int NVALS = 1;
+
+        typedef FP2PParticleContainerIndexed<FReal,NRHS,NLHS,NVALS> ContainerClass;
 
         typedef FSimpleLeaf<FReal, ContainerClass >  LeafClass;
-        typedef FUnifCell<FReal,ORDER,NRHS,NLHS> CellClass;
+        typedef FUnifCell<FReal,ORDER,NRHS,NLHS,NVALS> CellClass;
         typedef FOctree<FReal, CellClass,ContainerClass,LeafClass> OctreeClass;
-        typedef FUnifTensorialKernel<FReal,CellClass,ContainerClass,MatrixKernelClass,ORDER> KernelClass;
+        typedef FUnifTensorialKernel<FReal,CellClass,ContainerClass,MatrixKernelClass,ORDER,NVALS> KernelClass;
         typedef FFmmAlgorithm<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
         //  typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
 
@@ -201,6 +211,22 @@ int main(int argc, char* argv[])
                                 particles[idxPart].physicalValue[0], particles[idxPart].physicalValue[1], particles[idxPart].physicalValue[2]);
                 else
                     std::runtime_error("NPV not yet supported in test! Add new case.");
+
+                // 
+                // [TODO] Fix insertion of multiple physical values using std::array !!
+                //
+                //// Convert FReal[NVALS] to std::array<FReal,NVALS>
+                //std::array<FReal, (NPV+4*NPOT)*NVALS> physicalState;
+                //for(int idxVals = 0 ; idxVals < NVALS ; ++idxVals){
+                //    for(int idxPV = 0 ; idxPV < NPV ; ++idxPV)
+                //        physicalState[idxPV*NVALS+idxVals]=particles[idxPart].physicalValue[idxPV];
+                //    physicalState[(NPV+0)*NVALS+idxVals]=0.0;
+                //    physicalState[(NPV+1)*NVALS+idxVals]=0.0;
+                //    physicalState[(NPV+2)*NVALS+idxVals]=0.0;
+                //    physicalState[(NPV+3)*NVALS+idxVals]=0.0;
+                //}
+                //tree.insert(particles[idxPart].position, idxPart,physicalState);
+
             }
 
             time.tac();
@@ -245,6 +271,8 @@ int main(int argc, char* argv[])
                             //PB: store potential in array[nbParticles]
                             checkPotential[indexPartOrig][idxPot]=potentials[idxPart];
                             checkfx[indexPartOrig][idxPot]=forcesX[idxPart];
+                            //if(idxPart<10)
+                            //    std::cout << "  FMM potentials[idxPartOrigin="<< indexPartOrig <<"]=" << potentials[idxPart] << "  DIRECT potentials[idxPartOrigin="<< indexPartOrig <<"]=" << particles[indexPartOrig].potential[idxPot] << std::endl;
 
                             potentialDiff[idxPot].add(particles[indexPartOrig].potential[idxPot],potentials[idxPart]);
                             fx[idxPot].add(particles[indexPartOrig].forces[0][idxPot],forcesX[idxPart]);
diff --git a/Tests/noDist/testSmallCase.cpp b/Tests/noDist/testSmallCase.cpp
index 259b47ca98f85cb6bc14c6ed84c434efc79fbc7f..44b76b0f651318b486b8cd7e522a9427e0a88ba6 100644
--- a/Tests/noDist/testSmallCase.cpp
+++ b/Tests/noDist/testSmallCase.cpp
@@ -164,10 +164,10 @@ int main(int argc, char ** argv){
             FP2PR::MutualParticles(
                 particles[idxTarget].pos.getX(),particles[idxTarget].pos.getY(), particles[idxTarget].pos.getZ(),
                 particles[idxTarget].physicalValue, &particles[idxTarget].forces[0], &particles[idxTarget].forces[1],
-                    &particles[idxTarget].forces[2], &particles[idxTarget].pot,
+                &particles[idxTarget].forces[2], &particles[idxTarget].pot,
                 particles[idxSource].pos.getX(),particles[idxSource].pos.getY(), particles[idxSource].pos.getZ(),
                 particles[idxSource].physicalValue, &particles[idxSource].forces[0], &particles[idxSource].forces[1],
-                    &particles[idxSource].forces[2], &particles[idxSource].pot);
+                &particles[idxSource].forces[2], &particles[idxSource].pot);
         }
     }
 
diff --git a/UTests/utestChebyshevDirectPeriodic.cpp b/UTests/utestChebyshevDirectPeriodic.cpp
index 489ad9c9acf3ecabaa23c279487f9c4a91f7d8f6..c492a74a4c1f8c331c28174a1dc175ffcbd47725 100644
--- a/UTests/utestChebyshevDirectPeriodic.cpp
+++ b/UTests/utestChebyshevDirectPeriodic.cpp
@@ -140,12 +140,12 @@ class TestChebyshevDirect : public FUTester<TestChebyshevDirect> {
                 for(FSize idxOther =  0 ; idxOther < loader.getNumberOfParticles() ; ++idxOther){
                     if(idxTarget != idxOther){
                         FP2P::NonMutualParticles(
-                                particles[idxOther].position.getX(), particles[idxOther].position.getY(),
-                                particles[idxOther].position.getZ(),particles[idxOther].physicalValue,
                                 particles[idxTarget].position.getX(), particles[idxTarget].position.getY(),
                                 particles[idxTarget].position.getZ(),particles[idxTarget].physicalValue,
                                 &particles[idxTarget].forces[0],&particles[idxTarget].forces[1],
                                 &particles[idxTarget].forces[2],&particles[idxTarget].potential,
+                                particles[idxOther].position.getX(), particles[idxOther].position.getY(),
+                                particles[idxOther].position.getZ(),particles[idxOther].physicalValue,
                                 &MatrixKernel);
                     }
 
@@ -170,12 +170,13 @@ class TestChebyshevDirect : public FUTester<TestChebyshevDirect> {
 
                                 source.position += offset;
                                 FP2P::NonMutualParticles(
-                                            source.position.getX(), source.position.getY(),
-                                            source.position.getZ(),source.physicalValue,
                                             particles[idxTarget].position.getX(), particles[idxTarget].position.getY(),
                                             particles[idxTarget].position.getZ(),particles[idxTarget].physicalValue,
                                             &particles[idxTarget].forces[0],&particles[idxTarget].forces[1],
-                                        &particles[idxTarget].forces[2],&particles[idxTarget].potential,&MatrixKernel);
+                                            &particles[idxTarget].forces[2],&particles[idxTarget].potential,
+                                            source.position.getX(), source.position.getY(),
+                                            source.position.getZ(),source.physicalValue,
+                                            &MatrixKernel);
                             }
                         }
                     }
diff --git a/UTests/utestChebyshevDirectTsm.cpp b/UTests/utestChebyshevDirectTsm.cpp
index e9e8951dc35e2be3f10102c26f5332a9730aaff3..c0c32fa5b883214dc8e09397bc070f6ae4bbb56f 100644
--- a/UTests/utestChebyshevDirectTsm.cpp
+++ b/UTests/utestChebyshevDirectTsm.cpp
@@ -105,12 +105,12 @@ class TestChebyshevDirectTsm : public FUTester<TestChebyshevDirectTsm> {
         for(int idxTarget = 0 ; idxTarget < nbTargets ; ++idxTarget){
             for(int idxOther = 0 ; idxOther < nbSources ; ++idxOther){
                 FP2P::NonMutualParticles(
-                        particlesSources[idxOther].getPosition().getX(), particlesSources[idxOther].getPosition().getY(),
-                        particlesSources[idxOther].getPosition().getZ(),particlesSources[idxOther].getPhysicalValue(),
                         particlesTargets[idxTarget].getPosition().getX(), particlesTargets[idxTarget].getPosition().getY(),
                         particlesTargets[idxTarget].getPosition().getZ(),particlesTargets[idxTarget].getPhysicalValue(),
                         &particlesTargets[idxTarget].setForces()[0],&particlesTargets[idxTarget].setForces()[1],
                         &particlesTargets[idxTarget].setForces()[2],particlesTargets[idxTarget].setPotential(),
+                        particlesSources[idxOther].getPosition().getX(), particlesSources[idxOther].getPosition().getY(),
+                        particlesSources[idxOther].getPosition().getZ(),particlesSources[idxOther].getPhysicalValue(),
                         &MatrixKernel);
             }
         }
diff --git a/UTests/utestRotationDirectTsm.cpp b/UTests/utestRotationDirectTsm.cpp
index a963c8a6cb74fdbb645bcd2bb65ce96f45b7dcd6..3ea1f79bedb27a20ce7dd54de70ad497a519ff66 100644
--- a/UTests/utestRotationDirectTsm.cpp
+++ b/UTests/utestRotationDirectTsm.cpp
@@ -104,12 +104,12 @@ class TestRotationDirectTsm : public FUTester<TestRotationDirectTsm> {
 		for(int idxTarget = 0 ; idxTarget < nbTargets ; ++idxTarget){
 			for(int idxOther = 0 ; idxOther < nbSources ; ++idxOther){
                 FP2PR::NonMutualParticles(
-						particlesSources[idxOther].getPosition().getX(), particlesSources[idxOther].getPosition().getY(),
-						particlesSources[idxOther].getPosition().getZ(),particlesSources[idxOther].getPhysicalValue(),
 						particlesTargets[idxTarget].getPosition().getX(), particlesTargets[idxTarget].getPosition().getY(),
-						particlesTargets[idxTarget].getPosition().getZ(),particlesTargets[idxTarget].getPhysicalValue(),
-						&particlesTargets[idxTarget].setForces()[0],&particlesTargets[idxTarget].setForces()[1],
-                        &particlesTargets[idxTarget].setForces()[2],particlesTargets[idxTarget].setPotential());
+                        particlesTargets[idxTarget].getPosition().getZ(),particlesTargets[idxTarget].getPhysicalValue(),
+                        &particlesTargets[idxTarget].setForces()[0],&particlesTargets[idxTarget].setForces()[1],
+                        &particlesTargets[idxTarget].setForces()[2],particlesTargets[idxTarget].setPotential(),
+                        particlesSources[idxOther].getPosition().getX(), particlesSources[idxOther].getPosition().getY(),
+						particlesSources[idxOther].getPosition().getZ(),particlesSources[idxOther].getPhysicalValue());
 			}
 		}