diff --git a/Src/Kernels/Chebyshev/FChebInterpolator.hpp b/Src/Kernels/Chebyshev/FChebInterpolator.hpp
index 4f7ca0f0df496965c5579c1de53c371bacc3d7ee..e0e0fd7142baf5f763395060f9429e5e0d040b90 100644
--- a/Src/Kernels/Chebyshev/FChebInterpolator.hpp
+++ b/Src/Kernels/Chebyshev/FChebInterpolator.hpp
@@ -91,6 +91,144 @@ class FChebInterpolator : FNoCopyable
 	} perm2;
 	////////////////////////////////////////////////////////////////////
 
+	////////////////////////////////////////////////////////////////////
+	// needed for L2P
+	struct IJK2JKI {
+		enum {size = ORDER * ORDER * ORDER};
+		unsigned int ijk[size], jki[size];
+		IJK2JKI() {
+			unsigned int counter = 0;
+			for (unsigned int i=0; i<ORDER; ++i) {
+				for (unsigned int j=0; j<ORDER; ++j) {
+					for (unsigned int k=0; k<ORDER; ++k) {
+						ijk[counter] = k*ORDER*ORDER + j*ORDER + i;
+						jki[counter] = i*ORDER*ORDER + k*ORDER + j;
+						counter++;
+					}
+				}
+			}
+		}
+		void permute(const FReal *const IN, FReal *const OUT) const
+		{ for (unsigned int i=0; i<size; ++i) OUT[jki[i]] = IN[ijk[i]]; }
+	} perm3;
+
+	struct IJK2KIJ {
+		enum {size = ORDER * ORDER * ORDER};
+		unsigned int ijk[size], kij[size];
+		IJK2KIJ() {
+			unsigned int counter = 0;
+			for (unsigned int i=0; i<ORDER; ++i) {
+				for (unsigned int j=0; j<ORDER; ++j) {
+					for (unsigned int k=0; k<ORDER; ++k) {
+						ijk[counter] = k*ORDER*ORDER + j*ORDER + i;
+						kij[counter] = j*ORDER*ORDER + i*ORDER + k;
+						counter++;
+					}
+				}
+			}
+		}
+		void permute(const FReal *const IN, FReal *const OUT) const
+		{ for (unsigned int i=0; i<size; ++i) OUT[kij[i]] = IN[ijk[i]]; }
+	} perm4;
+
+	struct LJK2JKL {
+		enum {size = (ORDER-1) * ORDER * ORDER};
+		unsigned int ljk[size], jkl[size];
+		LJK2JKL() {
+			unsigned int counter = 0;
+			for (unsigned int l=0; l<ORDER-1; ++l) {
+				for (unsigned int j=0; j<ORDER; ++j) {
+					for (unsigned int k=0; k<ORDER; ++k) {
+						ljk[counter] = k*ORDER*(ORDER-1) + j*(ORDER-1) + l;
+						jkl[counter] = l*ORDER*ORDER + k*ORDER + j;
+						counter++;
+					}
+				}
+			}
+		}
+		void permute(const FReal *const IN, FReal *const OUT) const
+		{ for (unsigned int i=0; i<size; ++i) OUT[jkl[i]] = IN[ljk[i]]; }
+	} perm5;
+
+	struct LJK2KLJ {
+		enum {size = (ORDER-1) * ORDER * ORDER};
+		unsigned int ljk[size], klj[size];
+		LJK2KLJ() {
+			unsigned int counter = 0;
+			for (unsigned int l=0; l<ORDER-1; ++l) {
+				for (unsigned int j=0; j<ORDER; ++j) {
+					for (unsigned int k=0; k<ORDER; ++k) {
+						ljk[counter] = k*ORDER*(ORDER-1) + j*(ORDER-1) + l;
+						klj[counter] = j*(ORDER-1)*ORDER + l*ORDER + k;
+						counter++;
+					}
+				}
+			}
+		}
+		void permute(const FReal *const IN, FReal *const OUT) const
+		{ for (unsigned int i=0; i<size; ++i) OUT[klj[i]] = IN[ljk[i]]; }
+	} perm6;
+
+	struct MKI2KIM {
+		enum {size = (ORDER-1) * ORDER * ORDER};
+		unsigned int mki[size], kim[size];
+		MKI2KIM() {
+			unsigned int counter = 0;
+			for (unsigned int m=0; m<ORDER-1; ++m) {
+				for (unsigned int k=0; k<ORDER; ++k) {
+					for (unsigned int i=0; i<ORDER; ++i) {
+						mki[counter] = i*ORDER*(ORDER-1) + k*(ORDER-1) + m;
+						kim[counter] = m*ORDER*ORDER + i*ORDER + k;
+						counter++;
+					}
+				}
+			}
+		}
+		void permute(const FReal *const IN, FReal *const OUT) const
+		{ for (unsigned int i=0; i<size; ++i) OUT[kim[i]] = IN[mki[i]]; }
+	} perm7;
+
+	struct MKL2KLM {
+		enum {size = (ORDER-1) * ORDER * (ORDER-1)};
+		unsigned int mkl[size], klm[size];
+		MKL2KLM() {
+			unsigned int counter = 0;
+			for (unsigned int m=0; m<ORDER-1; ++m) {
+				for (unsigned int k=0; k<ORDER; ++k) {
+					for (unsigned int l=0; l<ORDER-1; ++l) {
+						mkl[counter] = l*ORDER*(ORDER-1) + k*(ORDER-1) + m;
+						klm[counter] = m*(ORDER-1)*ORDER + l*ORDER + k;
+						counter++;
+					}
+				}
+			}
+		}
+		void permute(const FReal *const IN, FReal *const OUT) const
+		{ for (unsigned int i=0; i<size; ++i) OUT[klm[i]] = IN[mkl[i]]; }
+	} perm8;
+
+	struct NLM2LMN {
+		enum {size = (ORDER-1) * (ORDER-1) * (ORDER-1)};
+		unsigned int nlm[size], lmn[size];
+		NLM2LMN() {
+			unsigned int counter = 0;
+			for (unsigned int n=0; n<ORDER-1; ++n) {
+				for (unsigned int l=0; l<ORDER-1; ++l) {
+					for (unsigned int m=0; m<ORDER-1; ++m) {
+						nlm[counter] = m*(ORDER-1)*(ORDER-1) + l*(ORDER-1) + n;
+						lmn[counter] = n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l;
+						counter++;
+					}
+				}
+			}
+		}
+		void permute(const FReal *const IN, FReal *const OUT) const
+		{ for (unsigned int i=0; i<size; ++i) OUT[lmn[i]] = IN[nlm[i]]; }
+	} perm9;
+
+	////////////////////////////////////////////////////////////////////
+
+
 
 	/**
 	 * Initialize the child - parent - interpolator, it is basically the matrix
@@ -590,9 +728,6 @@ inline void FChebInterpolator<ORDER>::applyP2M(const FPoint& center,
 
 
 
-
-
-
 /**
  * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
  */
@@ -603,190 +738,314 @@ inline void FChebInterpolator<ORDER>::applyL2P(const FPoint& center,
 																							 const FReal *const localExpansion,
 																							 ContainerClass *const localParticles) const
 {
-	// allocate stuff
+	FReal f1;
+	FReal W2[3][ ORDER-1];
+	FReal W4[3][(ORDER-1)*(ORDER-1)];
+	FReal W8[   (ORDER-1)*(ORDER-1)*(ORDER-1)];
+	{ // sum over interpolation points
+		f1 = FReal(0.);
+		for(unsigned int i=0; i<ORDER-1; ++i)	                   W2[0][i] = W2[1][i] = W2[2][i] = FReal(0.);
+		for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i)        W4[0][i] = W4[1][i] = W4[2][i] = FReal(0.);
+		for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i)	W8[i] = FReal(0.);
+		
+		for (unsigned int idx=0; idx<nnodes; ++idx) {
+			const unsigned int i = node_ids[idx][0];
+			const unsigned int j = node_ids[idx][1];
+			const unsigned int k = node_ids[idx][2];
+			
+			f1 += localExpansion[idx]; // 1 flop
+
+			for (unsigned int l=0; l<ORDER-1; ++l) {
+				const FReal wx = T[l*ORDER+i] * localExpansion[idx]; // 1 flops
+				const FReal wy = T[l*ORDER+j] * localExpansion[idx]; // 1 flops
+				const FReal wz = T[l*ORDER+k] * localExpansion[idx]; // 1 flops
+				W2[0][l] += wx; // 1 flops
+				W2[1][l] += wy; // 1 flops
+				W2[2][l] += wz; // 1 flops
+				for (unsigned int m=0; m<ORDER-1; ++m) {
+					const FReal wxy = wx * T[m*ORDER + j]; // 1 flops
+					const FReal wxz = wx * T[m*ORDER + k]; // 1 flops
+					const FReal wyz = wy * T[m*ORDER + k]; // 1 flops
+					W4[0][m*(ORDER-1)+l] += wxy; // 1 flops
+					W4[1][m*(ORDER-1)+l] += wxz; // 1 flops
+					W4[2][m*(ORDER-1)+l] += wyz; // 1 flops
+					for (unsigned int n=0; n<ORDER-1; ++n) {
+						const FReal wxyz = wxy * T[n*ORDER + k]; // 1 flops
+						W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l]	+= wxyz; // 1 flops
+					} // (ORDER-1) * 2 flops
+				} // (ORDER-1) * (6 + (ORDER-1)*2) flops
+			} // (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2)) flops
+		} // ORDER*ORDER*ORDER * (1 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2))) flops
+		
+	}
+
+
+	// loop over particles
 	const map_glob_loc map(center, width);
 	FPoint localPosition;
-	FReal T_of_x[ORDER][3];
-	FReal xpx,ypy,zpz ;
-	FReal S[3],c1;
-	//
-	c1 = FReal(8.) / nnodes ;
 	typename ContainerClass::BasicIterator iter(*localParticles);
 	while(iter.hasNotFinished()){
 			
 		// map global position to [-1,1]
-		map(iter.data().getPosition(), localPosition);
-
-		// evaluate chebyshev polynomials of source particle: T_o(x_i)
-		T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
-		T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
-		T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
-		xpx = FReal(2.) * localPosition.getX() ;
-		ypy = FReal(2.) * localPosition.getY() ;
-		zpz = FReal(2.) * localPosition.getZ() ;
-		for (unsigned int o=2; o<ORDER; ++o) {
-			T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0];
-			T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1];
-			T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2];
+		map(iter.data().getPosition(), localPosition); // 15 flops
+
+		FReal T_of_x[3][ORDER];
+		{
+			T_of_x[0][0] = FReal(1.); T_of_x[0][1] = localPosition.getX();
+			T_of_x[1][0] = FReal(1.); T_of_x[1][1] = localPosition.getY();
+			T_of_x[2][0] = FReal(1.); T_of_x[2][1] = localPosition.getZ();
+			const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop
+			const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop
+			const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop
+			for (unsigned int j=2; j<ORDER; ++j) {
+				T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops
+				T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops
+				T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops
+			}
 		}
 
 		// interpolate and increment target value
 		FReal targetValue = iter.data().getPotential();
-		for (unsigned int n=0; n<nnodes; ++n) {
-			const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
-			S[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
-			S[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
-			S[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
-			for (unsigned int o=2; o<ORDER; ++o) {
-				S[0] += T_of_x[o][0] * T_of_roots[o][j[0]];
-				S[1] += T_of_x[o][1] * T_of_roots[o][j[1]];
-				S[2] += T_of_x[o][2] * T_of_roots[o][j[2]];
+		{
+			FReal f2, f4, f8;
+			{
+				f2 = f4 = f8 = FReal(0.);
+				for (unsigned int l=1; l<ORDER; ++l) {
+					f2 +=
+						T_of_x[0][l] * W2[0][l-1] +
+						T_of_x[1][l] * W2[1][l-1] +
+						T_of_x[2][l] * W2[2][l-1]; // 6 flops
+					for (unsigned int m=1; m<ORDER; ++m) {
+						f4 +=
+							T_of_x[0][l] * T_of_x[1][m] * W4[0][(m-1)*(ORDER-1)+(l-1)] +
+							T_of_x[0][l] * T_of_x[2][m] * W4[1][(m-1)*(ORDER-1)+(l-1)] +
+							T_of_x[1][l] * T_of_x[2][m] * W4[2][(m-1)*(ORDER-1)+(l-1)]; // 9 flops
+						for (unsigned int n=1; n<ORDER; ++n) {
+							f8 +=
+								T_of_x[0][l] * T_of_x[1][m] * T_of_x[2][n] *
+								W8[(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)];
+						} // ORDER * 4 flops
+					} // ORDER * (9 + ORDER * 4) flops
+				} // ORDER * (ORDER * (9 + ORDER * 4)) flops
 			}
-			// gather contributions
-			// S[0] *= FReal(2.); S[0] += FReal(1.);
-			// S[1] *= FReal(2.); S[1] += FReal(1.);
-			// S[2] *= FReal(2.); S[2] += FReal(1.);
-			// targetValue	+= S[0] * S[1] * S[2] * localExpansion[n];
-			S[0] += FReal(0.5);
-			S[1] += FReal(0.5);
-			S[2] += FReal(0.5);
-			//
-			targetValue	+= S[0] * S[1] * S[2] * localExpansion[n];
-		}
-		// scale
-		//		targetValue /= nnodes;
-		targetValue *= c1;
+			targetValue = (f1 + FReal(2.)*f2 + FReal(4.)*f4 + FReal(8.)*f8) / nnodes; // 7 flops
+		} // 7 + ORDER * (ORDER * (9 + ORDER * 4)) flops
 
 		// set potential
 		iter.data().setPotential(targetValue);
+
 		// increment target iterator
 		iter.gotoNext();
-	}
+	} // N * (7 + ORDER * (ORDER * (9 + ORDER * 4))) flops
 }
 
 
+//	FReal F2[3][ORDER-1];
+//	FBlas::gemtv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), const_cast<FReal*>(localExpansion), F2[0]);
+//	for (unsigned int i=1; i<ORDER*ORDER; ++i)
+//		FBlas::gemtva(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T),
+//									const_cast<FReal*>(localExpansion) + ORDER*i, F2[0]);
+//	for (unsigned int i=0; i<ORDER-1; ++i)
+//		std::cout << W2[0][i] << "\t" << F2[0][i] << std::endl;
 
+//	FReal F2[(ORDER-1) * ORDER*ORDER];
+//	FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER,
+//							 const_cast<FReal*>(localExpansion), ORDER, F2, ORDER-1);
+//	FReal F[ORDER-1]; FBlas::setzero(ORDER-1, F);
+//	for (unsigned int i=0; i<ORDER-1; ++i)
+//		for (unsigned int j=0; j<ORDER*ORDER; ++j) F[i] += F2[j*(ORDER-1) + i];
+//	for (unsigned int i=0; i<ORDER-1; ++i)
+//		std::cout << W2[0][i] << "\t" << F[i] << std::endl;
 
 
+///**
+// * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
+// */
+//template <int ORDER>
+//template <class ContainerClass>
+//inline void FChebInterpolator<ORDER>::applyL2P(const FPoint& center,
+//																							 const FReal width,
+//																							 const FReal *const localExpansion,
+//																							 ContainerClass *const localParticles) const
+//{
+//	// allocate stuff
+//	const map_glob_loc map(center, width);
+//	FPoint localPosition;
+//	FReal T_of_x[ORDER][3];
+//	FReal xpx,ypy,zpz ;
+//	FReal S[3],c1;
+//	//
+//	c1 = FReal(8.) / nnodes ;
+//	typename ContainerClass::BasicIterator iter(*localParticles);
+//	while(iter.hasNotFinished()){
+//			
+//		// map global position to [-1,1]
+//		map(iter.data().getPosition(), localPosition); // 15 flops
+//
+//		// evaluate chebyshev polynomials of source particle: T_o(x_i)
+//		T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
+//		T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
+//		T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
+//		xpx = FReal(2.) * localPosition.getX() ; // 1 flop
+//		ypy = FReal(2.) * localPosition.getY() ; // 1 flop
+//		zpz = FReal(2.) * localPosition.getZ() ; // 1 flop
+//		for (unsigned int o=2; o<ORDER; ++o) {
+//			T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flop
+//			T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flop
+//			T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flop
+//		} // (ORDER-2) * 6 flops
+//
+//		// interpolate and increment target value
+//		FReal targetValue = iter.data().getPotential();
+//		for (unsigned int n=0; n<nnodes; ++n) {
+//			const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
+//			S[0] = T_of_x[1][0] * T_of_roots[1][j[0]]; // 1 flops
+//			S[1] = T_of_x[1][1] * T_of_roots[1][j[1]]; // 1 flops
+//			S[2] = T_of_x[1][2] * T_of_roots[1][j[2]]; // 1 flops
+//			for (unsigned int o=2; o<ORDER; ++o) {
+//				S[0] += T_of_x[o][0] * T_of_roots[o][j[0]]; // 2 flops
+//				S[1] += T_of_x[o][1] * T_of_roots[o][j[1]]; // 2 flops
+//				S[2] += T_of_x[o][2] * T_of_roots[o][j[2]]; // 2 flops
+//			} // (ORDER-2) * 6 flops 
+//			// gather contributions
+//			S[0] += FReal(0.5); // 1 flops
+//			S[1] += FReal(0.5); // 1 flops
+//			S[2] += FReal(0.5); // 1 flops
+//			targetValue	+= S[0] * S[1] * S[2] * localExpansion[n]; // 4 flops
+//		} // ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6) flops
+//		// scale
+//		targetValue *= c1; // 1 flops
+//
+//		// set potential
+//		iter.data().setPotential(targetValue);
+//
+//		// increment target iterator
+//		iter.gotoNext();
+//	} // N * ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6) flops
+//}
 
-/**
- * Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
- */
-template <int ORDER>
-template <class ContainerClass>
-inline void FChebInterpolator<ORDER>::applyL2PGradient(const FPoint& center,
-																											 const FReal width,
-																											 const FReal *const localExpansion,
-																											 ContainerClass *const localParticles) const
-{
-	// setup local to global mapping
-	const map_glob_loc map(center, width);
-	FPoint Jacobian;
-	map.computeJacobian(Jacobian);
-	const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()}; 
-	FPoint localPosition;
-	FReal T_of_x[ORDER][3];
-	FReal U_of_x[ORDER][3];
-	FReal P[3];
-
-	typename ContainerClass::BasicIterator iter(*localParticles);
-	while(iter.hasNotFinished()){
-			
-		// map global position to [-1,1]
-		map(iter.data().getPosition(), localPosition);
-			
-		// evaluate chebyshev polynomials of source particle
-		// T_0(x_i) and T_1(x_i)
-		T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
-		T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
-		T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
-		// U_0(x_i) and U_1(x_i)
-		U_of_x[0][0] = FReal(1.);	U_of_x[1][0] = localPosition.getX() * FReal(2.);
-		U_of_x[0][1] = FReal(1.);	U_of_x[1][1] = localPosition.getY() * FReal(2.);
-		U_of_x[0][2] = FReal(1.);	U_of_x[1][2] = localPosition.getZ() * FReal(2.);
-		for (unsigned int o=2; o<ORDER; ++o) {
-			// T_o(x_i)
-			T_of_x[o][0] = FReal(2.)*localPosition.getX()*T_of_x[o-1][0] - T_of_x[o-2][0];
-			T_of_x[o][1] = FReal(2.)*localPosition.getY()*T_of_x[o-1][1] - T_of_x[o-2][1];
-			T_of_x[o][2] = FReal(2.)*localPosition.getZ()*T_of_x[o-1][2] - T_of_x[o-2][2];
-			// U_o(x_i)
-			U_of_x[o][0] = FReal(2.)*localPosition.getX()*U_of_x[o-1][0] - U_of_x[o-2][0];
-			U_of_x[o][1] = FReal(2.)*localPosition.getY()*U_of_x[o-1][1] - U_of_x[o-2][1];
-			U_of_x[o][2] = FReal(2.)*localPosition.getZ()*U_of_x[o-1][2] - U_of_x[o-2][2];
-		}
-
-		// scale, because dT_o/dx = oU_{o-1}
-		for (unsigned int o=2; o<ORDER; ++o) {
-			U_of_x[o-1][0] *= FReal(o);
-			U_of_x[o-1][1] *= FReal(o);
-			U_of_x[o-1][2] *= FReal(o);
-		}
 
-		// apply P and increment forces
-		FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)};
-		for (unsigned int n=0; n<nnodes; ++n) {
-			
-			// tensor indices of chebyshev nodes
-			const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
-
-			// f0 component //////////////////////////////////////
-			P[0] = U_of_x[0][0] * T_of_roots[1][j[0]];
-			P[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
-			P[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
-			for (unsigned int o=2; o<ORDER; ++o) {
-				P[0] += U_of_x[o-1][0] * T_of_roots[o][j[0]];
-				P[1] += T_of_x[o  ][1] * T_of_roots[o][j[1]];
-				P[2] += T_of_x[o  ][2] * T_of_roots[o][j[2]];
-			}
-			P[0] *= FReal(2.);
-			P[1] *= FReal(2.); P[1] += FReal(1.);
-			P[2] *= FReal(2.); P[2] += FReal(1.);
-			forces[0]	+= P[0] * P[1] * P[2] * localExpansion[n];
-
-			// f1 component //////////////////////////////////////
-			P[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
-			P[1] = U_of_x[0][1] * T_of_roots[1][j[1]];
-			P[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
-			for (unsigned int o=2; o<ORDER; ++o) {
-				P[0] += T_of_x[o  ][0] * T_of_roots[o][j[0]];
-				P[1] += U_of_x[o-1][1] * T_of_roots[o][j[1]];
-				P[2] += T_of_x[o  ][2] * T_of_roots[o][j[2]];
-			}
-			P[0] *= FReal(2.); P[0] += FReal(1.);
-			P[1] *= FReal(2.); 
-			P[2] *= FReal(2.); P[2] += FReal(1.);
-			forces[1]	+= P[0] * P[1] * P[2] * localExpansion[n];
-
-			// f2 component //////////////////////////////////////
-			P[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
-			P[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
-			P[2] = U_of_x[0][2] * T_of_roots[1][j[2]];
-			for (unsigned int o=2; o<ORDER; ++o) {
-				P[0] += T_of_x[o  ][0] * T_of_roots[o][j[0]];
-				P[1] += T_of_x[o  ][1] * T_of_roots[o][j[1]];
-				P[2] += U_of_x[o-1][2] * T_of_roots[o][j[2]];
-			}
-			P[0] *= FReal(2.); P[0] += FReal(1.);
-			P[1] *= FReal(2.); P[1] += FReal(1.);
-			P[2] *= FReal(2.);
-			forces[2]	+= P[0] * P[1] * P[2] * localExpansion[n];
-		}
 
-		// scale forces
-		forces[0] *= jacobian[0] / nnodes;
-		forces[1] *= jacobian[1] / nnodes;
-		forces[2] *= jacobian[2] / nnodes;
 
-		// set computed forces
-		iter.data().incForces(forces[0] * iter.data().getPhysicalValue(),
-													forces[1] * iter.data().getPhysicalValue(),
-													forces[2] * iter.data().getPhysicalValue());
 
-		// increment iterator
-		iter.gotoNext();
-	}
-}
 
+///**
+// * Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
+// */
+//template <int ORDER>
+//template <class ContainerClass>
+//inline void FChebInterpolator<ORDER>::applyL2PGradient(const FPoint& center,
+//																											 const FReal width,
+//																											 const FReal *const localExpansion,
+//																											 ContainerClass *const localParticles) const
+//{
+//	// setup local to global mapping
+//	const map_glob_loc map(center, width);
+//	FPoint Jacobian;
+//	map.computeJacobian(Jacobian);
+//	const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()}; 
+//	FPoint localPosition;
+//	FReal T_of_x[ORDER][3];
+//	FReal U_of_x[ORDER][3];
+//	FReal P[3];
+//
+//	typename ContainerClass::BasicIterator iter(*localParticles);
+//	while(iter.hasNotFinished()){
+//			
+//		// map global position to [-1,1]
+//		map(iter.data().getPosition(), localPosition);
+//			
+//		// evaluate chebyshev polynomials of source particle
+//		// T_0(x_i) and T_1(x_i)
+//		T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
+//		T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
+//		T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
+//		// U_0(x_i) and U_1(x_i)
+//		U_of_x[0][0] = FReal(1.);	U_of_x[1][0] = localPosition.getX() * FReal(2.);
+//		U_of_x[0][1] = FReal(1.);	U_of_x[1][1] = localPosition.getY() * FReal(2.);
+//		U_of_x[0][2] = FReal(1.);	U_of_x[1][2] = localPosition.getZ() * FReal(2.);
+//		for (unsigned int o=2; o<ORDER; ++o) {
+//			// T_o(x_i)
+//			T_of_x[o][0] = FReal(2.)*localPosition.getX()*T_of_x[o-1][0] - T_of_x[o-2][0];
+//			T_of_x[o][1] = FReal(2.)*localPosition.getY()*T_of_x[o-1][1] - T_of_x[o-2][1];
+//			T_of_x[o][2] = FReal(2.)*localPosition.getZ()*T_of_x[o-1][2] - T_of_x[o-2][2];
+//			// U_o(x_i)
+//			U_of_x[o][0] = FReal(2.)*localPosition.getX()*U_of_x[o-1][0] - U_of_x[o-2][0];
+//			U_of_x[o][1] = FReal(2.)*localPosition.getY()*U_of_x[o-1][1] - U_of_x[o-2][1];
+//			U_of_x[o][2] = FReal(2.)*localPosition.getZ()*U_of_x[o-1][2] - U_of_x[o-2][2];
+//		}
+//
+//		// scale, because dT_o/dx = oU_{o-1}
+//		for (unsigned int o=2; o<ORDER; ++o) {
+//			U_of_x[o-1][0] *= FReal(o);
+//			U_of_x[o-1][1] *= FReal(o);
+//			U_of_x[o-1][2] *= FReal(o);
+//		}
+//
+//		// apply P and increment forces
+//		FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)};
+//		for (unsigned int n=0; n<nnodes; ++n) {
+//			
+//			// tensor indices of chebyshev nodes
+//			const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
+//
+//			// f0 component //////////////////////////////////////
+//			P[0] = U_of_x[0][0] * T_of_roots[1][j[0]];
+//			P[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
+//			P[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
+//			for (unsigned int o=2; o<ORDER; ++o) {
+//				P[0] += U_of_x[o-1][0] * T_of_roots[o][j[0]];
+//				P[1] += T_of_x[o  ][1] * T_of_roots[o][j[1]];
+//				P[2] += T_of_x[o  ][2] * T_of_roots[o][j[2]];
+//			}
+//			P[0] *= FReal(2.);
+//			P[1] *= FReal(2.); P[1] += FReal(1.);
+//			P[2] *= FReal(2.); P[2] += FReal(1.);
+//			forces[0]	+= P[0] * P[1] * P[2] * localExpansion[n];
+//
+//			// f1 component //////////////////////////////////////
+//			P[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
+//			P[1] = U_of_x[0][1] * T_of_roots[1][j[1]];
+//			P[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
+//			for (unsigned int o=2; o<ORDER; ++o) {
+//				P[0] += T_of_x[o  ][0] * T_of_roots[o][j[0]];
+//				P[1] += U_of_x[o-1][1] * T_of_roots[o][j[1]];
+//				P[2] += T_of_x[o  ][2] * T_of_roots[o][j[2]];
+//			}
+//			P[0] *= FReal(2.); P[0] += FReal(1.);
+//			P[1] *= FReal(2.); 
+//			P[2] *= FReal(2.); P[2] += FReal(1.);
+//			forces[1]	+= P[0] * P[1] * P[2] * localExpansion[n];
+//
+//			// f2 component //////////////////////////////////////
+//			P[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
+//			P[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
+//			P[2] = U_of_x[0][2] * T_of_roots[1][j[2]];
+//			for (unsigned int o=2; o<ORDER; ++o) {
+//				P[0] += T_of_x[o  ][0] * T_of_roots[o][j[0]];
+//				P[1] += T_of_x[o  ][1] * T_of_roots[o][j[1]];
+//				P[2] += U_of_x[o-1][2] * T_of_roots[o][j[2]];
+//			}
+//			P[0] *= FReal(2.); P[0] += FReal(1.);
+//			P[1] *= FReal(2.); P[1] += FReal(1.);
+//			P[2] *= FReal(2.);
+//			forces[2]	+= P[0] * P[1] * P[2] * localExpansion[n];
+//		}
+//
+//		// scale forces
+//		forces[0] *= jacobian[0] / nnodes;
+//		forces[1] *= jacobian[1] / nnodes;
+//		forces[2] *= jacobian[2] / nnodes;
+//
+//		// set computed forces
+//		iter.data().incForces(forces[0] * iter.data().getPhysicalValue(),
+//													forces[1] * iter.data().getPhysicalValue(),
+//													forces[2] * iter.data().getPhysicalValue());
+//
+//		// increment iterator
+//		iter.gotoNext();
+//	}
+//}
 
 
 /**
@@ -800,116 +1059,312 @@ inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center,
 																										const FReal *const localExpansion,
 																										ContainerClass *const localParticles) const
 {
-	// setup local to global mapping
+	FReal f1;
+	FReal W2[3][ ORDER-1];
+	FReal W4[3][(ORDER-1)*(ORDER-1)];
+	FReal W8[   (ORDER-1)*(ORDER-1)*(ORDER-1)];
+
+	//{ // sum over interpolation points
+	//	f1 = FReal(0.);
+	//	for(unsigned int i=0; i<ORDER-1; ++i)	                   W2[0][i] = W2[1][i] = W2[2][i] = FReal(0.);
+	//	for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i)        W4[0][i] = W4[1][i] = W4[2][i] = FReal(0.);
+	//	for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i)	W8[i] = FReal(0.);
+	//	
+	//	for (unsigned int idx=0; idx<nnodes; ++idx) {
+	//		const unsigned int i = node_ids[idx][0];
+	//		const unsigned int j = node_ids[idx][1];
+	//		const unsigned int k = node_ids[idx][2];
+	//		
+	//		f1 += localExpansion[idx]; // 1 flop
+	//		
+	//		for (unsigned int l=0; l<ORDER-1; ++l) {
+	//			const FReal wx = T[l*ORDER+i] * localExpansion[idx]; // 1 flops
+	//			const FReal wy = T[l*ORDER+j] * localExpansion[idx]; // 1 flops
+	//			const FReal wz = T[l*ORDER+k] * localExpansion[idx]; // 1 flops
+	//			W2[0][l] += wx; // 1 flops
+	//			W2[1][l] += wy; // 1 flops
+	//			W2[2][l] += wz; // 1 flops
+	//			for (unsigned int m=0; m<ORDER-1; ++m) {
+	//				const FReal wxy = wx * T[m*ORDER + j]; // 1 flops
+	//				const FReal wxz = wx * T[m*ORDER + k]; // 1 flops
+	//				const FReal wyz = wy * T[m*ORDER + k]; // 1 flops
+	//				W4[0][m*(ORDER-1)+l] += wxy; // 1 flops
+	//				W4[1][m*(ORDER-1)+l] += wxz; // 1 flops
+	//				W4[2][m*(ORDER-1)+l] += wyz; // 1 flops
+	//				for (unsigned int n=0; n<ORDER-1; ++n) {
+	//					const FReal wxyz = wxy * T[n*ORDER + k]; // 1 flops
+	//					W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l]	+= wxyz; // 1 flops
+	//				} // (ORDER-1) * 2 flops
+	//			} // (ORDER-1) * (6 + (ORDER-1)*2) flops
+	//		} // (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2)) flops
+	//
+	//	} // ORDER*ORDER*ORDER * (1 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2))) flops
+	//	
+	//}
+
+	{
+		// for W2
+		FReal lE[nnodes];
+		FReal F2[(ORDER-1) * ORDER*ORDER];
+		// for W4
+		FReal F4[ORDER * ORDER*(ORDER-1)];
+		FReal G4[(ORDER-1) * ORDER*(ORDER-1)];
+		// for W8
+		FReal G8[ORDER * (ORDER-1)*(ORDER-1)];
+
+		// sum local expansions
+		f1 = FReal(0.);
+		for (unsigned int idx=0; idx<nnodes; ++idx)	f1 += localExpansion[idx]; // 1 flop
+
+		//////////////////////////////////////////////////////////////////
+		// IMPORTANT: NOT CHANGE ORDER OF COMPUTATIONS!!! ////////////////
+		//////////////////////////////////////////////////////////////////
+
+		// W2[0] /////////////////
+		FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER,
+								 const_cast<FReal*>(localExpansion), ORDER, F2, ORDER-1);
+		for (unsigned int l=0; l<ORDER-1; ++l) { W2[0][l] = F2[l];
+			for (unsigned int j=1; j<ORDER*ORDER; ++j) W2[0][l] += F2[j*(ORDER-1) + l];	}
+		// W4[0] /////////////////
+		perm5.permute(F2, F4);
+		FBlas::gemtm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1);
+		for (unsigned int l=0; l<ORDER-1; ++l)
+			for (unsigned int m=0; m<ORDER-1; ++m) { W4[0][m*(ORDER-1)+l] = G4[l*ORDER*(ORDER-1) + m];
+				for (unsigned int k=1; k<ORDER; ++k) W4[0][m*(ORDER-1)+l] += G4[l*ORDER*(ORDER-1) + k*(ORDER-1) + m];	}
+		// W8 ////////////////////
+		perm8.permute(G4, G8);
+		FReal F8[(ORDER-1)*(ORDER-1)*(ORDER-1)];
+		FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, G8, ORDER, F8, ORDER-1);
+		perm9.permute(F8, W8);
+		// W4[1] /////////////////
+		perm6.permute(F2, F4);
+		FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1);
+		for (unsigned int l=0; l<ORDER-1; ++l)
+			for (unsigned int n=0; n<ORDER-1; ++n) { W4[1][n*(ORDER-1)+l] = G4[l*(ORDER-1) + n];
+				for (unsigned int j=1; j<ORDER; ++j) W4[1][n*(ORDER-1)+l] += G4[j*(ORDER-1)*(ORDER-1) + l*(ORDER-1) + n];	}
+		// W2[1] /////////////////
+		perm3.permute(localExpansion, lE);
+		FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, lE, ORDER, F2, ORDER-1);
+		for (unsigned int i=0; i<ORDER-1; ++i) { W2[1][i] = F2[i];
+			for (unsigned int j=1; j<ORDER*ORDER; ++j) W2[1][i] += F2[j*(ORDER-1) + i]; }
+		// W4[2] /////////////////
+		perm7.permute(F2, F4);
+		FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1);
+		for (unsigned int m=0; m<ORDER-1; ++m)
+			for (unsigned int n=0; n<ORDER-1; ++n) { W4[2][n*(ORDER-1)+m] = G4[m*ORDER*(ORDER-1) + n];
+				for (unsigned int i=1; i<ORDER; ++i) W4[2][n*(ORDER-1)+m] += G4[m*ORDER*(ORDER-1) + i*(ORDER-1) + n];	}
+		// W2[2] /////////////////
+		perm4.permute(localExpansion, lE);
+		FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, lE, ORDER, F2, ORDER-1);
+		for (unsigned int i=0; i<ORDER-1; ++i) { W2[2][i] = F2[i];
+			for (unsigned int j=1; j<ORDER*ORDER; ++j) W2[2][i] += F2[j*(ORDER-1) + i]; }
+	}
+
+	
+	// loop over particles
 	const map_glob_loc map(center, width);
 	FPoint Jacobian;
 	map.computeJacobian(Jacobian); // 6 flops
 	const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()}; 
 	FPoint localPosition;
-	FReal T_of_x[ORDER][3];
-	FReal U_of_x[ORDER][3];
-	FReal P[6];
-	//
-	FReal xpx,ypy,zpz ;
-	FReal c1 = FReal(8.0) / nnodes; // 1 flop
-	//
+
 	typename ContainerClass::BasicIterator iter(*localParticles);
 	while(iter.hasNotFinished()){
 			
 		// map global position to [-1,1]
 		map(iter.data().getPosition(), localPosition); // 15 flops
+
+		FReal U_of_x[3][ORDER];
+		FReal T_of_x[3][ORDER];
+		{
+			T_of_x[0][0] = FReal(1.); T_of_x[0][1] = localPosition.getX();
+			T_of_x[1][0] = FReal(1.); T_of_x[1][1] = localPosition.getY();
+			T_of_x[2][0] = FReal(1.); T_of_x[2][1] = localPosition.getZ();
+			const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop
+			const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop
+			const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop
+			U_of_x[0][0] = FReal(1.);	U_of_x[0][1] = x2;
+			U_of_x[1][0] = FReal(1.);	U_of_x[1][1] = y2;
+			U_of_x[2][0] = FReal(1.);	U_of_x[2][1] = z2;
+			for (unsigned int j=2; j<ORDER; ++j) {
+				T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops
+				T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops
+				T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops
+				U_of_x[0][j] = x2 * U_of_x[0][j-1] - U_of_x[0][j-2]; // 2 flops
+				U_of_x[1][j] = y2 * U_of_x[1][j-1] - U_of_x[1][j-2]; // 2 flops
+				U_of_x[2][j] = z2 * U_of_x[2][j-1] - U_of_x[2][j-2]; // 2 flops
+			}
 			
-		// evaluate chebyshev polynomials of source particle
-		// T_0(x_i) and T_1(x_i)
-		xpx = FReal(2.) * localPosition.getX(); // 1 flop
-		ypy = FReal(2.) * localPosition.getY(); // 1 flop
-		zpz = FReal(2.) * localPosition.getZ(); // 1 flop
-		//
-		T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
-		T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
-		T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
-		// U_0(x_i) and U_1(x_i)
-		// U_of_x[0][0] = FReal(1.);	U_of_x[1][0] = localPosition.getX() * FReal(2.);
-		// U_of_x[0][1] = FReal(1.);	U_of_x[1][1] = localPosition.getY() * FReal(2.);
-		// U_of_x[0][2] = FReal(1.);	U_of_x[1][2] = localPosition.getZ() * FReal(2.);
-		U_of_x[0][0] = FReal(1.);	U_of_x[1][0] = xpx;
-		U_of_x[0][1] = FReal(1.);	U_of_x[1][1] = ypy;
-		U_of_x[0][2] = FReal(1.);	U_of_x[1][2] = zpz;
-		for (unsigned int o=2; o<ORDER; ++o) {
-			// T_o(x_i)
-			// T_of_x[o][0] = FReal(2.)*localPosition.getX()*T_of_x[o-1][0] - T_of_x[o-2][0];
-			// T_of_x[o][1] = FReal(2.)*localPosition.getY()*T_of_x[o-1][1] - T_of_x[o-2][1];
-			// T_of_x[o][2] = FReal(2.)*localPosition.getZ()*T_of_x[o-1][2] - T_of_x[o-2][2];
-			// // U_o(x_i)
-			// U_of_x[o][0] = FReal(2.)*localPosition.getX()*U_of_x[o-1][0] - U_of_x[o-2][0];
-			// U_of_x[o][1] = FReal(2.)*localPosition.getY()*U_of_x[o-1][1] - U_of_x[o-2][1];
-			// U_of_x[o][2] = FReal(2.)*localPosition.getZ()*U_of_x[o-1][2] - U_of_x[o-2][2];
-			T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flops 
-			T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flops
-			T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flops
-			// U_o(x_i)
-			U_of_x[o][0] = xpx * U_of_x[o-1][0] - U_of_x[o-2][0]; // 2 flops
-			U_of_x[o][1] = ypy * U_of_x[o-1][1] - U_of_x[o-2][1]; // 2 flops
-			U_of_x[o][2] = zpz * U_of_x[o-1][2] - U_of_x[o-2][2]; // 2 flops
-		}
+			// scale, because dT_j/dx = jU_{j-1}
+			for (unsigned int j=2; j<ORDER; ++j) {
+				U_of_x[0][j-1] *= FReal(j); // 1 flops
+				U_of_x[1][j-1] *= FReal(j); // 1 flops
+				U_of_x[2][j-1] *= FReal(j); // 1 flops
+			}
 
-		// scale, because dT_o/dx = oU_{o-1}
-		for (unsigned int o=2; o<ORDER; ++o) {
-			U_of_x[o-1][0] *= FReal(o); // 1 flops
-			U_of_x[o-1][1] *= FReal(o); // 1 flops
-			U_of_x[o-1][2] *= FReal(o); // 1 flops
 		}
 
 		// apply P and increment forces
 		FReal potential = FReal(0.);
 		FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)};
-		//
-		// Optimization:
-		//   Here we compute 1/2 S and 1/2 P  rather S and F like in the paper
-		for (unsigned int n=0; n<nnodes; ++n) {
-		  
-		  // tensor indices of chebyshev nodes
-		  const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
-		  //
-		  P[0] = FReal(0.5) + T_of_x[1][0] * T_of_roots[1][j[0]]; // 2 flops 
-		  P[1] = FReal(0.5) + T_of_x[1][1] * T_of_roots[1][j[1]]; // 2 flops
-		  P[2] = FReal(0.5) + T_of_x[1][2] * T_of_roots[1][j[2]]; // 2 flops
-		  P[3] = U_of_x[0][0] * T_of_roots[1][j[0]]; // 1 flop
-		  P[4] = U_of_x[0][1] * T_of_roots[1][j[1]]; // 1 flop
-		  P[5] = U_of_x[0][2] * T_of_roots[1][j[2]]; // 1 flop
-		  for (unsigned int o=2; o<ORDER; ++o) {
-		    P[0] += T_of_x[o  ][0] * T_of_roots[o][j[0]]; // 2 flop
-		    P[1] += T_of_x[o  ][1] * T_of_roots[o][j[1]]; // 2 flop
-		    P[2] += T_of_x[o  ][2] * T_of_roots[o][j[2]]; // 2 flop
-		    P[3] += U_of_x[o-1][0] * T_of_roots[o][j[0]]; // 2 flop
-		    P[4] += U_of_x[o-1][1] * T_of_roots[o][j[1]]; // 2 flop
-		    P[5] += U_of_x[o-1][2] * T_of_roots[o][j[2]]; // 2 flop
-		  }
-		  //
-		  potential	+= P[0] * P[1] * P[2] * localExpansion[n]; // 4 flops
-		  forces[0]	+= P[3] * P[1] * P[2] * localExpansion[n]; // 4 flops
-		  forces[1]	+= P[0] * P[4] * P[2] * localExpansion[n]; // 4 flops
-		  forces[2]	+= P[0] * P[1] * P[5] * localExpansion[n]; // 4 flops
-		}
-		//
-		potential *= c1 ; // 1 flop
-		forces[0] *= jacobian[0] *c1; // 2 flops 
-		forces[1] *= jacobian[1] *c1; // 2 flops
-		forces[2] *= jacobian[2] *c1; // 2 flops
+		{
+			FReal f2[4], f4[4], f8[4];
+			for (unsigned int i=0; i<4; ++i) f2[i] = f4[i] = f8[i] = FReal(0.);
+			{
+				for (unsigned int l=1; l<ORDER; ++l) {
+					const FReal w2[3] = {W2[0][l-1], W2[1][l-1], W2[2][l-1]};
+					f2[0] += T_of_x[0][l  ] * w2[0] + T_of_x[1][l] * w2[1] + T_of_x[2][l] * w2[2]; // 6 flops
+					f2[1] += U_of_x[0][l-1] * w2[0]; // 2 flops
+					f2[2] += U_of_x[1][l-1] * w2[1]; // 2 flops
+					f2[3] += U_of_x[2][l-1] * w2[2]; // 2 flops
+					for (unsigned int m=1; m<ORDER; ++m) {
+						const unsigned int w4idx = (m-1)*(ORDER-1)+(l-1);
+						const FReal w4[3] = {W4[0][w4idx], W4[1][w4idx], W4[2][w4idx]};
+						f4[0] +=
+							T_of_x[0][l] * T_of_x[1][m] * w4[0] +
+							T_of_x[0][l] * T_of_x[2][m] * w4[1] +
+							T_of_x[1][l] * T_of_x[2][m] * w4[2]; // 9 flops
+						f4[1] += U_of_x[0][l-1] * T_of_x[1][m]   * w4[0] + U_of_x[0][l-1] * T_of_x[2][m]   * w4[1]; // 6 flops
+						f4[2] += T_of_x[0][l]   * U_of_x[1][m-1] * w4[0] + U_of_x[1][l-1] * T_of_x[2][m]   * w4[2]; // 6 flops
+						f4[3] += T_of_x[0][l]   * U_of_x[2][m-1] * w4[1] + T_of_x[1][l]   * U_of_x[2][m-1] * w4[2]; // 6 flops
+						for (unsigned int n=1; n<ORDER; ++n) {
+							const FReal w8 = W8[(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)];
+							f8[0] += T_of_x[0][l]   * T_of_x[1][m]   * T_of_x[2][n]   * w8;
+							f8[1] += U_of_x[0][l-1] * T_of_x[1][m]   * T_of_x[2][n]   * w8;
+							f8[2] += T_of_x[0][l]   * U_of_x[1][m-1] * T_of_x[2][n]   * w8;
+							f8[3] += T_of_x[0][l]   * T_of_x[1][m]   * U_of_x[2][n-1] * w8;
+						} // ORDER * 4 flops
+					} // ORDER * (9 + ORDER * 4) flops
+				} // ORDER * (ORDER * (9 + ORDER * 4)) flops
+			}
+			potential = (f1 + FReal(2.)*f2[0] + FReal(4.)*f4[0] + FReal(8.)*f8[0]) / nnodes; // 7 flops
+			forces[0] = (     FReal(2.)*f2[1] + FReal(4.)*f4[1] + FReal(8.)*f8[1]) * jacobian[0] / nnodes; // 6 flops
+			forces[1] = (     FReal(2.)*f2[2] + FReal(4.)*f4[2] + FReal(8.)*f8[2]) * jacobian[1] / nnodes; // 6 flops
+			forces[2] = (     FReal(2.)*f2[3] + FReal(4.)*f4[3] + FReal(8.)*f8[3]) * jacobian[2] / nnodes; // 6 flops
+		} // 7 + ORDER * (ORDER * (9 + ORDER * 4)) flops
+
 		// set computed potential
 		iter.data().incPotential(potential); // 1 flop
-		
+
 		// set computed forces
 		iter.data().incForces(forces[0] * iter.data().getPhysicalValue(),
 													forces[1] * iter.data().getPhysicalValue(),
 													forces[2] * iter.data().getPhysicalValue()); // 6 flops
 
-		// increment iterator
+		// increment target iterator
 		iter.gotoNext();
-	}
+	} // N * (7 + ORDER * (ORDER * (9 + ORDER * 4))) flops
 }
 
 
+///**
+// * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ and
+// * \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
+// */
+//template <int ORDER>
+//template <class ContainerClass>
+//inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center,
+//																										const FReal width,
+//																										const FReal *const localExpansion,
+//																										ContainerClass *const localParticles) const
+//{
+//	// setup local to global mapping
+//	const map_glob_loc map(center, width);
+//	FPoint Jacobian;
+//	map.computeJacobian(Jacobian); // 6 flops
+//	const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()}; 
+//	FPoint localPosition;
+//	FReal T_of_x[ORDER][3];
+//	FReal U_of_x[ORDER][3];
+//	FReal P[6];
+//	//
+//	FReal xpx,ypy,zpz ;
+//	FReal c1 = FReal(8.0) / nnodes; // 1 flop
+//	//
+//	typename ContainerClass::BasicIterator iter(*localParticles);
+//	while(iter.hasNotFinished()){
+//			
+//		// map global position to [-1,1]
+//		map(iter.data().getPosition(), localPosition); // 15 flops
+//			
+//		// evaluate chebyshev polynomials of source particle
+//		// T_0(x_i) and T_1(x_i)
+//		xpx = FReal(2.) * localPosition.getX(); // 1 flop
+//		ypy = FReal(2.) * localPosition.getY(); // 1 flop
+//		zpz = FReal(2.) * localPosition.getZ(); // 1 flop
+//		//
+//		T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
+//		T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
+//		T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
+//		U_of_x[0][0] = FReal(1.);	U_of_x[1][0] = xpx;
+//		U_of_x[0][1] = FReal(1.);	U_of_x[1][1] = ypy;
+//		U_of_x[0][2] = FReal(1.);	U_of_x[1][2] = zpz;
+//		for (unsigned int o=2; o<ORDER; ++o) {
+//			T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flops 
+//			T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flops
+//			T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flops
+//			U_of_x[o][0] = xpx * U_of_x[o-1][0] - U_of_x[o-2][0]; // 2 flops
+//			U_of_x[o][1] = ypy * U_of_x[o-1][1] - U_of_x[o-2][1]; // 2 flops
+//			U_of_x[o][2] = zpz * U_of_x[o-1][2] - U_of_x[o-2][2]; // 2 flops
+//		}
+//
+//		// scale, because dT_o/dx = oU_{o-1}
+//		for (unsigned int o=2; o<ORDER; ++o) {
+//			U_of_x[o-1][0] *= FReal(o); // 1 flops
+//			U_of_x[o-1][1] *= FReal(o); // 1 flops
+//			U_of_x[o-1][2] *= FReal(o); // 1 flops
+//		}
+//
+//		// apply P and increment forces
+//		FReal potential = FReal(0.);
+//		FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)};
+//		//
+//		// Optimization:
+//		//   Here we compute 1/2 S and 1/2 P  rather S and F like in the paper
+//		for (unsigned int n=0; n<nnodes; ++n) {
+//		  
+//		  // tensor indices of chebyshev nodes
+//		  const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
+//		  //
+//		  P[0] = FReal(0.5) + T_of_x[1][0] * T_of_roots[1][j[0]]; // 2 flops 
+//		  P[1] = FReal(0.5) + T_of_x[1][1] * T_of_roots[1][j[1]]; // 2 flops
+//		  P[2] = FReal(0.5) + T_of_x[1][2] * T_of_roots[1][j[2]]; // 2 flops
+//		  P[3] = U_of_x[0][0] * T_of_roots[1][j[0]]; // 1 flop
+//		  P[4] = U_of_x[0][1] * T_of_roots[1][j[1]]; // 1 flop
+//		  P[5] = U_of_x[0][2] * T_of_roots[1][j[2]]; // 1 flop
+//		  for (unsigned int o=2; o<ORDER; ++o) {
+//		    P[0] += T_of_x[o  ][0] * T_of_roots[o][j[0]]; // 2 flop
+//		    P[1] += T_of_x[o  ][1] * T_of_roots[o][j[1]]; // 2 flop
+//		    P[2] += T_of_x[o  ][2] * T_of_roots[o][j[2]]; // 2 flop
+//		    P[3] += U_of_x[o-1][0] * T_of_roots[o][j[0]]; // 2 flop
+//		    P[4] += U_of_x[o-1][1] * T_of_roots[o][j[1]]; // 2 flop
+//		    P[5] += U_of_x[o-1][2] * T_of_roots[o][j[2]]; // 2 flop
+//		  }
+//		  //
+//		  potential	+= P[0] * P[1] * P[2] * localExpansion[n]; // 4 flops
+//		  forces[0]	+= P[3] * P[1] * P[2] * localExpansion[n]; // 4 flops
+//		  forces[1]	+= P[0] * P[4] * P[2] * localExpansion[n]; // 4 flops
+//		  forces[2]	+= P[0] * P[1] * P[5] * localExpansion[n]; // 4 flops
+//		}
+//		//
+//		potential *= c1 ; // 1 flop
+//		forces[0] *= jacobian[0] *c1; // 2 flops 
+//		forces[1] *= jacobian[1] *c1; // 2 flops
+//		forces[2] *= jacobian[2] *c1; // 2 flops
+//		// set computed potential
+//		iter.data().incPotential(potential); // 1 flop
+//		
+//		// set computed forces
+//		iter.data().incForces(forces[0] * iter.data().getPhysicalValue(),
+//													forces[1] * iter.data().getPhysicalValue(),
+//													forces[2] * iter.data().getPhysicalValue()); // 6 flops
+//
+//		// increment iterator
+//		iter.gotoNext();
+//	}
+//}
+
+
 #endif
 
 
diff --git a/Tests/Kernels/testChebAlgorithm.cpp b/Tests/Kernels/testChebAlgorithm.cpp
index 08f6d435990970125261c78b849b662f708c1393..ecfe8a71b2695e03fbf23526b4e854f187ce3105 100644
--- a/Tests/Kernels/testChebAlgorithm.cpp
+++ b/Tests/Kernels/testChebAlgorithm.cpp
@@ -78,8 +78,8 @@ int main(int argc, char* argv[])
 	const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2);
 	const unsigned int NbThreads     = FParameters::getValue(argc, argv, "-t", 1);
 
-	const unsigned int ORDER = 7;
-	const FReal epsilon = FReal(1e-7);
+	const unsigned int ORDER = 3;
+	const FReal epsilon = FReal(1e-3);
 
 	// set threads
 	omp_set_num_threads(NbThreads); 
@@ -113,7 +113,8 @@ int main(int argc, char* argv[])
 	OctreeClass tree(TreeHeight, SubTreeHeight, loader.getBoxWidth(), loader.getCenterOfBox());
 
 	// -----------------------------------------------------
-	std::cout << "Creating and inserting " << loader.getNumberOfParticles() << " particles in a octree of height " << TreeHeight
+	std::cout << "Creating and inserting " << loader.getNumberOfParticles()
+						<< " particles in a octree of height " << TreeHeight
 						<< " ..." << std::endl;
 	time.tic();
 	loader.fillTree(tree);
diff --git a/Tests/Utils/testChebSxUCBSy.cpp b/Tests/Utils/testChebSxUCBSy.cpp
index 55271446c859cf6408527101378684c3b1fa58f4..a01cd552199f633326fca82e9fdece33c43fcbd2 100644
--- a/Tests/Utils/testChebSxUCBSy.cpp
+++ b/Tests/Utils/testChebSxUCBSy.cpp
@@ -157,16 +157,35 @@ int main(int argc, char* argv[])
 	FPoint rootsX[nnodes], rootsY[nnodes];
 	FChebTensor<ORDER>::setRoots(cx, width, rootsX);
 	FChebTensor<ORDER>::setRoots(cy, width, rootsY);
-	for (unsigned int i=0; i<nnodes; ++i) {
-		F[i] = FReal(0.);
-		for (unsigned int j=0; j<nnodes; ++j)
-			F[i] += MatrixKernel.evaluate(rootsX[i], rootsY[j]) * W[j];
+
+	{
+		for (unsigned int i=0; i<nnodes; ++i) {
+			F[i] = FReal(0.);
+			for (unsigned int j=0; j<nnodes; ++j)
+				F[i] += MatrixKernel.evaluate(rootsX[i], rootsY[j]) * W[j];
+		}
 	}
 
+//	{
+//		for (unsigned int ix=0; ix<ORDER; ++ix)
+//			for (unsigned int jx=0; jx<ORDER; ++jx)
+//				for (unsigned int kx=0; kx<ORDER; ++kx)  {
+//					const unsigned int idx = kx*ORDER*ORDER + jx*ORDER + ix;
+//					F[idx] = FReal(0.);
+//					for (unsigned int iy=0; iy<ORDER; ++iy)
+//						for (unsigned int jy=0; jy<ORDER; ++jy)
+//							for (unsigned int ky=0; ky<ORDER; ++ky) {
+//								const unsigned int idy = ky*ORDER*ORDER + jy*ORDER + iy;
+//								F[idx] += MatrixKernel.evaluate(rootsX[idx], rootsY[idy]) * W[idy];
+//							}
+//				}
+//	}
+		
+
 	// Interpolate f_i = \sum_m^L S(x_i,\bar x_m) * F_m
 	time.tic();
 	//S.applyL2PTotal(cx, width, F, X.getTargets());
-	S.applyL2P(cx, width, F, X.getTargets());
+	S.applyL2PTotal(cx, width, F, X.getTargets());
 	std::cout << "L2P done in " << time.tacAndElapsed() << "s" << std::endl;
 
 	// -----------------------------------------------------
@@ -205,8 +224,8 @@ int main(int argc, char* argv[])
 	}
 
 	
-//	for (unsigned int i=0; i<1; ++i)
-//		std::cout << f[i] << "\t" << approx_f[i] << "\t" << approx_f[i]/f[i] << std::endl;
+	//for (unsigned int i=0; i<8; ++i)
+	//	std::cout << f[i] << "\t" << approx_f[i] << "\t" << approx_f[i]/f[i] << std::endl;
 	
 
 	std::cout << "\nRelative L2 error  = " << computeL2norm( M, f, approx_f) << std::endl;
diff --git a/Tests/Utils/testChebTensorProduct.cpp b/Tests/Utils/testChebTensorProduct.cpp
index 26e0daabe7adb69d679ffb1b00d1fc067b8aece7..cd2b75dc8a08cf2776c0bba8af52e447b0d6ea65 100644
--- a/Tests/Utils/testChebTensorProduct.cpp
+++ b/Tests/Utils/testChebTensorProduct.cpp
@@ -44,7 +44,7 @@ void applym2m(FReal *const S,	FReal *const w, const unsigned int n,	FReal *const
 { FBlas::gemtm(n, n, n*n, FReal(1.), S, n, w, n, W, n); }
 
 void applyL2L(FReal *const S,	FReal *const F, const unsigned int n,	FReal *const f, const unsigned int N)
-{ FBlas::gemva(n, n, FReal(1.), S, F, f);	}
+{ FBlas::gemva(N, n, FReal(1.), S, F, f);	}
 
 void applyl2l(FReal *const S,	FReal *const F, const unsigned int n,	FReal *const f, const unsigned int N)
 { FBlas::gemm(n, n, n*n, FReal(1.), S, n, F, n, f, n); }
@@ -145,7 +145,8 @@ int main(int argc, char* argv[])
 			m2m_error += w[n] - W0[n];
 		}
 
-		std::cout << "ERROR M2M = " << m2m_error << std::endl;
+		std::cout << "------------------------------------------"
+							<< "\n - M2M: ERROR = " << m2m_error << std::endl;
 
 
 
@@ -163,8 +164,9 @@ int main(int argc, char* argv[])
 			//std::cout << n << "\t" << f[n] << " - " << F0[n] << " = " << f[n]-F0[n] << std::endl;
 			l2l_error += f[n] - F0[n];
 		}
-
-		std::cout << "ERROR L2L = " << l2l_error << std::endl;
+		
+		std::cout << "------------------------------------------"
+							<< "\n - L2L: ERROR = " << l2l_error << std::endl;
 		
 	}
 
@@ -173,10 +175,6 @@ int main(int argc, char* argv[])
 	// P2M /////////////////////////////////////////////////////////////
 	////////////////////////////////////////////////////////////////////
 	////////////////////////////////////////////////////////////////////
-
-	std::cout << "\n--------------------------------------\n"
-						<< "-- P2M -------------------------------\n"
-						<< "--------------------------------------" << std::endl;
 		
 
 	const unsigned int M = 10;
@@ -192,19 +190,14 @@ int main(int argc, char* argv[])
 			points[1][p] = (FReal(rand())/FRandMax - FReal(.5)) * FReal(2.);
 			points[2][p] = (FReal(rand())/FRandMax - FReal(.5)) * FReal(2.);
 			weights[p] = FReal(rand())/FRandMax;
-			//std::cout << points[0][p] << "\t"
-			//					<< points[1][p] << "\t"
-			//					<< points[2][p] << "\t"
-			//					<< weights[p] << std::endl;
 			lp[p].setX(points[0][p]);
 			lp[p].setY(points[1][p]);
 			lp[p].setZ(points[2][p]);
-			//std::cout << lp[p] << std::endl;
 		}
 	} ////////////////////////////////////////////////////////
 
 	
-	{
+	{ // compute exact equivalent source values
 		for(unsigned int i=0; i<nnodes; ++i) equivW[i] = FReal(0.);
 		FReal Snorm[M * nnodes];
 		Interpolator.assembleInterpolator(M, lp, Snorm);
@@ -272,8 +265,8 @@ int main(int argc, char* argv[])
 	FReal F8[   ORDER*ORDER*ORDER];
 
 	{ ////////////////////////////////////////////////////////
-		//for(unsigned int i=0; i<ORDER; ++i)
-		//	F2[0][i] = F2[1][i] = F2[2][i] = FReal(0.);
+		for(unsigned int i=0; i<ORDER; ++i)
+			F2[0][i] = F2[1][i] = F2[2][i] = FReal(0.);
 		for(unsigned int i=0; i<ORDER*ORDER; ++i)
 			F4[0][i] = F4[1][i] = F4[2][i] = FReal(0.);
 		for(unsigned int i=0; i<ORDER*ORDER*ORDER; ++i)
@@ -284,76 +277,25 @@ int main(int argc, char* argv[])
       for (unsigned int j=0; j<ORDER; ++j)
         T_of_y[(o-1)*ORDER + j] = FReal(FChebRoots<ORDER>::T(o, FReal(FChebRoots<ORDER>::roots[j])));
 
-		FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[0], F2[0]);
-		FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[1], F2[1]);
-		FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[2], F2[2]);
-
-		FReal C[ORDER * (ORDER-1)];
-		FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[0], ORDER-1, C, ORDER);
-		FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[0], ORDER);
-		FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[1], ORDER-1, C, ORDER);
-		FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[1], ORDER);
-		FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[2], ORDER-1, C, ORDER);
-		FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[2], ORDER);
-
-		FReal D[ORDER * (ORDER-1) * (ORDER-1)];
-		FBlas::gemm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), T_of_y, ORDER, W8, ORDER-1, D, ORDER);
-		FReal E[(ORDER-1) * (ORDER-1) * ORDER];
-		for (unsigned int i=0; i<ORDER; ++i) {
-			for (unsigned int m=0; m<ORDER-1; ++m) {
-				for (unsigned int n=0; n<ORDER-1; ++n) {
-					const unsigned int a = n*(ORDER-1)*ORDER + m*ORDER + i;
-					const unsigned int b = i*(ORDER-1)*(ORDER-1) + n*(ORDER-1) + m;
-					E[b] = D[a];
-				}
-			}
-		}
-		FReal F[ORDER * (ORDER-1) * ORDER];
-		FBlas::gemm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), T_of_y, ORDER, E, ORDER-1, F, ORDER);
-		FReal G[(ORDER-1) * ORDER * ORDER];
-		for (unsigned int i=0; i<ORDER; ++i) {
-			for (unsigned int j=0; j<ORDER; ++j) {
-				for (unsigned int n=0; n<ORDER-1; ++n) {
-					const unsigned int a = i*(ORDER-1)*ORDER + n*ORDER + j;
-					const unsigned int b = j*ORDER*(ORDER-1) + i*(ORDER-1) + n;
-					G[b] = F[a];
-				}
-			}
-		}
-		
-		FReal H[ORDER * ORDER * ORDER];
-		FBlas::gemm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), T_of_y, ORDER, G, ORDER-1, H, ORDER);
-		for (unsigned int i=0; i<ORDER; ++i) {
-			for (unsigned int j=0; j<ORDER; ++j) {
-				for (unsigned int k=0; k<ORDER; ++k) {
-					const unsigned int a = j*ORDER*ORDER + i*ORDER + k;
-					const unsigned int b = j*ORDER*ORDER + k*ORDER + i;
-					F8[b] = H[a];
-				}
+		for (unsigned int l=0; l<ORDER-1; ++l)
+			for (unsigned int i=0; i<ORDER; ++i) {
+				F2[0][i] += T_of_y[l*ORDER + i] * W2[0][l];
+				F2[1][i] += T_of_y[l*ORDER + i] * W2[1][l];
+				F2[2][i] += T_of_y[l*ORDER + i] * W2[2][l];
+				
+				for (unsigned int m=0; m<ORDER-1; ++m)
+					for (unsigned int j=0; j<ORDER; ++j) {
+						F4[0][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[0][m*(ORDER-1) + l];
+						F4[1][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[1][m*(ORDER-1) + l];
+						F4[2][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[2][m*(ORDER-1) + l];
+							
+						for (unsigned int n=0; n<ORDER-1; ++n)
+							for (unsigned int k=0; k<ORDER; ++k)
+								F8[k*ORDER*ORDER + j*ORDER + i] +=
+									T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * T_of_y[n*ORDER + k]	*
+									W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l];
+								}
 			}
-		}
-
-		
-
-		//for (unsigned int l=0; l<ORDER-1; ++l)
-		//	for (unsigned int i=0; i<ORDER; ++i) {
-		//		//F2[0][i] += T_of_y[l*ORDER + i] * W2[0][l];
-		//		//F2[1][i] += T_of_y[l*ORDER + i] * W2[1][l];
-		//		//F2[2][i] += T_of_y[l*ORDER + i] * W2[2][l];
-		//		
-		//		for (unsigned int m=0; m<ORDER-1; ++m)
-		//			for (unsigned int j=0; j<ORDER; ++j) {
-		//				//F4[0][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[0][m*(ORDER-1) + l];
-		//				//F4[1][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[1][m*(ORDER-1) + l];
-		//				//F4[2][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[2][m*(ORDER-1) + l];
-		//					
-		//				for (unsigned int n=0; n<ORDER-1; ++n)
-		//					for (unsigned int k=0; k<ORDER; ++k)
-		//						//F8[k*ORDER*ORDER + j*ORDER + i] +=
-		//						//	T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * T_of_y[n*ORDER + k]	*
-		//						//	W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l];
-		//						}
-		//	}
 
 	} ////////////////////////////////////////////////////////
 
@@ -374,13 +316,138 @@ int main(int argc, char* argv[])
 		}
 	}
 
-	std::cout << std::endl;
 	FReal p2m_error(0.);
 	for (unsigned int i=0; i<nnodes; ++i) {
 		p2m_error += W[i] - equivW[i];
 		//std::cout << W[i] - equivW[i] << std::endl;
 	}
-	std::cout << "ERROR P2M = " << p2m_error << std::endl;
+
+	std::cout << "------------------------------------------"
+						<< "\n - P2M: ERROR = " << p2m_error << std::endl;
+
+
+
+	////////////////////////////////////////////////////////////////////
+	////////////////////////////////////////////////////////////////////
+	// L2P /////////////////////////////////////////////////////////////
+	////////////////////////////////////////////////////////////////////
+	////////////////////////////////////////////////////////////////////
+
+
+	FReal exactf[M];
+	FReal f[M];
+	for(unsigned int i=0; i<M; ++i) exactf[i] = f[i] = FReal(0.);
+
+
+	{ // compute exact target values
+		FReal Snorm[M * nnodes];
+		Interpolator.assembleInterpolator(M, lp, Snorm);
+		applyL2L(Snorm, W, nnodes, exactf, M);
+		//for (unsigned int i=0; i<M; ++i)
+		//	std::cout << exactf[i] << std::endl;
+	}
+
+	FReal f1;
+	{ // sum over interpolation points
+		FReal T_of_y[ORDER * (ORDER-1)];
+    for (unsigned int o=1; o<ORDER; ++o)
+      for (unsigned int j=0; j<ORDER; ++j)
+        T_of_y[(o-1)*ORDER + j] = FReal(FChebRoots<ORDER>::T(o, FReal(FChebRoots<ORDER>::roots[j])));
+		
+		// set everything to zero
+		f1 = FReal(0.);
+		for(unsigned int i=0; i<ORDER-1; ++i)
+			W2[0][i] = W2[1][i] = W2[2][i] = FReal(0.);
+		for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i)
+			W4[0][i] = W4[1][i] = W4[2][i] = FReal(0.);
+		for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i)
+			W8[i] = FReal(0.);
+
+		{
+			unsigned int nids[nnodes][3];
+			FChebTensor<ORDER>::setNodeIds(nids);
+
+			for (unsigned int idx=0; idx<nnodes; ++idx) {
+				f1 += W[idx];
+				const unsigned int i = nids[idx][0];
+				const unsigned int j = nids[idx][1];
+				const unsigned int k = nids[idx][2];
+				
+				//std::cout << i << "\t" << j << "\t" << k << std::endl;
+
+				for (unsigned int l=0; l<ORDER-1; ++l) {
+					W2[0][l] += T_of_y[l*ORDER+i] * W[idx];
+					W2[1][l] += T_of_y[l*ORDER+j] * W[idx];
+					W2[2][l] += T_of_y[l*ORDER+k] * W[idx];
+					for (unsigned int m=0; m<ORDER-1; ++m) {
+						W4[0][m*(ORDER-1)+l] += T_of_y[l*ORDER+i] * T_of_y[m*ORDER+j] * W[idx];
+						W4[1][m*(ORDER-1)+l] += T_of_y[l*ORDER+i] * T_of_y[m*ORDER+k] * W[idx];
+						W4[2][m*(ORDER-1)+l] += T_of_y[l*ORDER+j] * T_of_y[m*ORDER+k] * W[idx];
+						for (unsigned int n=0; n<ORDER-1; ++n)
+							W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l]
+								+= T_of_y[l*ORDER+i]*T_of_y[m*ORDER+j]*T_of_y[n*ORDER+k] * W[idx];
+					}
+				}
+			}
+
+		}
+
+	}
+
+	{ // sum over targets
+		for (unsigned int p=0; p<M; ++p) {
+
+			FReal T_of_x[3][ORDER];
+			{
+				T_of_x[0][0] = FReal(1.); T_of_x[0][1] = points[0][p];
+				T_of_x[1][0] = FReal(1.); T_of_x[1][1] = points[1][p];
+				T_of_x[2][0] = FReal(1.); T_of_x[2][1] = points[2][p];
+				const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop
+				const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop
+				const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop
+				for (unsigned int j=2; j<ORDER; ++j) {
+					T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops
+					T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops
+					T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops
+				}
+			}
+			
+			FReal f2, f4, f8;
+			{
+				f2 = f4 = f8 = FReal(0.);
+				for (unsigned int l=1; l<ORDER; ++l) {
+					f2 +=
+						T_of_x[0][l] * W2[0][l-1] +
+						T_of_x[1][l] * W2[1][l-1] +
+						T_of_x[2][l] * W2[2][l-1];
+					for (unsigned int m=1; m<ORDER; ++m) {
+						f4 +=
+							T_of_x[0][l] * T_of_x[1][m] * W4[0][(m-1)*(ORDER-1)+(l-1)] +
+							T_of_x[0][l] * T_of_x[2][m] * W4[1][(m-1)*(ORDER-1)+(l-1)] +
+							T_of_x[1][l] * T_of_x[2][m] * W4[2][(m-1)*(ORDER-1)+(l-1)];
+						for (unsigned int n=1; n<ORDER; ++n)
+							f8 +=
+								T_of_x[0][l]*T_of_x[1][m]*T_of_x[2][n] *
+								W8[(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)];
+					}
+				}
+			}
+			f[p] = (f1 + FReal(2.)*f2 + FReal(4.)*f4 + FReal(8.)*f8) / (ORDER*ORDER*ORDER);
+			
+		}
+	}
+
+
+	FReal l2p_error(0.);
+	for (unsigned int i=0; i<M; ++i) {
+		l2p_error += f[i] - exactf[i];
+		//std::cout << exactf[i] << "\t" << f[i] << std::endl;
+	}
+
+	std::cout << "------------------------------------------"
+						<< "\n - L2P: ERROR = " << l2p_error << std::endl;
+
+
 
 
 	return 0;
@@ -388,3 +455,25 @@ int main(int argc, char* argv[])
 
 
 // [--END--]
+
+		//for (unsigned int l=0; l<ORDER-1; ++l)
+		//	for (unsigned int m=0; m<ORDER-1; ++m)
+		//		for (unsigned int n=0; n<ORDER-1; ++n)
+		//			
+		//			for (unsigned int i=0; i<ORDER; ++i)
+		//				for (unsigned int j=0; j<ORDER; ++j)
+		//					for (unsigned int k=0; k<ORDER; ++k) {
+		//
+		//						const unsigned int idx = k*ORDER*ORDER + j*ORDER + i;
+		//
+		//						W2[0][l] += T_of_y[l*ORDER+i] * W[idx];
+		//						W2[1][m] += T_of_y[m*ORDER+j] * W[idx];
+		//						W2[2][n] += T_of_y[n*ORDER+k] * W[idx];
+		//
+		//						W4[0][m*(ORDER-1) + l] += T_of_y[l*ORDER+i] * T_of_y[m*ORDER+j] * W[idx];
+		//						W4[1][n*(ORDER-1) + l] += T_of_y[l*ORDER+i] * T_of_y[n*ORDER+k] * W[idx];
+		//						W4[2][n*(ORDER-1) + m] += T_of_y[m*ORDER+j] * T_of_y[n*ORDER+k] * W[idx];
+		//
+		//						W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l]
+		//							+= T_of_y[l*ORDER+i]*T_of_y[m*ORDER+j]*T_of_y[n*ORDER+k] * W[idx];
+		//					}