diff --git a/Src/Kernels/Chebyshev/FChebInterpolator.hpp b/Src/Kernels/Chebyshev/FChebInterpolator.hpp index 4f7ca0f0df496965c5579c1de53c371bacc3d7ee..e0e0fd7142baf5f763395060f9429e5e0d040b90 100644 --- a/Src/Kernels/Chebyshev/FChebInterpolator.hpp +++ b/Src/Kernels/Chebyshev/FChebInterpolator.hpp @@ -91,6 +91,144 @@ class FChebInterpolator : FNoCopyable } perm2; //////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////// + // needed for L2P + struct IJK2JKI { + enum {size = ORDER * ORDER * ORDER}; + unsigned int ijk[size], jki[size]; + IJK2JKI() { + unsigned int counter = 0; + for (unsigned int i=0; i<ORDER; ++i) { + for (unsigned int j=0; j<ORDER; ++j) { + for (unsigned int k=0; k<ORDER; ++k) { + ijk[counter] = k*ORDER*ORDER + j*ORDER + i; + jki[counter] = i*ORDER*ORDER + k*ORDER + j; + counter++; + } + } + } + } + void permute(const FReal *const IN, FReal *const OUT) const + { for (unsigned int i=0; i<size; ++i) OUT[jki[i]] = IN[ijk[i]]; } + } perm3; + + struct IJK2KIJ { + enum {size = ORDER * ORDER * ORDER}; + unsigned int ijk[size], kij[size]; + IJK2KIJ() { + unsigned int counter = 0; + for (unsigned int i=0; i<ORDER; ++i) { + for (unsigned int j=0; j<ORDER; ++j) { + for (unsigned int k=0; k<ORDER; ++k) { + ijk[counter] = k*ORDER*ORDER + j*ORDER + i; + kij[counter] = j*ORDER*ORDER + i*ORDER + k; + counter++; + } + } + } + } + void permute(const FReal *const IN, FReal *const OUT) const + { for (unsigned int i=0; i<size; ++i) OUT[kij[i]] = IN[ijk[i]]; } + } perm4; + + struct LJK2JKL { + enum {size = (ORDER-1) * ORDER * ORDER}; + unsigned int ljk[size], jkl[size]; + LJK2JKL() { + unsigned int counter = 0; + for (unsigned int l=0; l<ORDER-1; ++l) { + for (unsigned int j=0; j<ORDER; ++j) { + for (unsigned int k=0; k<ORDER; ++k) { + ljk[counter] = k*ORDER*(ORDER-1) + j*(ORDER-1) + l; + jkl[counter] = l*ORDER*ORDER + k*ORDER + j; + counter++; + } + } + } + } + void permute(const FReal *const IN, FReal *const OUT) const + { for (unsigned int i=0; i<size; ++i) OUT[jkl[i]] = IN[ljk[i]]; } + } perm5; + + struct LJK2KLJ { + enum {size = (ORDER-1) * ORDER * ORDER}; + unsigned int ljk[size], klj[size]; + LJK2KLJ() { + unsigned int counter = 0; + for (unsigned int l=0; l<ORDER-1; ++l) { + for (unsigned int j=0; j<ORDER; ++j) { + for (unsigned int k=0; k<ORDER; ++k) { + ljk[counter] = k*ORDER*(ORDER-1) + j*(ORDER-1) + l; + klj[counter] = j*(ORDER-1)*ORDER + l*ORDER + k; + counter++; + } + } + } + } + void permute(const FReal *const IN, FReal *const OUT) const + { for (unsigned int i=0; i<size; ++i) OUT[klj[i]] = IN[ljk[i]]; } + } perm6; + + struct MKI2KIM { + enum {size = (ORDER-1) * ORDER * ORDER}; + unsigned int mki[size], kim[size]; + MKI2KIM() { + unsigned int counter = 0; + for (unsigned int m=0; m<ORDER-1; ++m) { + for (unsigned int k=0; k<ORDER; ++k) { + for (unsigned int i=0; i<ORDER; ++i) { + mki[counter] = i*ORDER*(ORDER-1) + k*(ORDER-1) + m; + kim[counter] = m*ORDER*ORDER + i*ORDER + k; + counter++; + } + } + } + } + void permute(const FReal *const IN, FReal *const OUT) const + { for (unsigned int i=0; i<size; ++i) OUT[kim[i]] = IN[mki[i]]; } + } perm7; + + struct MKL2KLM { + enum {size = (ORDER-1) * ORDER * (ORDER-1)}; + unsigned int mkl[size], klm[size]; + MKL2KLM() { + unsigned int counter = 0; + for (unsigned int m=0; m<ORDER-1; ++m) { + for (unsigned int k=0; k<ORDER; ++k) { + for (unsigned int l=0; l<ORDER-1; ++l) { + mkl[counter] = l*ORDER*(ORDER-1) + k*(ORDER-1) + m; + klm[counter] = m*(ORDER-1)*ORDER + l*ORDER + k; + counter++; + } + } + } + } + void permute(const FReal *const IN, FReal *const OUT) const + { for (unsigned int i=0; i<size; ++i) OUT[klm[i]] = IN[mkl[i]]; } + } perm8; + + struct NLM2LMN { + enum {size = (ORDER-1) * (ORDER-1) * (ORDER-1)}; + unsigned int nlm[size], lmn[size]; + NLM2LMN() { + unsigned int counter = 0; + for (unsigned int n=0; n<ORDER-1; ++n) { + for (unsigned int l=0; l<ORDER-1; ++l) { + for (unsigned int m=0; m<ORDER-1; ++m) { + nlm[counter] = m*(ORDER-1)*(ORDER-1) + l*(ORDER-1) + n; + lmn[counter] = n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l; + counter++; + } + } + } + } + void permute(const FReal *const IN, FReal *const OUT) const + { for (unsigned int i=0; i<size; ++i) OUT[lmn[i]] = IN[nlm[i]]; } + } perm9; + + //////////////////////////////////////////////////////////////////// + + /** * Initialize the child - parent - interpolator, it is basically the matrix @@ -590,9 +728,6 @@ inline void FChebInterpolator<ORDER>::applyP2M(const FPoint& center, - - - /** * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation) */ @@ -603,190 +738,314 @@ inline void FChebInterpolator<ORDER>::applyL2P(const FPoint& center, const FReal *const localExpansion, ContainerClass *const localParticles) const { - // allocate stuff + FReal f1; + FReal W2[3][ ORDER-1]; + FReal W4[3][(ORDER-1)*(ORDER-1)]; + FReal W8[ (ORDER-1)*(ORDER-1)*(ORDER-1)]; + { // sum over interpolation points + f1 = FReal(0.); + for(unsigned int i=0; i<ORDER-1; ++i) W2[0][i] = W2[1][i] = W2[2][i] = FReal(0.); + for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i) W4[0][i] = W4[1][i] = W4[2][i] = FReal(0.); + for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i) W8[i] = FReal(0.); + + for (unsigned int idx=0; idx<nnodes; ++idx) { + const unsigned int i = node_ids[idx][0]; + const unsigned int j = node_ids[idx][1]; + const unsigned int k = node_ids[idx][2]; + + f1 += localExpansion[idx]; // 1 flop + + for (unsigned int l=0; l<ORDER-1; ++l) { + const FReal wx = T[l*ORDER+i] * localExpansion[idx]; // 1 flops + const FReal wy = T[l*ORDER+j] * localExpansion[idx]; // 1 flops + const FReal wz = T[l*ORDER+k] * localExpansion[idx]; // 1 flops + W2[0][l] += wx; // 1 flops + W2[1][l] += wy; // 1 flops + W2[2][l] += wz; // 1 flops + for (unsigned int m=0; m<ORDER-1; ++m) { + const FReal wxy = wx * T[m*ORDER + j]; // 1 flops + const FReal wxz = wx * T[m*ORDER + k]; // 1 flops + const FReal wyz = wy * T[m*ORDER + k]; // 1 flops + W4[0][m*(ORDER-1)+l] += wxy; // 1 flops + W4[1][m*(ORDER-1)+l] += wxz; // 1 flops + W4[2][m*(ORDER-1)+l] += wyz; // 1 flops + for (unsigned int n=0; n<ORDER-1; ++n) { + const FReal wxyz = wxy * T[n*ORDER + k]; // 1 flops + W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l] += wxyz; // 1 flops + } // (ORDER-1) * 2 flops + } // (ORDER-1) * (6 + (ORDER-1)*2) flops + } // (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2)) flops + } // ORDER*ORDER*ORDER * (1 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2))) flops + + } + + + // loop over particles const map_glob_loc map(center, width); FPoint localPosition; - FReal T_of_x[ORDER][3]; - FReal xpx,ypy,zpz ; - FReal S[3],c1; - // - c1 = FReal(8.) / nnodes ; typename ContainerClass::BasicIterator iter(*localParticles); while(iter.hasNotFinished()){ // map global position to [-1,1] - map(iter.data().getPosition(), localPosition); - - // evaluate chebyshev polynomials of source particle: T_o(x_i) - T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX(); - T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY(); - T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ(); - xpx = FReal(2.) * localPosition.getX() ; - ypy = FReal(2.) * localPosition.getY() ; - zpz = FReal(2.) * localPosition.getZ() ; - for (unsigned int o=2; o<ORDER; ++o) { - T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; - T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; - T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; + map(iter.data().getPosition(), localPosition); // 15 flops + + FReal T_of_x[3][ORDER]; + { + T_of_x[0][0] = FReal(1.); T_of_x[0][1] = localPosition.getX(); + T_of_x[1][0] = FReal(1.); T_of_x[1][1] = localPosition.getY(); + T_of_x[2][0] = FReal(1.); T_of_x[2][1] = localPosition.getZ(); + const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop + const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop + const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop + for (unsigned int j=2; j<ORDER; ++j) { + T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops + T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops + T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops + } } // interpolate and increment target value FReal targetValue = iter.data().getPotential(); - for (unsigned int n=0; n<nnodes; ++n) { - const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]}; - S[0] = T_of_x[1][0] * T_of_roots[1][j[0]]; - S[1] = T_of_x[1][1] * T_of_roots[1][j[1]]; - S[2] = T_of_x[1][2] * T_of_roots[1][j[2]]; - for (unsigned int o=2; o<ORDER; ++o) { - S[0] += T_of_x[o][0] * T_of_roots[o][j[0]]; - S[1] += T_of_x[o][1] * T_of_roots[o][j[1]]; - S[2] += T_of_x[o][2] * T_of_roots[o][j[2]]; + { + FReal f2, f4, f8; + { + f2 = f4 = f8 = FReal(0.); + for (unsigned int l=1; l<ORDER; ++l) { + f2 += + T_of_x[0][l] * W2[0][l-1] + + T_of_x[1][l] * W2[1][l-1] + + T_of_x[2][l] * W2[2][l-1]; // 6 flops + for (unsigned int m=1; m<ORDER; ++m) { + f4 += + T_of_x[0][l] * T_of_x[1][m] * W4[0][(m-1)*(ORDER-1)+(l-1)] + + T_of_x[0][l] * T_of_x[2][m] * W4[1][(m-1)*(ORDER-1)+(l-1)] + + T_of_x[1][l] * T_of_x[2][m] * W4[2][(m-1)*(ORDER-1)+(l-1)]; // 9 flops + for (unsigned int n=1; n<ORDER; ++n) { + f8 += + T_of_x[0][l] * T_of_x[1][m] * T_of_x[2][n] * + W8[(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)]; + } // ORDER * 4 flops + } // ORDER * (9 + ORDER * 4) flops + } // ORDER * (ORDER * (9 + ORDER * 4)) flops } - // gather contributions - // S[0] *= FReal(2.); S[0] += FReal(1.); - // S[1] *= FReal(2.); S[1] += FReal(1.); - // S[2] *= FReal(2.); S[2] += FReal(1.); - // targetValue += S[0] * S[1] * S[2] * localExpansion[n]; - S[0] += FReal(0.5); - S[1] += FReal(0.5); - S[2] += FReal(0.5); - // - targetValue += S[0] * S[1] * S[2] * localExpansion[n]; - } - // scale - // targetValue /= nnodes; - targetValue *= c1; + targetValue = (f1 + FReal(2.)*f2 + FReal(4.)*f4 + FReal(8.)*f8) / nnodes; // 7 flops + } // 7 + ORDER * (ORDER * (9 + ORDER * 4)) flops // set potential iter.data().setPotential(targetValue); + // increment target iterator iter.gotoNext(); - } + } // N * (7 + ORDER * (ORDER * (9 + ORDER * 4))) flops } +// FReal F2[3][ORDER-1]; +// FBlas::gemtv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), const_cast<FReal*>(localExpansion), F2[0]); +// for (unsigned int i=1; i<ORDER*ORDER; ++i) +// FBlas::gemtva(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), +// const_cast<FReal*>(localExpansion) + ORDER*i, F2[0]); +// for (unsigned int i=0; i<ORDER-1; ++i) +// std::cout << W2[0][i] << "\t" << F2[0][i] << std::endl; +// FReal F2[(ORDER-1) * ORDER*ORDER]; +// FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, +// const_cast<FReal*>(localExpansion), ORDER, F2, ORDER-1); +// FReal F[ORDER-1]; FBlas::setzero(ORDER-1, F); +// for (unsigned int i=0; i<ORDER-1; ++i) +// for (unsigned int j=0; j<ORDER*ORDER; ++j) F[i] += F2[j*(ORDER-1) + i]; +// for (unsigned int i=0; i<ORDER-1; ++i) +// std::cout << W2[0][i] << "\t" << F[i] << std::endl; +///** +// * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation) +// */ +//template <int ORDER> +//template <class ContainerClass> +//inline void FChebInterpolator<ORDER>::applyL2P(const FPoint& center, +// const FReal width, +// const FReal *const localExpansion, +// ContainerClass *const localParticles) const +//{ +// // allocate stuff +// const map_glob_loc map(center, width); +// FPoint localPosition; +// FReal T_of_x[ORDER][3]; +// FReal xpx,ypy,zpz ; +// FReal S[3],c1; +// // +// c1 = FReal(8.) / nnodes ; +// typename ContainerClass::BasicIterator iter(*localParticles); +// while(iter.hasNotFinished()){ +// +// // map global position to [-1,1] +// map(iter.data().getPosition(), localPosition); // 15 flops +// +// // evaluate chebyshev polynomials of source particle: T_o(x_i) +// T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX(); +// T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY(); +// T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ(); +// xpx = FReal(2.) * localPosition.getX() ; // 1 flop +// ypy = FReal(2.) * localPosition.getY() ; // 1 flop +// zpz = FReal(2.) * localPosition.getZ() ; // 1 flop +// for (unsigned int o=2; o<ORDER; ++o) { +// T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flop +// T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flop +// T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flop +// } // (ORDER-2) * 6 flops +// +// // interpolate and increment target value +// FReal targetValue = iter.data().getPotential(); +// for (unsigned int n=0; n<nnodes; ++n) { +// const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]}; +// S[0] = T_of_x[1][0] * T_of_roots[1][j[0]]; // 1 flops +// S[1] = T_of_x[1][1] * T_of_roots[1][j[1]]; // 1 flops +// S[2] = T_of_x[1][2] * T_of_roots[1][j[2]]; // 1 flops +// for (unsigned int o=2; o<ORDER; ++o) { +// S[0] += T_of_x[o][0] * T_of_roots[o][j[0]]; // 2 flops +// S[1] += T_of_x[o][1] * T_of_roots[o][j[1]]; // 2 flops +// S[2] += T_of_x[o][2] * T_of_roots[o][j[2]]; // 2 flops +// } // (ORDER-2) * 6 flops +// // gather contributions +// S[0] += FReal(0.5); // 1 flops +// S[1] += FReal(0.5); // 1 flops +// S[2] += FReal(0.5); // 1 flops +// targetValue += S[0] * S[1] * S[2] * localExpansion[n]; // 4 flops +// } // ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6) flops +// // scale +// targetValue *= c1; // 1 flops +// +// // set potential +// iter.data().setPotential(targetValue); +// +// // increment target iterator +// iter.gotoNext(); +// } // N * ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6) flops +//} -/** - * Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation) - */ -template <int ORDER> -template <class ContainerClass> -inline void FChebInterpolator<ORDER>::applyL2PGradient(const FPoint& center, - const FReal width, - const FReal *const localExpansion, - ContainerClass *const localParticles) const -{ - // setup local to global mapping - const map_glob_loc map(center, width); - FPoint Jacobian; - map.computeJacobian(Jacobian); - const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()}; - FPoint localPosition; - FReal T_of_x[ORDER][3]; - FReal U_of_x[ORDER][3]; - FReal P[3]; - - typename ContainerClass::BasicIterator iter(*localParticles); - while(iter.hasNotFinished()){ - - // map global position to [-1,1] - map(iter.data().getPosition(), localPosition); - - // evaluate chebyshev polynomials of source particle - // T_0(x_i) and T_1(x_i) - T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX(); - T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY(); - T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ(); - // U_0(x_i) and U_1(x_i) - U_of_x[0][0] = FReal(1.); U_of_x[1][0] = localPosition.getX() * FReal(2.); - U_of_x[0][1] = FReal(1.); U_of_x[1][1] = localPosition.getY() * FReal(2.); - U_of_x[0][2] = FReal(1.); U_of_x[1][2] = localPosition.getZ() * FReal(2.); - for (unsigned int o=2; o<ORDER; ++o) { - // T_o(x_i) - T_of_x[o][0] = FReal(2.)*localPosition.getX()*T_of_x[o-1][0] - T_of_x[o-2][0]; - T_of_x[o][1] = FReal(2.)*localPosition.getY()*T_of_x[o-1][1] - T_of_x[o-2][1]; - T_of_x[o][2] = FReal(2.)*localPosition.getZ()*T_of_x[o-1][2] - T_of_x[o-2][2]; - // U_o(x_i) - U_of_x[o][0] = FReal(2.)*localPosition.getX()*U_of_x[o-1][0] - U_of_x[o-2][0]; - U_of_x[o][1] = FReal(2.)*localPosition.getY()*U_of_x[o-1][1] - U_of_x[o-2][1]; - U_of_x[o][2] = FReal(2.)*localPosition.getZ()*U_of_x[o-1][2] - U_of_x[o-2][2]; - } - - // scale, because dT_o/dx = oU_{o-1} - for (unsigned int o=2; o<ORDER; ++o) { - U_of_x[o-1][0] *= FReal(o); - U_of_x[o-1][1] *= FReal(o); - U_of_x[o-1][2] *= FReal(o); - } - // apply P and increment forces - FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)}; - for (unsigned int n=0; n<nnodes; ++n) { - - // tensor indices of chebyshev nodes - const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]}; - - // f0 component ////////////////////////////////////// - P[0] = U_of_x[0][0] * T_of_roots[1][j[0]]; - P[1] = T_of_x[1][1] * T_of_roots[1][j[1]]; - P[2] = T_of_x[1][2] * T_of_roots[1][j[2]]; - for (unsigned int o=2; o<ORDER; ++o) { - P[0] += U_of_x[o-1][0] * T_of_roots[o][j[0]]; - P[1] += T_of_x[o ][1] * T_of_roots[o][j[1]]; - P[2] += T_of_x[o ][2] * T_of_roots[o][j[2]]; - } - P[0] *= FReal(2.); - P[1] *= FReal(2.); P[1] += FReal(1.); - P[2] *= FReal(2.); P[2] += FReal(1.); - forces[0] += P[0] * P[1] * P[2] * localExpansion[n]; - - // f1 component ////////////////////////////////////// - P[0] = T_of_x[1][0] * T_of_roots[1][j[0]]; - P[1] = U_of_x[0][1] * T_of_roots[1][j[1]]; - P[2] = T_of_x[1][2] * T_of_roots[1][j[2]]; - for (unsigned int o=2; o<ORDER; ++o) { - P[0] += T_of_x[o ][0] * T_of_roots[o][j[0]]; - P[1] += U_of_x[o-1][1] * T_of_roots[o][j[1]]; - P[2] += T_of_x[o ][2] * T_of_roots[o][j[2]]; - } - P[0] *= FReal(2.); P[0] += FReal(1.); - P[1] *= FReal(2.); - P[2] *= FReal(2.); P[2] += FReal(1.); - forces[1] += P[0] * P[1] * P[2] * localExpansion[n]; - - // f2 component ////////////////////////////////////// - P[0] = T_of_x[1][0] * T_of_roots[1][j[0]]; - P[1] = T_of_x[1][1] * T_of_roots[1][j[1]]; - P[2] = U_of_x[0][2] * T_of_roots[1][j[2]]; - for (unsigned int o=2; o<ORDER; ++o) { - P[0] += T_of_x[o ][0] * T_of_roots[o][j[0]]; - P[1] += T_of_x[o ][1] * T_of_roots[o][j[1]]; - P[2] += U_of_x[o-1][2] * T_of_roots[o][j[2]]; - } - P[0] *= FReal(2.); P[0] += FReal(1.); - P[1] *= FReal(2.); P[1] += FReal(1.); - P[2] *= FReal(2.); - forces[2] += P[0] * P[1] * P[2] * localExpansion[n]; - } - // scale forces - forces[0] *= jacobian[0] / nnodes; - forces[1] *= jacobian[1] / nnodes; - forces[2] *= jacobian[2] / nnodes; - // set computed forces - iter.data().incForces(forces[0] * iter.data().getPhysicalValue(), - forces[1] * iter.data().getPhysicalValue(), - forces[2] * iter.data().getPhysicalValue()); - // increment iterator - iter.gotoNext(); - } -} +///** +// * Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation) +// */ +//template <int ORDER> +//template <class ContainerClass> +//inline void FChebInterpolator<ORDER>::applyL2PGradient(const FPoint& center, +// const FReal width, +// const FReal *const localExpansion, +// ContainerClass *const localParticles) const +//{ +// // setup local to global mapping +// const map_glob_loc map(center, width); +// FPoint Jacobian; +// map.computeJacobian(Jacobian); +// const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()}; +// FPoint localPosition; +// FReal T_of_x[ORDER][3]; +// FReal U_of_x[ORDER][3]; +// FReal P[3]; +// +// typename ContainerClass::BasicIterator iter(*localParticles); +// while(iter.hasNotFinished()){ +// +// // map global position to [-1,1] +// map(iter.data().getPosition(), localPosition); +// +// // evaluate chebyshev polynomials of source particle +// // T_0(x_i) and T_1(x_i) +// T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX(); +// T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY(); +// T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ(); +// // U_0(x_i) and U_1(x_i) +// U_of_x[0][0] = FReal(1.); U_of_x[1][0] = localPosition.getX() * FReal(2.); +// U_of_x[0][1] = FReal(1.); U_of_x[1][1] = localPosition.getY() * FReal(2.); +// U_of_x[0][2] = FReal(1.); U_of_x[1][2] = localPosition.getZ() * FReal(2.); +// for (unsigned int o=2; o<ORDER; ++o) { +// // T_o(x_i) +// T_of_x[o][0] = FReal(2.)*localPosition.getX()*T_of_x[o-1][0] - T_of_x[o-2][0]; +// T_of_x[o][1] = FReal(2.)*localPosition.getY()*T_of_x[o-1][1] - T_of_x[o-2][1]; +// T_of_x[o][2] = FReal(2.)*localPosition.getZ()*T_of_x[o-1][2] - T_of_x[o-2][2]; +// // U_o(x_i) +// U_of_x[o][0] = FReal(2.)*localPosition.getX()*U_of_x[o-1][0] - U_of_x[o-2][0]; +// U_of_x[o][1] = FReal(2.)*localPosition.getY()*U_of_x[o-1][1] - U_of_x[o-2][1]; +// U_of_x[o][2] = FReal(2.)*localPosition.getZ()*U_of_x[o-1][2] - U_of_x[o-2][2]; +// } +// +// // scale, because dT_o/dx = oU_{o-1} +// for (unsigned int o=2; o<ORDER; ++o) { +// U_of_x[o-1][0] *= FReal(o); +// U_of_x[o-1][1] *= FReal(o); +// U_of_x[o-1][2] *= FReal(o); +// } +// +// // apply P and increment forces +// FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)}; +// for (unsigned int n=0; n<nnodes; ++n) { +// +// // tensor indices of chebyshev nodes +// const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]}; +// +// // f0 component ////////////////////////////////////// +// P[0] = U_of_x[0][0] * T_of_roots[1][j[0]]; +// P[1] = T_of_x[1][1] * T_of_roots[1][j[1]]; +// P[2] = T_of_x[1][2] * T_of_roots[1][j[2]]; +// for (unsigned int o=2; o<ORDER; ++o) { +// P[0] += U_of_x[o-1][0] * T_of_roots[o][j[0]]; +// P[1] += T_of_x[o ][1] * T_of_roots[o][j[1]]; +// P[2] += T_of_x[o ][2] * T_of_roots[o][j[2]]; +// } +// P[0] *= FReal(2.); +// P[1] *= FReal(2.); P[1] += FReal(1.); +// P[2] *= FReal(2.); P[2] += FReal(1.); +// forces[0] += P[0] * P[1] * P[2] * localExpansion[n]; +// +// // f1 component ////////////////////////////////////// +// P[0] = T_of_x[1][0] * T_of_roots[1][j[0]]; +// P[1] = U_of_x[0][1] * T_of_roots[1][j[1]]; +// P[2] = T_of_x[1][2] * T_of_roots[1][j[2]]; +// for (unsigned int o=2; o<ORDER; ++o) { +// P[0] += T_of_x[o ][0] * T_of_roots[o][j[0]]; +// P[1] += U_of_x[o-1][1] * T_of_roots[o][j[1]]; +// P[2] += T_of_x[o ][2] * T_of_roots[o][j[2]]; +// } +// P[0] *= FReal(2.); P[0] += FReal(1.); +// P[1] *= FReal(2.); +// P[2] *= FReal(2.); P[2] += FReal(1.); +// forces[1] += P[0] * P[1] * P[2] * localExpansion[n]; +// +// // f2 component ////////////////////////////////////// +// P[0] = T_of_x[1][0] * T_of_roots[1][j[0]]; +// P[1] = T_of_x[1][1] * T_of_roots[1][j[1]]; +// P[2] = U_of_x[0][2] * T_of_roots[1][j[2]]; +// for (unsigned int o=2; o<ORDER; ++o) { +// P[0] += T_of_x[o ][0] * T_of_roots[o][j[0]]; +// P[1] += T_of_x[o ][1] * T_of_roots[o][j[1]]; +// P[2] += U_of_x[o-1][2] * T_of_roots[o][j[2]]; +// } +// P[0] *= FReal(2.); P[0] += FReal(1.); +// P[1] *= FReal(2.); P[1] += FReal(1.); +// P[2] *= FReal(2.); +// forces[2] += P[0] * P[1] * P[2] * localExpansion[n]; +// } +// +// // scale forces +// forces[0] *= jacobian[0] / nnodes; +// forces[1] *= jacobian[1] / nnodes; +// forces[2] *= jacobian[2] / nnodes; +// +// // set computed forces +// iter.data().incForces(forces[0] * iter.data().getPhysicalValue(), +// forces[1] * iter.data().getPhysicalValue(), +// forces[2] * iter.data().getPhysicalValue()); +// +// // increment iterator +// iter.gotoNext(); +// } +//} /** @@ -800,116 +1059,312 @@ inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center, const FReal *const localExpansion, ContainerClass *const localParticles) const { - // setup local to global mapping + FReal f1; + FReal W2[3][ ORDER-1]; + FReal W4[3][(ORDER-1)*(ORDER-1)]; + FReal W8[ (ORDER-1)*(ORDER-1)*(ORDER-1)]; + + //{ // sum over interpolation points + // f1 = FReal(0.); + // for(unsigned int i=0; i<ORDER-1; ++i) W2[0][i] = W2[1][i] = W2[2][i] = FReal(0.); + // for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i) W4[0][i] = W4[1][i] = W4[2][i] = FReal(0.); + // for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i) W8[i] = FReal(0.); + // + // for (unsigned int idx=0; idx<nnodes; ++idx) { + // const unsigned int i = node_ids[idx][0]; + // const unsigned int j = node_ids[idx][1]; + // const unsigned int k = node_ids[idx][2]; + // + // f1 += localExpansion[idx]; // 1 flop + // + // for (unsigned int l=0; l<ORDER-1; ++l) { + // const FReal wx = T[l*ORDER+i] * localExpansion[idx]; // 1 flops + // const FReal wy = T[l*ORDER+j] * localExpansion[idx]; // 1 flops + // const FReal wz = T[l*ORDER+k] * localExpansion[idx]; // 1 flops + // W2[0][l] += wx; // 1 flops + // W2[1][l] += wy; // 1 flops + // W2[2][l] += wz; // 1 flops + // for (unsigned int m=0; m<ORDER-1; ++m) { + // const FReal wxy = wx * T[m*ORDER + j]; // 1 flops + // const FReal wxz = wx * T[m*ORDER + k]; // 1 flops + // const FReal wyz = wy * T[m*ORDER + k]; // 1 flops + // W4[0][m*(ORDER-1)+l] += wxy; // 1 flops + // W4[1][m*(ORDER-1)+l] += wxz; // 1 flops + // W4[2][m*(ORDER-1)+l] += wyz; // 1 flops + // for (unsigned int n=0; n<ORDER-1; ++n) { + // const FReal wxyz = wxy * T[n*ORDER + k]; // 1 flops + // W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l] += wxyz; // 1 flops + // } // (ORDER-1) * 2 flops + // } // (ORDER-1) * (6 + (ORDER-1)*2) flops + // } // (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2)) flops + // + // } // ORDER*ORDER*ORDER * (1 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2))) flops + // + //} + + { + // for W2 + FReal lE[nnodes]; + FReal F2[(ORDER-1) * ORDER*ORDER]; + // for W4 + FReal F4[ORDER * ORDER*(ORDER-1)]; + FReal G4[(ORDER-1) * ORDER*(ORDER-1)]; + // for W8 + FReal G8[ORDER * (ORDER-1)*(ORDER-1)]; + + // sum local expansions + f1 = FReal(0.); + for (unsigned int idx=0; idx<nnodes; ++idx) f1 += localExpansion[idx]; // 1 flop + + ////////////////////////////////////////////////////////////////// + // IMPORTANT: NOT CHANGE ORDER OF COMPUTATIONS!!! //////////////// + ////////////////////////////////////////////////////////////////// + + // W2[0] ///////////////// + FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, + const_cast<FReal*>(localExpansion), ORDER, F2, ORDER-1); + for (unsigned int l=0; l<ORDER-1; ++l) { W2[0][l] = F2[l]; + for (unsigned int j=1; j<ORDER*ORDER; ++j) W2[0][l] += F2[j*(ORDER-1) + l]; } + // W4[0] ///////////////// + perm5.permute(F2, F4); + FBlas::gemtm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1); + for (unsigned int l=0; l<ORDER-1; ++l) + for (unsigned int m=0; m<ORDER-1; ++m) { W4[0][m*(ORDER-1)+l] = G4[l*ORDER*(ORDER-1) + m]; + for (unsigned int k=1; k<ORDER; ++k) W4[0][m*(ORDER-1)+l] += G4[l*ORDER*(ORDER-1) + k*(ORDER-1) + m]; } + // W8 //////////////////// + perm8.permute(G4, G8); + FReal F8[(ORDER-1)*(ORDER-1)*(ORDER-1)]; + FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, G8, ORDER, F8, ORDER-1); + perm9.permute(F8, W8); + // W4[1] ///////////////// + perm6.permute(F2, F4); + FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1); + for (unsigned int l=0; l<ORDER-1; ++l) + for (unsigned int n=0; n<ORDER-1; ++n) { W4[1][n*(ORDER-1)+l] = G4[l*(ORDER-1) + n]; + for (unsigned int j=1; j<ORDER; ++j) W4[1][n*(ORDER-1)+l] += G4[j*(ORDER-1)*(ORDER-1) + l*(ORDER-1) + n]; } + // W2[1] ///////////////// + perm3.permute(localExpansion, lE); + FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, lE, ORDER, F2, ORDER-1); + for (unsigned int i=0; i<ORDER-1; ++i) { W2[1][i] = F2[i]; + for (unsigned int j=1; j<ORDER*ORDER; ++j) W2[1][i] += F2[j*(ORDER-1) + i]; } + // W4[2] ///////////////// + perm7.permute(F2, F4); + FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1); + for (unsigned int m=0; m<ORDER-1; ++m) + for (unsigned int n=0; n<ORDER-1; ++n) { W4[2][n*(ORDER-1)+m] = G4[m*ORDER*(ORDER-1) + n]; + for (unsigned int i=1; i<ORDER; ++i) W4[2][n*(ORDER-1)+m] += G4[m*ORDER*(ORDER-1) + i*(ORDER-1) + n]; } + // W2[2] ///////////////// + perm4.permute(localExpansion, lE); + FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, lE, ORDER, F2, ORDER-1); + for (unsigned int i=0; i<ORDER-1; ++i) { W2[2][i] = F2[i]; + for (unsigned int j=1; j<ORDER*ORDER; ++j) W2[2][i] += F2[j*(ORDER-1) + i]; } + } + + + // loop over particles const map_glob_loc map(center, width); FPoint Jacobian; map.computeJacobian(Jacobian); // 6 flops const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()}; FPoint localPosition; - FReal T_of_x[ORDER][3]; - FReal U_of_x[ORDER][3]; - FReal P[6]; - // - FReal xpx,ypy,zpz ; - FReal c1 = FReal(8.0) / nnodes; // 1 flop - // + typename ContainerClass::BasicIterator iter(*localParticles); while(iter.hasNotFinished()){ // map global position to [-1,1] map(iter.data().getPosition(), localPosition); // 15 flops + + FReal U_of_x[3][ORDER]; + FReal T_of_x[3][ORDER]; + { + T_of_x[0][0] = FReal(1.); T_of_x[0][1] = localPosition.getX(); + T_of_x[1][0] = FReal(1.); T_of_x[1][1] = localPosition.getY(); + T_of_x[2][0] = FReal(1.); T_of_x[2][1] = localPosition.getZ(); + const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop + const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop + const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop + U_of_x[0][0] = FReal(1.); U_of_x[0][1] = x2; + U_of_x[1][0] = FReal(1.); U_of_x[1][1] = y2; + U_of_x[2][0] = FReal(1.); U_of_x[2][1] = z2; + for (unsigned int j=2; j<ORDER; ++j) { + T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops + T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops + T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops + U_of_x[0][j] = x2 * U_of_x[0][j-1] - U_of_x[0][j-2]; // 2 flops + U_of_x[1][j] = y2 * U_of_x[1][j-1] - U_of_x[1][j-2]; // 2 flops + U_of_x[2][j] = z2 * U_of_x[2][j-1] - U_of_x[2][j-2]; // 2 flops + } - // evaluate chebyshev polynomials of source particle - // T_0(x_i) and T_1(x_i) - xpx = FReal(2.) * localPosition.getX(); // 1 flop - ypy = FReal(2.) * localPosition.getY(); // 1 flop - zpz = FReal(2.) * localPosition.getZ(); // 1 flop - // - T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX(); - T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY(); - T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ(); - // U_0(x_i) and U_1(x_i) - // U_of_x[0][0] = FReal(1.); U_of_x[1][0] = localPosition.getX() * FReal(2.); - // U_of_x[0][1] = FReal(1.); U_of_x[1][1] = localPosition.getY() * FReal(2.); - // U_of_x[0][2] = FReal(1.); U_of_x[1][2] = localPosition.getZ() * FReal(2.); - U_of_x[0][0] = FReal(1.); U_of_x[1][0] = xpx; - U_of_x[0][1] = FReal(1.); U_of_x[1][1] = ypy; - U_of_x[0][2] = FReal(1.); U_of_x[1][2] = zpz; - for (unsigned int o=2; o<ORDER; ++o) { - // T_o(x_i) - // T_of_x[o][0] = FReal(2.)*localPosition.getX()*T_of_x[o-1][0] - T_of_x[o-2][0]; - // T_of_x[o][1] = FReal(2.)*localPosition.getY()*T_of_x[o-1][1] - T_of_x[o-2][1]; - // T_of_x[o][2] = FReal(2.)*localPosition.getZ()*T_of_x[o-1][2] - T_of_x[o-2][2]; - // // U_o(x_i) - // U_of_x[o][0] = FReal(2.)*localPosition.getX()*U_of_x[o-1][0] - U_of_x[o-2][0]; - // U_of_x[o][1] = FReal(2.)*localPosition.getY()*U_of_x[o-1][1] - U_of_x[o-2][1]; - // U_of_x[o][2] = FReal(2.)*localPosition.getZ()*U_of_x[o-1][2] - U_of_x[o-2][2]; - T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flops - T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flops - T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flops - // U_o(x_i) - U_of_x[o][0] = xpx * U_of_x[o-1][0] - U_of_x[o-2][0]; // 2 flops - U_of_x[o][1] = ypy * U_of_x[o-1][1] - U_of_x[o-2][1]; // 2 flops - U_of_x[o][2] = zpz * U_of_x[o-1][2] - U_of_x[o-2][2]; // 2 flops - } + // scale, because dT_j/dx = jU_{j-1} + for (unsigned int j=2; j<ORDER; ++j) { + U_of_x[0][j-1] *= FReal(j); // 1 flops + U_of_x[1][j-1] *= FReal(j); // 1 flops + U_of_x[2][j-1] *= FReal(j); // 1 flops + } - // scale, because dT_o/dx = oU_{o-1} - for (unsigned int o=2; o<ORDER; ++o) { - U_of_x[o-1][0] *= FReal(o); // 1 flops - U_of_x[o-1][1] *= FReal(o); // 1 flops - U_of_x[o-1][2] *= FReal(o); // 1 flops } // apply P and increment forces FReal potential = FReal(0.); FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)}; - // - // Optimization: - // Here we compute 1/2 S and 1/2 P rather S and F like in the paper - for (unsigned int n=0; n<nnodes; ++n) { - - // tensor indices of chebyshev nodes - const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]}; - // - P[0] = FReal(0.5) + T_of_x[1][0] * T_of_roots[1][j[0]]; // 2 flops - P[1] = FReal(0.5) + T_of_x[1][1] * T_of_roots[1][j[1]]; // 2 flops - P[2] = FReal(0.5) + T_of_x[1][2] * T_of_roots[1][j[2]]; // 2 flops - P[3] = U_of_x[0][0] * T_of_roots[1][j[0]]; // 1 flop - P[4] = U_of_x[0][1] * T_of_roots[1][j[1]]; // 1 flop - P[5] = U_of_x[0][2] * T_of_roots[1][j[2]]; // 1 flop - for (unsigned int o=2; o<ORDER; ++o) { - P[0] += T_of_x[o ][0] * T_of_roots[o][j[0]]; // 2 flop - P[1] += T_of_x[o ][1] * T_of_roots[o][j[1]]; // 2 flop - P[2] += T_of_x[o ][2] * T_of_roots[o][j[2]]; // 2 flop - P[3] += U_of_x[o-1][0] * T_of_roots[o][j[0]]; // 2 flop - P[4] += U_of_x[o-1][1] * T_of_roots[o][j[1]]; // 2 flop - P[5] += U_of_x[o-1][2] * T_of_roots[o][j[2]]; // 2 flop - } - // - potential += P[0] * P[1] * P[2] * localExpansion[n]; // 4 flops - forces[0] += P[3] * P[1] * P[2] * localExpansion[n]; // 4 flops - forces[1] += P[0] * P[4] * P[2] * localExpansion[n]; // 4 flops - forces[2] += P[0] * P[1] * P[5] * localExpansion[n]; // 4 flops - } - // - potential *= c1 ; // 1 flop - forces[0] *= jacobian[0] *c1; // 2 flops - forces[1] *= jacobian[1] *c1; // 2 flops - forces[2] *= jacobian[2] *c1; // 2 flops + { + FReal f2[4], f4[4], f8[4]; + for (unsigned int i=0; i<4; ++i) f2[i] = f4[i] = f8[i] = FReal(0.); + { + for (unsigned int l=1; l<ORDER; ++l) { + const FReal w2[3] = {W2[0][l-1], W2[1][l-1], W2[2][l-1]}; + f2[0] += T_of_x[0][l ] * w2[0] + T_of_x[1][l] * w2[1] + T_of_x[2][l] * w2[2]; // 6 flops + f2[1] += U_of_x[0][l-1] * w2[0]; // 2 flops + f2[2] += U_of_x[1][l-1] * w2[1]; // 2 flops + f2[3] += U_of_x[2][l-1] * w2[2]; // 2 flops + for (unsigned int m=1; m<ORDER; ++m) { + const unsigned int w4idx = (m-1)*(ORDER-1)+(l-1); + const FReal w4[3] = {W4[0][w4idx], W4[1][w4idx], W4[2][w4idx]}; + f4[0] += + T_of_x[0][l] * T_of_x[1][m] * w4[0] + + T_of_x[0][l] * T_of_x[2][m] * w4[1] + + T_of_x[1][l] * T_of_x[2][m] * w4[2]; // 9 flops + f4[1] += U_of_x[0][l-1] * T_of_x[1][m] * w4[0] + U_of_x[0][l-1] * T_of_x[2][m] * w4[1]; // 6 flops + f4[2] += T_of_x[0][l] * U_of_x[1][m-1] * w4[0] + U_of_x[1][l-1] * T_of_x[2][m] * w4[2]; // 6 flops + f4[3] += T_of_x[0][l] * U_of_x[2][m-1] * w4[1] + T_of_x[1][l] * U_of_x[2][m-1] * w4[2]; // 6 flops + for (unsigned int n=1; n<ORDER; ++n) { + const FReal w8 = W8[(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)]; + f8[0] += T_of_x[0][l] * T_of_x[1][m] * T_of_x[2][n] * w8; + f8[1] += U_of_x[0][l-1] * T_of_x[1][m] * T_of_x[2][n] * w8; + f8[2] += T_of_x[0][l] * U_of_x[1][m-1] * T_of_x[2][n] * w8; + f8[3] += T_of_x[0][l] * T_of_x[1][m] * U_of_x[2][n-1] * w8; + } // ORDER * 4 flops + } // ORDER * (9 + ORDER * 4) flops + } // ORDER * (ORDER * (9 + ORDER * 4)) flops + } + potential = (f1 + FReal(2.)*f2[0] + FReal(4.)*f4[0] + FReal(8.)*f8[0]) / nnodes; // 7 flops + forces[0] = ( FReal(2.)*f2[1] + FReal(4.)*f4[1] + FReal(8.)*f8[1]) * jacobian[0] / nnodes; // 6 flops + forces[1] = ( FReal(2.)*f2[2] + FReal(4.)*f4[2] + FReal(8.)*f8[2]) * jacobian[1] / nnodes; // 6 flops + forces[2] = ( FReal(2.)*f2[3] + FReal(4.)*f4[3] + FReal(8.)*f8[3]) * jacobian[2] / nnodes; // 6 flops + } // 7 + ORDER * (ORDER * (9 + ORDER * 4)) flops + // set computed potential iter.data().incPotential(potential); // 1 flop - + // set computed forces iter.data().incForces(forces[0] * iter.data().getPhysicalValue(), forces[1] * iter.data().getPhysicalValue(), forces[2] * iter.data().getPhysicalValue()); // 6 flops - // increment iterator + // increment target iterator iter.gotoNext(); - } + } // N * (7 + ORDER * (ORDER * (9 + ORDER * 4))) flops } +///** +// * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ and +// * \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation) +// */ +//template <int ORDER> +//template <class ContainerClass> +//inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center, +// const FReal width, +// const FReal *const localExpansion, +// ContainerClass *const localParticles) const +//{ +// // setup local to global mapping +// const map_glob_loc map(center, width); +// FPoint Jacobian; +// map.computeJacobian(Jacobian); // 6 flops +// const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()}; +// FPoint localPosition; +// FReal T_of_x[ORDER][3]; +// FReal U_of_x[ORDER][3]; +// FReal P[6]; +// // +// FReal xpx,ypy,zpz ; +// FReal c1 = FReal(8.0) / nnodes; // 1 flop +// // +// typename ContainerClass::BasicIterator iter(*localParticles); +// while(iter.hasNotFinished()){ +// +// // map global position to [-1,1] +// map(iter.data().getPosition(), localPosition); // 15 flops +// +// // evaluate chebyshev polynomials of source particle +// // T_0(x_i) and T_1(x_i) +// xpx = FReal(2.) * localPosition.getX(); // 1 flop +// ypy = FReal(2.) * localPosition.getY(); // 1 flop +// zpz = FReal(2.) * localPosition.getZ(); // 1 flop +// // +// T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX(); +// T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY(); +// T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ(); +// U_of_x[0][0] = FReal(1.); U_of_x[1][0] = xpx; +// U_of_x[0][1] = FReal(1.); U_of_x[1][1] = ypy; +// U_of_x[0][2] = FReal(1.); U_of_x[1][2] = zpz; +// for (unsigned int o=2; o<ORDER; ++o) { +// T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flops +// T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flops +// T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flops +// U_of_x[o][0] = xpx * U_of_x[o-1][0] - U_of_x[o-2][0]; // 2 flops +// U_of_x[o][1] = ypy * U_of_x[o-1][1] - U_of_x[o-2][1]; // 2 flops +// U_of_x[o][2] = zpz * U_of_x[o-1][2] - U_of_x[o-2][2]; // 2 flops +// } +// +// // scale, because dT_o/dx = oU_{o-1} +// for (unsigned int o=2; o<ORDER; ++o) { +// U_of_x[o-1][0] *= FReal(o); // 1 flops +// U_of_x[o-1][1] *= FReal(o); // 1 flops +// U_of_x[o-1][2] *= FReal(o); // 1 flops +// } +// +// // apply P and increment forces +// FReal potential = FReal(0.); +// FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)}; +// // +// // Optimization: +// // Here we compute 1/2 S and 1/2 P rather S and F like in the paper +// for (unsigned int n=0; n<nnodes; ++n) { +// +// // tensor indices of chebyshev nodes +// const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]}; +// // +// P[0] = FReal(0.5) + T_of_x[1][0] * T_of_roots[1][j[0]]; // 2 flops +// P[1] = FReal(0.5) + T_of_x[1][1] * T_of_roots[1][j[1]]; // 2 flops +// P[2] = FReal(0.5) + T_of_x[1][2] * T_of_roots[1][j[2]]; // 2 flops +// P[3] = U_of_x[0][0] * T_of_roots[1][j[0]]; // 1 flop +// P[4] = U_of_x[0][1] * T_of_roots[1][j[1]]; // 1 flop +// P[5] = U_of_x[0][2] * T_of_roots[1][j[2]]; // 1 flop +// for (unsigned int o=2; o<ORDER; ++o) { +// P[0] += T_of_x[o ][0] * T_of_roots[o][j[0]]; // 2 flop +// P[1] += T_of_x[o ][1] * T_of_roots[o][j[1]]; // 2 flop +// P[2] += T_of_x[o ][2] * T_of_roots[o][j[2]]; // 2 flop +// P[3] += U_of_x[o-1][0] * T_of_roots[o][j[0]]; // 2 flop +// P[4] += U_of_x[o-1][1] * T_of_roots[o][j[1]]; // 2 flop +// P[5] += U_of_x[o-1][2] * T_of_roots[o][j[2]]; // 2 flop +// } +// // +// potential += P[0] * P[1] * P[2] * localExpansion[n]; // 4 flops +// forces[0] += P[3] * P[1] * P[2] * localExpansion[n]; // 4 flops +// forces[1] += P[0] * P[4] * P[2] * localExpansion[n]; // 4 flops +// forces[2] += P[0] * P[1] * P[5] * localExpansion[n]; // 4 flops +// } +// // +// potential *= c1 ; // 1 flop +// forces[0] *= jacobian[0] *c1; // 2 flops +// forces[1] *= jacobian[1] *c1; // 2 flops +// forces[2] *= jacobian[2] *c1; // 2 flops +// // set computed potential +// iter.data().incPotential(potential); // 1 flop +// +// // set computed forces +// iter.data().incForces(forces[0] * iter.data().getPhysicalValue(), +// forces[1] * iter.data().getPhysicalValue(), +// forces[2] * iter.data().getPhysicalValue()); // 6 flops +// +// // increment iterator +// iter.gotoNext(); +// } +//} + + #endif diff --git a/Tests/Kernels/testChebAlgorithm.cpp b/Tests/Kernels/testChebAlgorithm.cpp index 08f6d435990970125261c78b849b662f708c1393..ecfe8a71b2695e03fbf23526b4e854f187ce3105 100644 --- a/Tests/Kernels/testChebAlgorithm.cpp +++ b/Tests/Kernels/testChebAlgorithm.cpp @@ -78,8 +78,8 @@ int main(int argc, char* argv[]) const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2); const unsigned int NbThreads = FParameters::getValue(argc, argv, "-t", 1); - const unsigned int ORDER = 7; - const FReal epsilon = FReal(1e-7); + const unsigned int ORDER = 3; + const FReal epsilon = FReal(1e-3); // set threads omp_set_num_threads(NbThreads); @@ -113,7 +113,8 @@ int main(int argc, char* argv[]) OctreeClass tree(TreeHeight, SubTreeHeight, loader.getBoxWidth(), loader.getCenterOfBox()); // ----------------------------------------------------- - std::cout << "Creating and inserting " << loader.getNumberOfParticles() << " particles in a octree of height " << TreeHeight + std::cout << "Creating and inserting " << loader.getNumberOfParticles() + << " particles in a octree of height " << TreeHeight << " ..." << std::endl; time.tic(); loader.fillTree(tree); diff --git a/Tests/Utils/testChebSxUCBSy.cpp b/Tests/Utils/testChebSxUCBSy.cpp index 55271446c859cf6408527101378684c3b1fa58f4..a01cd552199f633326fca82e9fdece33c43fcbd2 100644 --- a/Tests/Utils/testChebSxUCBSy.cpp +++ b/Tests/Utils/testChebSxUCBSy.cpp @@ -157,16 +157,35 @@ int main(int argc, char* argv[]) FPoint rootsX[nnodes], rootsY[nnodes]; FChebTensor<ORDER>::setRoots(cx, width, rootsX); FChebTensor<ORDER>::setRoots(cy, width, rootsY); - for (unsigned int i=0; i<nnodes; ++i) { - F[i] = FReal(0.); - for (unsigned int j=0; j<nnodes; ++j) - F[i] += MatrixKernel.evaluate(rootsX[i], rootsY[j]) * W[j]; + + { + for (unsigned int i=0; i<nnodes; ++i) { + F[i] = FReal(0.); + for (unsigned int j=0; j<nnodes; ++j) + F[i] += MatrixKernel.evaluate(rootsX[i], rootsY[j]) * W[j]; + } } +// { +// for (unsigned int ix=0; ix<ORDER; ++ix) +// for (unsigned int jx=0; jx<ORDER; ++jx) +// for (unsigned int kx=0; kx<ORDER; ++kx) { +// const unsigned int idx = kx*ORDER*ORDER + jx*ORDER + ix; +// F[idx] = FReal(0.); +// for (unsigned int iy=0; iy<ORDER; ++iy) +// for (unsigned int jy=0; jy<ORDER; ++jy) +// for (unsigned int ky=0; ky<ORDER; ++ky) { +// const unsigned int idy = ky*ORDER*ORDER + jy*ORDER + iy; +// F[idx] += MatrixKernel.evaluate(rootsX[idx], rootsY[idy]) * W[idy]; +// } +// } +// } + + // Interpolate f_i = \sum_m^L S(x_i,\bar x_m) * F_m time.tic(); //S.applyL2PTotal(cx, width, F, X.getTargets()); - S.applyL2P(cx, width, F, X.getTargets()); + S.applyL2PTotal(cx, width, F, X.getTargets()); std::cout << "L2P done in " << time.tacAndElapsed() << "s" << std::endl; // ----------------------------------------------------- @@ -205,8 +224,8 @@ int main(int argc, char* argv[]) } -// for (unsigned int i=0; i<1; ++i) -// std::cout << f[i] << "\t" << approx_f[i] << "\t" << approx_f[i]/f[i] << std::endl; + //for (unsigned int i=0; i<8; ++i) + // std::cout << f[i] << "\t" << approx_f[i] << "\t" << approx_f[i]/f[i] << std::endl; std::cout << "\nRelative L2 error = " << computeL2norm( M, f, approx_f) << std::endl; diff --git a/Tests/Utils/testChebTensorProduct.cpp b/Tests/Utils/testChebTensorProduct.cpp index 26e0daabe7adb69d679ffb1b00d1fc067b8aece7..cd2b75dc8a08cf2776c0bba8af52e447b0d6ea65 100644 --- a/Tests/Utils/testChebTensorProduct.cpp +++ b/Tests/Utils/testChebTensorProduct.cpp @@ -44,7 +44,7 @@ void applym2m(FReal *const S, FReal *const w, const unsigned int n, FReal *const { FBlas::gemtm(n, n, n*n, FReal(1.), S, n, w, n, W, n); } void applyL2L(FReal *const S, FReal *const F, const unsigned int n, FReal *const f, const unsigned int N) -{ FBlas::gemva(n, n, FReal(1.), S, F, f); } +{ FBlas::gemva(N, n, FReal(1.), S, F, f); } void applyl2l(FReal *const S, FReal *const F, const unsigned int n, FReal *const f, const unsigned int N) { FBlas::gemm(n, n, n*n, FReal(1.), S, n, F, n, f, n); } @@ -145,7 +145,8 @@ int main(int argc, char* argv[]) m2m_error += w[n] - W0[n]; } - std::cout << "ERROR M2M = " << m2m_error << std::endl; + std::cout << "------------------------------------------" + << "\n - M2M: ERROR = " << m2m_error << std::endl; @@ -163,8 +164,9 @@ int main(int argc, char* argv[]) //std::cout << n << "\t" << f[n] << " - " << F0[n] << " = " << f[n]-F0[n] << std::endl; l2l_error += f[n] - F0[n]; } - - std::cout << "ERROR L2L = " << l2l_error << std::endl; + + std::cout << "------------------------------------------" + << "\n - L2L: ERROR = " << l2l_error << std::endl; } @@ -173,10 +175,6 @@ int main(int argc, char* argv[]) // P2M ///////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////// - - std::cout << "\n--------------------------------------\n" - << "-- P2M -------------------------------\n" - << "--------------------------------------" << std::endl; const unsigned int M = 10; @@ -192,19 +190,14 @@ int main(int argc, char* argv[]) points[1][p] = (FReal(rand())/FRandMax - FReal(.5)) * FReal(2.); points[2][p] = (FReal(rand())/FRandMax - FReal(.5)) * FReal(2.); weights[p] = FReal(rand())/FRandMax; - //std::cout << points[0][p] << "\t" - // << points[1][p] << "\t" - // << points[2][p] << "\t" - // << weights[p] << std::endl; lp[p].setX(points[0][p]); lp[p].setY(points[1][p]); lp[p].setZ(points[2][p]); - //std::cout << lp[p] << std::endl; } } //////////////////////////////////////////////////////// - { + { // compute exact equivalent source values for(unsigned int i=0; i<nnodes; ++i) equivW[i] = FReal(0.); FReal Snorm[M * nnodes]; Interpolator.assembleInterpolator(M, lp, Snorm); @@ -272,8 +265,8 @@ int main(int argc, char* argv[]) FReal F8[ ORDER*ORDER*ORDER]; { //////////////////////////////////////////////////////// - //for(unsigned int i=0; i<ORDER; ++i) - // F2[0][i] = F2[1][i] = F2[2][i] = FReal(0.); + for(unsigned int i=0; i<ORDER; ++i) + F2[0][i] = F2[1][i] = F2[2][i] = FReal(0.); for(unsigned int i=0; i<ORDER*ORDER; ++i) F4[0][i] = F4[1][i] = F4[2][i] = FReal(0.); for(unsigned int i=0; i<ORDER*ORDER*ORDER; ++i) @@ -284,76 +277,25 @@ int main(int argc, char* argv[]) for (unsigned int j=0; j<ORDER; ++j) T_of_y[(o-1)*ORDER + j] = FReal(FChebRoots<ORDER>::T(o, FReal(FChebRoots<ORDER>::roots[j]))); - FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[0], F2[0]); - FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[1], F2[1]); - FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[2], F2[2]); - - FReal C[ORDER * (ORDER-1)]; - FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[0], ORDER-1, C, ORDER); - FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[0], ORDER); - FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[1], ORDER-1, C, ORDER); - FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[1], ORDER); - FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[2], ORDER-1, C, ORDER); - FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[2], ORDER); - - FReal D[ORDER * (ORDER-1) * (ORDER-1)]; - FBlas::gemm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), T_of_y, ORDER, W8, ORDER-1, D, ORDER); - FReal E[(ORDER-1) * (ORDER-1) * ORDER]; - for (unsigned int i=0; i<ORDER; ++i) { - for (unsigned int m=0; m<ORDER-1; ++m) { - for (unsigned int n=0; n<ORDER-1; ++n) { - const unsigned int a = n*(ORDER-1)*ORDER + m*ORDER + i; - const unsigned int b = i*(ORDER-1)*(ORDER-1) + n*(ORDER-1) + m; - E[b] = D[a]; - } - } - } - FReal F[ORDER * (ORDER-1) * ORDER]; - FBlas::gemm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), T_of_y, ORDER, E, ORDER-1, F, ORDER); - FReal G[(ORDER-1) * ORDER * ORDER]; - for (unsigned int i=0; i<ORDER; ++i) { - for (unsigned int j=0; j<ORDER; ++j) { - for (unsigned int n=0; n<ORDER-1; ++n) { - const unsigned int a = i*(ORDER-1)*ORDER + n*ORDER + j; - const unsigned int b = j*ORDER*(ORDER-1) + i*(ORDER-1) + n; - G[b] = F[a]; - } - } - } - - FReal H[ORDER * ORDER * ORDER]; - FBlas::gemm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), T_of_y, ORDER, G, ORDER-1, H, ORDER); - for (unsigned int i=0; i<ORDER; ++i) { - for (unsigned int j=0; j<ORDER; ++j) { - for (unsigned int k=0; k<ORDER; ++k) { - const unsigned int a = j*ORDER*ORDER + i*ORDER + k; - const unsigned int b = j*ORDER*ORDER + k*ORDER + i; - F8[b] = H[a]; - } + for (unsigned int l=0; l<ORDER-1; ++l) + for (unsigned int i=0; i<ORDER; ++i) { + F2[0][i] += T_of_y[l*ORDER + i] * W2[0][l]; + F2[1][i] += T_of_y[l*ORDER + i] * W2[1][l]; + F2[2][i] += T_of_y[l*ORDER + i] * W2[2][l]; + + for (unsigned int m=0; m<ORDER-1; ++m) + for (unsigned int j=0; j<ORDER; ++j) { + F4[0][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[0][m*(ORDER-1) + l]; + F4[1][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[1][m*(ORDER-1) + l]; + F4[2][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[2][m*(ORDER-1) + l]; + + for (unsigned int n=0; n<ORDER-1; ++n) + for (unsigned int k=0; k<ORDER; ++k) + F8[k*ORDER*ORDER + j*ORDER + i] += + T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * T_of_y[n*ORDER + k] * + W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l]; + } } - } - - - - //for (unsigned int l=0; l<ORDER-1; ++l) - // for (unsigned int i=0; i<ORDER; ++i) { - // //F2[0][i] += T_of_y[l*ORDER + i] * W2[0][l]; - // //F2[1][i] += T_of_y[l*ORDER + i] * W2[1][l]; - // //F2[2][i] += T_of_y[l*ORDER + i] * W2[2][l]; - // - // for (unsigned int m=0; m<ORDER-1; ++m) - // for (unsigned int j=0; j<ORDER; ++j) { - // //F4[0][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[0][m*(ORDER-1) + l]; - // //F4[1][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[1][m*(ORDER-1) + l]; - // //F4[2][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[2][m*(ORDER-1) + l]; - // - // for (unsigned int n=0; n<ORDER-1; ++n) - // for (unsigned int k=0; k<ORDER; ++k) - // //F8[k*ORDER*ORDER + j*ORDER + i] += - // // T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * T_of_y[n*ORDER + k] * - // // W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l]; - // } - // } } //////////////////////////////////////////////////////// @@ -374,13 +316,138 @@ int main(int argc, char* argv[]) } } - std::cout << std::endl; FReal p2m_error(0.); for (unsigned int i=0; i<nnodes; ++i) { p2m_error += W[i] - equivW[i]; //std::cout << W[i] - equivW[i] << std::endl; } - std::cout << "ERROR P2M = " << p2m_error << std::endl; + + std::cout << "------------------------------------------" + << "\n - P2M: ERROR = " << p2m_error << std::endl; + + + + //////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////// + // L2P ///////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////// + + + FReal exactf[M]; + FReal f[M]; + for(unsigned int i=0; i<M; ++i) exactf[i] = f[i] = FReal(0.); + + + { // compute exact target values + FReal Snorm[M * nnodes]; + Interpolator.assembleInterpolator(M, lp, Snorm); + applyL2L(Snorm, W, nnodes, exactf, M); + //for (unsigned int i=0; i<M; ++i) + // std::cout << exactf[i] << std::endl; + } + + FReal f1; + { // sum over interpolation points + FReal T_of_y[ORDER * (ORDER-1)]; + for (unsigned int o=1; o<ORDER; ++o) + for (unsigned int j=0; j<ORDER; ++j) + T_of_y[(o-1)*ORDER + j] = FReal(FChebRoots<ORDER>::T(o, FReal(FChebRoots<ORDER>::roots[j]))); + + // set everything to zero + f1 = FReal(0.); + for(unsigned int i=0; i<ORDER-1; ++i) + W2[0][i] = W2[1][i] = W2[2][i] = FReal(0.); + for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i) + W4[0][i] = W4[1][i] = W4[2][i] = FReal(0.); + for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i) + W8[i] = FReal(0.); + + { + unsigned int nids[nnodes][3]; + FChebTensor<ORDER>::setNodeIds(nids); + + for (unsigned int idx=0; idx<nnodes; ++idx) { + f1 += W[idx]; + const unsigned int i = nids[idx][0]; + const unsigned int j = nids[idx][1]; + const unsigned int k = nids[idx][2]; + + //std::cout << i << "\t" << j << "\t" << k << std::endl; + + for (unsigned int l=0; l<ORDER-1; ++l) { + W2[0][l] += T_of_y[l*ORDER+i] * W[idx]; + W2[1][l] += T_of_y[l*ORDER+j] * W[idx]; + W2[2][l] += T_of_y[l*ORDER+k] * W[idx]; + for (unsigned int m=0; m<ORDER-1; ++m) { + W4[0][m*(ORDER-1)+l] += T_of_y[l*ORDER+i] * T_of_y[m*ORDER+j] * W[idx]; + W4[1][m*(ORDER-1)+l] += T_of_y[l*ORDER+i] * T_of_y[m*ORDER+k] * W[idx]; + W4[2][m*(ORDER-1)+l] += T_of_y[l*ORDER+j] * T_of_y[m*ORDER+k] * W[idx]; + for (unsigned int n=0; n<ORDER-1; ++n) + W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l] + += T_of_y[l*ORDER+i]*T_of_y[m*ORDER+j]*T_of_y[n*ORDER+k] * W[idx]; + } + } + } + + } + + } + + { // sum over targets + for (unsigned int p=0; p<M; ++p) { + + FReal T_of_x[3][ORDER]; + { + T_of_x[0][0] = FReal(1.); T_of_x[0][1] = points[0][p]; + T_of_x[1][0] = FReal(1.); T_of_x[1][1] = points[1][p]; + T_of_x[2][0] = FReal(1.); T_of_x[2][1] = points[2][p]; + const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop + const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop + const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop + for (unsigned int j=2; j<ORDER; ++j) { + T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops + T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops + T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops + } + } + + FReal f2, f4, f8; + { + f2 = f4 = f8 = FReal(0.); + for (unsigned int l=1; l<ORDER; ++l) { + f2 += + T_of_x[0][l] * W2[0][l-1] + + T_of_x[1][l] * W2[1][l-1] + + T_of_x[2][l] * W2[2][l-1]; + for (unsigned int m=1; m<ORDER; ++m) { + f4 += + T_of_x[0][l] * T_of_x[1][m] * W4[0][(m-1)*(ORDER-1)+(l-1)] + + T_of_x[0][l] * T_of_x[2][m] * W4[1][(m-1)*(ORDER-1)+(l-1)] + + T_of_x[1][l] * T_of_x[2][m] * W4[2][(m-1)*(ORDER-1)+(l-1)]; + for (unsigned int n=1; n<ORDER; ++n) + f8 += + T_of_x[0][l]*T_of_x[1][m]*T_of_x[2][n] * + W8[(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)]; + } + } + } + f[p] = (f1 + FReal(2.)*f2 + FReal(4.)*f4 + FReal(8.)*f8) / (ORDER*ORDER*ORDER); + + } + } + + + FReal l2p_error(0.); + for (unsigned int i=0; i<M; ++i) { + l2p_error += f[i] - exactf[i]; + //std::cout << exactf[i] << "\t" << f[i] << std::endl; + } + + std::cout << "------------------------------------------" + << "\n - L2P: ERROR = " << l2p_error << std::endl; + + return 0; @@ -388,3 +455,25 @@ int main(int argc, char* argv[]) // [--END--] + + //for (unsigned int l=0; l<ORDER-1; ++l) + // for (unsigned int m=0; m<ORDER-1; ++m) + // for (unsigned int n=0; n<ORDER-1; ++n) + // + // for (unsigned int i=0; i<ORDER; ++i) + // for (unsigned int j=0; j<ORDER; ++j) + // for (unsigned int k=0; k<ORDER; ++k) { + // + // const unsigned int idx = k*ORDER*ORDER + j*ORDER + i; + // + // W2[0][l] += T_of_y[l*ORDER+i] * W[idx]; + // W2[1][m] += T_of_y[m*ORDER+j] * W[idx]; + // W2[2][n] += T_of_y[n*ORDER+k] * W[idx]; + // + // W4[0][m*(ORDER-1) + l] += T_of_y[l*ORDER+i] * T_of_y[m*ORDER+j] * W[idx]; + // W4[1][n*(ORDER-1) + l] += T_of_y[l*ORDER+i] * T_of_y[n*ORDER+k] * W[idx]; + // W4[2][n*(ORDER-1) + m] += T_of_y[m*ORDER+j] * T_of_y[n*ORDER+k] * W[idx]; + // + // W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l] + // += T_of_y[l*ORDER+i]*T_of_y[m*ORDER+j]*T_of_y[n*ORDER+k] * W[idx]; + // }