Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 57ea27fd authored by Matthias Messner's avatar Matthias Messner
Browse files

reduced flops for P2M from Nl^4 to Nl^3+l^4 (to be done for L2P)

parent 178ee325
Branches
Tags
No related merge requests found
...@@ -30,12 +30,67 @@ class FChebInterpolator : FNoCopyable ...@@ -30,12 +30,67 @@ class FChebInterpolator : FNoCopyable
typedef FChebTensor<ORDER> TensorType; typedef FChebTensor<ORDER> TensorType;
FReal T_of_roots[ORDER][ORDER]; FReal T_of_roots[ORDER][ORDER];
FReal T[ORDER * (ORDER-1)];
unsigned int node_ids[nnodes][3]; unsigned int node_ids[nnodes][3];
FReal* ChildParentInterpolator[8]; FReal* ChildParentInterpolator[8];
// permutations (only needed in the tensor product interpolation case) // permutations (only needed in the tensor product interpolation case)
unsigned int perm[3][nnodes]; unsigned int perm[3][nnodes];
////////////////////////////////////////////////////////////////////
// needed for P2M
struct IMN2MNI {
enum {size = ORDER * (ORDER-1) * (ORDER-1)};
unsigned int imn[size], mni[size];
IMN2MNI() {
unsigned int counter = 0;
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int m=0; m<ORDER-1; ++m) {
for (unsigned int n=0; n<ORDER-1; ++n) {
imn[counter] = n*(ORDER-1)*ORDER + m*ORDER + i;
mni[counter] = i*(ORDER-1)*(ORDER-1) + n*(ORDER-1) + m;
counter++;
}
}
}
}
} perm0;
struct JNI2NIJ {
enum {size = ORDER * ORDER * (ORDER-1)};
unsigned int jni[size], nij[size];
JNI2NIJ() {
unsigned int counter = 0;
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int n=0; n<ORDER-1; ++n) {
jni[counter] = i*(ORDER-1)*ORDER + n*ORDER + j;
nij[counter] = j*ORDER*(ORDER-1) + i*(ORDER-1) + n;
counter++;
}
}
}
}
} perm1;
struct KIJ2IJK {
enum {size = ORDER * ORDER * ORDER};
unsigned int kij[size], ijk[size];
KIJ2IJK() {
unsigned int counter = 0;
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int k=0; k<ORDER; ++k) {
kij[counter] = j*ORDER*ORDER + i*ORDER + k;
ijk[counter] = k*ORDER*ORDER + j*ORDER + i;
counter++;
}
}
}
}
} perm2;
////////////////////////////////////////////////////////////////////
/** /**
* Initialize the child - parent - interpolator, it is basically the matrix * Initialize the child - parent - interpolator, it is basically the matrix
...@@ -126,15 +181,19 @@ public: ...@@ -126,15 +181,19 @@ public:
for (unsigned int j=0; j<ORDER; ++j) for (unsigned int j=0; j<ORDER; ++j)
T_of_roots[o][j] = FReal(BasisType::T(o, FReal(BasisType::roots[j]))); T_of_roots[o][j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
// initialize chebyshev polynomials of root nodes: T_o(x_j)
for (unsigned int o=1; o<ORDER; ++o)
for (unsigned int j=0; j<ORDER; ++j)
T[(o-1)*ORDER + j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
// initialize root node ids // initialize root node ids
TensorType::setNodeIds(node_ids); TensorType::setNodeIds(node_ids);
// initialize interpolation operator for non M2M and L2L (non leaf // initialize interpolation operator for non M2M and L2L (non leaf
// operations) // operations)
//this -> initM2MandL2L(); // non tensor-product interpolation
//this -> initM2MandL2L(); this -> initTensorM2MandL2L(); // tensor-product interpolation
this -> initTensorM2MandL2L();
} }
...@@ -163,34 +222,28 @@ public: ...@@ -163,34 +222,28 @@ public:
{ {
// values of chebyshev polynomials of source particle: T_o(x_i) // values of chebyshev polynomials of source particle: T_o(x_i)
FReal T_of_x[ORDER][3]; FReal T_of_x[ORDER][3];
FReal c0, c1, c2 ; // loop: local points (mapped in [-1,1])
c0 = FReal(0.0) ; for (unsigned int m=0; m<NumberOfLocalPoints; ++m) {
c1 = FReal(1.) / ORDER;
c2 = FReal(2.) *c1 ;
// loop: local points (mapped in [-1,1])
for (unsigned int m=0; m<NumberOfLocalPoints; ++m) {
// evaluate chebyshev polynomials at local points // evaluate chebyshev polynomials at local points
for (unsigned int o=1; o<ORDER; ++o) { for (unsigned int o=1; o<ORDER; ++o) {
T_of_x[o][0] = BasisType::T(o, LocalPoints[m].getX()); T_of_x[o][0] = BasisType::T(o, LocalPoints[m].getX());
T_of_x[o][1] = BasisType::T(o, LocalPoints[m].getY()); T_of_x[o][1] = BasisType::T(o, LocalPoints[m].getY());
T_of_x[o][2] = BasisType::T(o, LocalPoints[m].getZ()); T_of_x[o][2] = BasisType::T(o, LocalPoints[m].getZ());
} }
// assemble interpolator // assemble interpolator
for (unsigned int n=0; n<nnodes; ++n) { for (unsigned int n=0; n<nnodes; ++n) {
Interpolator[n*nnodes + m] = FReal(1.); //Interpolator[n*nnodes + m] = FReal(1.);
Interpolator[n*NumberOfLocalPoints + m] = FReal(1.);
for (unsigned int d=0; d<3; ++d) { for (unsigned int d=0; d<3; ++d) {
const unsigned int j = node_ids[n][d]; const unsigned int j = node_ids[n][d];
// FReal S_d = FReal(1.) / ORDER; FReal S_d = FReal(1.) / ORDER;
// for (unsigned int o=1; o<ORDER; ++o)
// S_d += FReal(2.) / ORDER * T_of_x[o][d] * T_of_roots[o][j];
// Interpolator[n*nnodes + m] *= S_d;
FReal S_d = c0 ;
for (unsigned int o=1; o<ORDER; ++o) for (unsigned int o=1; o<ORDER; ++o)
S_d += T_of_x[o][d] * T_of_roots[o][j]; S_d += FReal(2.) / ORDER * T_of_x[o][d] * T_of_roots[o][j];
S_d = c1 + c2*S_d ; //Interpolator[n*nnodes + m] *= S_d;
Interpolator[n*nnodes + m] *= S_d; Interpolator[n*NumberOfLocalPoints + m] *= S_d;
} }
} }
} }
...@@ -218,6 +271,8 @@ public: ...@@ -218,6 +271,8 @@ public:
} }
} }
...@@ -360,53 +415,184 @@ inline void FChebInterpolator<ORDER>::applyP2M(const FPoint& center, ...@@ -360,53 +415,184 @@ inline void FChebInterpolator<ORDER>::applyP2M(const FPoint& center,
// allocate stuff // allocate stuff
const map_glob_loc map(center, width); const map_glob_loc map(center, width);
FPoint localPosition; FPoint localPosition;
FReal T_of_x[ORDER][3];
FReal S[3], c1; FReal W1 = FReal(0.);
// FReal W2[3][ ORDER-1];
FReal xpx,ypy,zpz ; FReal W4[3][(ORDER-1)*(ORDER-1)];
c1 = FReal(8.) / nnodes ; // 1 flop FReal W8[ (ORDER-1)*(ORDER-1)*(ORDER-1)];
for(unsigned int i=0; i<(ORDER-1); ++i) W2[0][i] = W2[1][i] = W2[2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i) W4[0][i] = W4[1][i] = W4[2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i) W8[i] = FReal(0.);
// loop over source particles // loop over source particles
typename ContainerClass::ConstBasicIterator iter(*sourceParticles); typename ContainerClass::ConstBasicIterator iter(*sourceParticles);
while(iter.hasNotFinished()){ while(iter.hasNotFinished()){
// map global position to [-1,1] // map global position to [-1,1]
map(iter.data().getPosition(), localPosition); // 15 flops map(iter.data().getPosition(), localPosition); // 15 flops
// evaluate chebyshev polynomials of source particle: T_o(x_i)
T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX();
T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY();
T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ();
xpx = FReal(2.) * localPosition.getX() ; // 1 flop
ypy = FReal(2.) * localPosition.getY() ; // 1 flop
zpz = FReal(2.) * localPosition.getZ() ; // 1 flop
for (unsigned int o=2; o<ORDER; ++o) {
T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flops
T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flops
T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flops
}
// anterpolate FReal T_of_x[3][ORDER];
const FReal sourceValue = iter.data().getPhysicalValue(); T_of_x[0][0] = FReal(1.); T_of_x[0][1] = localPosition.getX();
for (unsigned int n=0; n<nnodes; ++n) { T_of_x[1][0] = FReal(1.); T_of_x[1][1] = localPosition.getY();
const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]}; T_of_x[2][0] = FReal(1.); T_of_x[2][1] = localPosition.getZ();
S[0] = FReal(0.5) + T_of_x[1][0] * T_of_roots[1][j[0]]; // 2 flops const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop
S[1] = FReal(0.5) + T_of_x[1][1] * T_of_roots[1][j[1]]; // 2 flops const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop
S[2] = FReal(0.5) + T_of_x[1][2] * T_of_roots[1][j[2]]; // 2 flops const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop
for (unsigned int o=2; o<ORDER; ++o) { for (unsigned int j=2; j<ORDER; ++j) {
S[0] += T_of_x[o][0] * T_of_roots[o][j[0]]; // 2 flops T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops
S[1] += T_of_x[o][1] * T_of_roots[o][j[1]]; // 2 flops T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops
S[2] += T_of_x[o][2] * T_of_roots[o][j[2]]; // 2 flops T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops
}
// gather contributions
multipoleExpansion[n] += c1 * S[0] * S[1] * S[2] * sourceValue; // 4 flops
} }
const FReal weight = iter.data().getPhysicalValue();
W1 += weight; // 1 flop
for (unsigned int i=1; i<ORDER; ++i) {
const FReal wx = weight * T_of_x[0][i]; // 1 flop
const FReal wy = weight * T_of_x[1][i]; // 1 flop
const FReal wz = weight * T_of_x[2][i]; // 1 flop
W2[0][i-1] += wx; // 1 flop
W2[1][i-1] += wy; // 1 flop
W2[2][i-1] += wz; // 1 flop
for (unsigned int j=1; j<ORDER; ++j) {
const FReal wxy = wx * T_of_x[1][j]; // 1 flop
const FReal wxz = wx * T_of_x[2][j]; // 1 flop
const FReal wyz = wy * T_of_x[2][j]; // 1 flop
W4[0][(j-1)*(ORDER-1) + (i-1)] += wxy; // 1 flop
W4[1][(j-1)*(ORDER-1) + (i-1)] += wxz; // 1 flop
W4[2][(j-1)*(ORDER-1) + (i-1)] += wyz; // 1 flop
for (unsigned int k=1; k<ORDER; ++k) {
const FReal wxyz = wxy * T_of_x[2][k]; // 1 flop
W8[(k-1)*(ORDER-1)*(ORDER-1) + (j-1)*(ORDER-1) + (i-1)] += wxyz; // 1 flop
} // flops: (ORDER-1) * 2
} // flops: (ORDER-1) * (6 + (ORDER-1) * 2)
} // flops: (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2))
// increment source iterator // increment source iterator
iter.gotoNext(); iter.gotoNext();
} // flops: N * (18 + (ORDER-2) * 6 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2)))
////////////////////////////////////////////////////////////////////
// loop over interpolation points
FReal F2[3][ORDER];
FReal F4[3][ORDER*ORDER];
FReal F8[ ORDER*ORDER*ORDER];
{
// compute W2: 3 * ORDER*(2*(ORDER-1)-1) flops
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[0], F2[0]);
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[1], F2[1]);
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[2], F2[2]);
// compute W4: 3 * [ORDER*(ORDER-1)*(2*(ORDER-1)-1) + ORDER*ORDER*(2*(ORDER-1)-1)]
FReal C[ORDER * (ORDER-1)];
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[0], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[0], ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[1], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[1], ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[2], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[2], ORDER);
// compute W8: 3 * (2*(ORDER-1)-1) * [ORDER*(ORDER-1)*(ORDER-1) + ORDER*ORDER*(ORDER-1) + ORDER*ORDER*ORDER]
FReal D[ORDER * (ORDER-1) * (ORDER-1)];
FBlas::gemm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, W8, ORDER-1, D, ORDER);
FReal E[(ORDER-1) * (ORDER-1) * ORDER];
for (unsigned int s=0; s<perm0.size; ++s) E[perm0.mni[s]] = D[perm0.imn[s]];
FReal F[ORDER * (ORDER-1) * ORDER];
FBlas::gemm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, E, ORDER-1, F, ORDER);
FReal G[(ORDER-1) * ORDER * ORDER];
for (unsigned int s=0; s<perm1.size; ++s) G[perm1.nij[s]] = F[perm1.jni[s]];
FReal H[ORDER * ORDER * ORDER];
FBlas::gemm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, G, ORDER-1, H, ORDER);
for (unsigned int s=0; s<perm2.size; ++s) F8[perm2.ijk[s]] = H[perm2.kij[s]];
} }
// assemble multipole expansions
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int k=0; k<ORDER; ++k) {
const unsigned int idx = k*ORDER*ORDER + j*ORDER + i;
multipoleExpansion[idx] = (W1 +
FReal(2.) * (F2[0][i] + F2[1][j] + F2[2][k]) +
FReal(4.) * (F4[0][j*ORDER+i] + F4[1][k*ORDER+i] + F4[2][k*ORDER+j]) +
FReal(8.) * F8[idx]) / nnodes; // 11 * ORDER*ORDER*ORDER flops
}
}
}
} }
///**
// * Particle to moment: application of \f$S_\ell(y,\bar y_n)\f$
// * (anterpolation, it is the transposed interpolation)
// */
//template <int ORDER>
//template <class ContainerClass>
//inline void FChebInterpolator<ORDER>::applyP2M(const FPoint& center,
// const FReal width,
// FReal *const multipoleExpansion,
// const ContainerClass *const sourceParticles) const
//{
// // set all multipole expansions to zero
// FBlas::setzero(nnodes, multipoleExpansion);
//
// // allocate stuff
// const map_glob_loc map(center, width);
// FPoint localPosition;
// FReal T_of_x[ORDER][3];
// FReal S[3], c1;
// //
// FReal xpx,ypy,zpz ;
// c1 = FReal(8.) / nnodes ; // 1 flop
// // loop over source particles
// typename ContainerClass::ConstBasicIterator iter(*sourceParticles);
// while(iter.hasNotFinished()){
//
// // map global position to [-1,1]
// map(iter.data().getPosition(), localPosition); // 15 flops
//
// // evaluate chebyshev polynomials of source particle: T_o(x_i)
// T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX();
// T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY();
// T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ();
// xpx = FReal(2.) * localPosition.getX() ; // 1 flop
// ypy = FReal(2.) * localPosition.getY() ; // 1 flop
// zpz = FReal(2.) * localPosition.getZ() ; // 1 flop
//
// for (unsigned int o=2; o<ORDER; ++o) {
// T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flops
// T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flops
// T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flops
// } // flops: (ORDER-1) * 6
//
// // anterpolate
// const FReal sourceValue = iter.data().getPhysicalValue();
// for (unsigned int n=0; n<nnodes; ++n) {
// const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
// S[0] = FReal(0.5) + T_of_x[1][0] * T_of_roots[1][j[0]]; // 2 flops
// S[1] = FReal(0.5) + T_of_x[1][1] * T_of_roots[1][j[1]]; // 2 flops
// S[2] = FReal(0.5) + T_of_x[1][2] * T_of_roots[1][j[2]]; // 2 flops
// for (unsigned int o=2; o<ORDER; ++o) {
// S[0] += T_of_x[o][0] * T_of_roots[o][j[0]]; // 2 flops
// S[1] += T_of_x[o][1] * T_of_roots[o][j[1]]; // 2 flops
// S[2] += T_of_x[o][2] * T_of_roots[o][j[2]]; // 2 flops
// } // flops: (ORDER-2) * 6
//
// // gather contributions
// multipoleExpansion[n] += c1 * S[0] * S[1] * S[2] * sourceValue; // 4 flops
// } // flops: ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6)
//
// // increment source iterator
// iter.gotoNext();
// } // flops: M * (18 + (ORDER-1) * 6 + ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6))
//}
/** /**
* Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation) * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
*/ */
...@@ -725,3 +911,121 @@ inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center, ...@@ -725,3 +911,121 @@ inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center,
#endif #endif
////struct IMN2MNI {
//// enum {size = ORDER * (ORDER-1) * (ORDER-1)};
//// unsigned int imn[size], mni[size];
//// IMN2MNI() {
//// unsigned int counter = 0;
//// for (unsigned int i=0; i<ORDER; ++i) {
//// for (unsigned int m=0; m<ORDER-1; ++m) {
//// for (unsigned int n=0; n<ORDER-1; ++n) {
//// imn[counter] = n*(ORDER-1)*ORDER + m*ORDER + i;
//// mni[counter] = i*(ORDER-1)*(ORDER-1) + n*(ORDER-1) + m;
//// counter++;
//// }
//// }
//// }
//// }
////} perm0;
//
////for (unsigned int i=0; i<ORDER; ++i) {
//// for (unsigned int m=0; m<ORDER-1; ++m) {
//// for (unsigned int n=0; n<ORDER-1; ++n) {
//// const unsigned int a = n*(ORDER-1)*ORDER + m*ORDER + i;
//// const unsigned int b = i*(ORDER-1)*(ORDER-1) + n*(ORDER-1) + m;
//// E[b] = D[a];
//// }
//// }
////}
////struct JNI2NIJ {
//// enum {size = ORDER * ORDER * (ORDER-1)};
//// unsigned int jni[size], nij[size];
//// JNI2NIJ() {
//// unsigned int counter = 0;
//// for (unsigned int i=0; i<ORDER; ++i) {
//// for (unsigned int j=0; j<ORDER; ++j) {
//// for (unsigned int n=0; n<ORDER-1; ++n) {
//// jni[counter] = i*(ORDER-1)*ORDER + n*ORDER + j;
//// nij[counter] = j*ORDER*(ORDER-1) + i*(ORDER-1) + n;
//// counter++;
//// }
//// }
//// }
//// }
////} perm1;
//
////for (unsigned int i=0; i<ORDER; ++i) {
//// for (unsigned int j=0; j<ORDER; ++j) {
//// for (unsigned int n=0; n<ORDER-1; ++n) {
//// const unsigned int a = i*(ORDER-1)*ORDER + n*ORDER + j;
//// const unsigned int b = j*ORDER*(ORDER-1) + i*(ORDER-1) + n;
//// G[b] = F[a];
//// }
//// }
////}
////struct KIJ2IJK {
//// enum {size = ORDER * ORDER * ORDER};
//// unsigned int kij[size], ijk[size];
//// KIJ2IJK() {
//// unsigned int counter = 0;
//// for (unsigned int i=0; i<ORDER; ++i) {
//// for (unsigned int j=0; j<ORDER; ++j) {
//// for (unsigned int k=0; k<ORDER; ++k) {
//// kij[counter] = j*ORDER*ORDER + i*ORDER + k;
//// ijk[counter] = k*ORDER*ORDER + j*ORDER + i;
//// counter++;
//// }
//// }
//// }
//// }
////} perm2;
//
////for (unsigned int i=0; i<ORDER; ++i) {
//// for (unsigned int j=0; j<ORDER; ++j) {
//// for (unsigned int k=0; k<ORDER; ++k) {
//// const unsigned int a = j*ORDER*ORDER + i*ORDER + k;
//// const unsigned int b = k*ORDER*ORDER + j*ORDER + i;
//// F8[b] = H[a];
//// }
//// }
////}
//FReal T_of_y[ORDER * (ORDER-1)];
//for (unsigned int o=1; o<ORDER; ++o)
// for (unsigned int j=0; j<ORDER; ++j)
// T_of_y[(o-1)*ORDER + j] = FReal(FChebRoots<ORDER>::T(o, FReal(FChebRoots<ORDER>::roots[j])));
//struct SumP2M {
// unsigned int f2[3][nnodes], f4[3][nnodes];
// SumP2M() {
// for (unsigned int i=0; i<ORDER; ++i) {
// for (unsigned int j=0; j<ORDER; ++j) {
// for (unsigned int k=0; k<ORDER; ++k) {
// const unsigned int idx = k*ORDER*ORDER + j*ORDER + i;
// f2[0][idx] = i;
// f2[1][idx] = j;
// f2[2][idx] = k;
// f4[0][idx] = j*ORDER+i;
// f4[1][idx] = k*ORDER+i;
// f4[2][idx] = k*ORDER+j;
// }
// }
// }
// }
//} idx0;
//
//for (unsigned int i=0; i<nnodes; ++i)
// multipoleExpansion[i] = (W1 +
// FReal(2.) * (F2[0][idx0.f2[0][i]] + F2[1][idx0.f2[1][i]] + F2[2][idx0.f2[2][i]]) +
// FReal(4.) * (F4[0][idx0.f4[0][i]] + F4[1][idx0.f4[1][i]] + F4[2][idx0.f4[2][i]]) +
// FReal(8.) * F8[i]) / nnodes;
...@@ -409,26 +409,6 @@ unsigned int ReadRankFromBinaryFile(const std::string& filename) ...@@ -409,26 +409,6 @@ unsigned int ReadRankFromBinaryFile(const std::string& filename)
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// DANGEROUS FORMULA ?!?!
//template <int ORDER>
//unsigned int getRank(const FReal singular_values[], const double eps)
//{
// enum {nnodes = TensorTraits<ORDER>::nnodes};
// FReal sum(0.);
// for (unsigned int i=0; i<nnodes; ++i)
// sum += singular_values[i] * singular_values[i];
//
// unsigned int k = 0;
// double sum_k = sum;
// while (sqrt(sum_k) > eps*sqrt(sum)) {
// sum_k -= singular_values[k] * singular_values[k];
// k++;
// }
// if (k>nnodes) throw std::runtime_error("rank cannot be larger than nnodes");
// return k;
//}
/** /**
* Computes the low-rank \f$k\f$ based on \f$\|K-U\Sigma_kV^\top\|_F \le * Computes the low-rank \f$k\f$ based on \f$\|K-U\Sigma_kV^\top\|_F \le
* \epsilon \|K\|_F\f$, ie., the truncation rank of the singular value * \epsilon \|K\|_F\f$, ie., the truncation rank of the singular value
......
...@@ -161,9 +161,9 @@ namespace FBlas { ...@@ -161,9 +161,9 @@ namespace FBlas {
inline void setzero(const unsigned n, float* const x) inline void setzero(const unsigned n, float* const x)
{ sscal_(&n, &S_ZERO, x, &N_ONE); } { sscal_(&n, &S_ZERO, x, &N_ONE); }
inline void c_setzero(const unsigned n, double* const x) inline void c_setzero(const unsigned n, double* const x)
{ zscal_(&n, Z_ZERO, x, &N_ONE); } { zscal_(&n, Z_ZERO, x, &N_ONE); }
inline void c_setzero(const unsigned n, float* const x) inline void c_setzero(const unsigned n, float* const x)
{ cscal_(&n, C_ZERO, x, &N_ONE); } { cscal_(&n, C_ZERO, x, &N_ONE); }
// y += x // y += x
inline void add(const unsigned n, double* const x, double* const y) inline void add(const unsigned n, double* const x, double* const y)
...@@ -171,89 +171,89 @@ namespace FBlas { ...@@ -171,89 +171,89 @@ namespace FBlas {
inline void add(const unsigned n, float* const x, float* const y) inline void add(const unsigned n, float* const x, float* const y)
{ saxpy_(&n, &S_ONE, x, &N_ONE, y, &N_ONE); } { saxpy_(&n, &S_ONE, x, &N_ONE, y, &N_ONE); }
inline void c_add(const unsigned n, float* const x, float* const y) inline void c_add(const unsigned n, float* const x, float* const y)
{ caxpy_(&n, C_ONE, x, &N_ONE, y, &N_ONE); } { caxpy_(&n, C_ONE, x, &N_ONE, y, &N_ONE); }
inline void c_add(const unsigned n, double* const x,double* const y) inline void c_add(const unsigned n, double* const x,double* const y)
{ zaxpy_(&n, Z_ONE, x, &N_ONE, y, &N_ONE); } { zaxpy_(&n, Z_ONE, x, &N_ONE, y, &N_ONE); }
// y += d x // y += d x
inline void axpy(const unsigned n, const double d, const double* const x, double* const y) inline void axpy(const unsigned n, const double d, const double* const x, double* const y)
{ daxpy_(&n, &d, x, &N_ONE, y, &N_ONE); } { daxpy_(&n, &d, x, &N_ONE, y, &N_ONE); }
inline void axpy(const unsigned n, const float d, const float* const x, float* const y) inline void axpy(const unsigned n, const float d, const float* const x, float* const y)
{ saxpy_(&n, &d, x, &N_ONE, y, &N_ONE); } { saxpy_(&n, &d, x, &N_ONE, y, &N_ONE); }
inline void c_axpy(const unsigned n, const float* d, const float* const x, float* const y) inline void c_axpy(const unsigned n, const float* d, const float* const x, float* const y)
{ caxpy_(&n, d, x, &N_ONE, y, &N_ONE); } { caxpy_(&n, d, x, &N_ONE, y, &N_ONE); }
inline void c_axpy(const unsigned n, const double* d, const double* const x, double* const y) inline void c_axpy(const unsigned n, const double* d, const double* const x, double* const y)
{ zaxpy_(&n, d, x, &N_ONE, y, &N_ONE); } { zaxpy_(&n, d, x, &N_ONE, y, &N_ONE); }
// // y = d Ax // // y = d Ax
// inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) // inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
// { cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, N_ONE, D_ZERO, y, N_ONE); } // { cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, N_ONE, D_ZERO, y, N_ONE); }
// inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) // inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
// { cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, N_ONE, S_ZERO, y, N_ONE); } // { cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, N_ONE, S_ZERO, y, N_ONE); }
// y = d Ax // y = d Ax
inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
{ dgemv_(JOB_STR, &m, &n, &d, A, &m, x, &N_ONE, &D_ZERO, y, &N_ONE); } { dgemv_(JOB_STR, &m, &n, &d, A, &m, x, &N_ONE, &D_ZERO, y, &N_ONE); }
inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
{ sgemv_(JOB_STR, &m, &n, &d, A, &m, x, &N_ONE, &S_ZERO, y, &N_ONE); } { sgemv_(JOB_STR, &m, &n, &d, A, &m, x, &N_ONE, &S_ZERO, y, &N_ONE); }
inline void c_gemv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) inline void c_gemv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
{ cgemv_(JOB_STR, &m, &n, d, A, &m, x, &N_ONE, C_ZERO, y, &N_ONE); } { cgemv_(JOB_STR, &m, &n, d, A, &m, x, &N_ONE, C_ZERO, y, &N_ONE); }
inline void c_gemv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) inline void c_gemv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
{ zgemv_(JOB_STR, &m, &n, d, A, &m, x, &N_ONE, Z_ZERO, y, &N_ONE); } { zgemv_(JOB_STR, &m, &n, d, A, &m, x, &N_ONE, Z_ZERO, y, &N_ONE); }
// // y += d Ax // // y += d Ax
// inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) // inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
// { cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, N_ONE, D_ONE, y, N_ONE); } // { cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, N_ONE, D_ONE, y, N_ONE); }
// inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) // inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
// { cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, N_ONE, S_ONE, y, N_ONE); } // { cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, N_ONE, S_ONE, y, N_ONE); }
// y += d Ax // y += d Ax
inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
{ dgemv_(JOB_STR, &m, &n, &d, A, &m, x, &N_ONE, &D_ONE, y, &N_ONE); } { dgemv_(JOB_STR, &m, &n, &d, A, &m, x, &N_ONE, &D_ONE, y, &N_ONE); }
inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
{ sgemv_(JOB_STR, &m, &n, &d, A, &m, x, &N_ONE, &S_ONE, y, &N_ONE); } { sgemv_(JOB_STR, &m, &n, &d, A, &m, x, &N_ONE, &S_ONE, y, &N_ONE); }
inline void c_gemva(const unsigned m, const unsigned n, const float* d, const float* A, const float *x, float *y) inline void c_gemva(const unsigned m, const unsigned n, const float* d, const float* A, const float *x, float *y)
{ cgemv_(JOB_STR, &m, &n, d, A, &m, x, &N_ONE, C_ONE, y, &N_ONE); } { cgemv_(JOB_STR, &m, &n, d, A, &m, x, &N_ONE, C_ONE, y, &N_ONE); }
inline void c_gemva(const unsigned m, const unsigned n, const double* d, const double* A, const double *x, double *y) inline void c_gemva(const unsigned m, const unsigned n, const double* d, const double* A, const double *x, double *y)
{ zgemv_(JOB_STR, &m, &n, d, A, &m, x, &N_ONE, Z_ONE, y, &N_ONE); } { zgemv_(JOB_STR, &m, &n, d, A, &m, x, &N_ONE, Z_ONE, y, &N_ONE); }
// // y = d A^T x // // y = d A^T x
// inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) // inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
// { cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, N_ONE, D_ZERO, y, N_ONE); } // { cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, N_ONE, D_ZERO, y, N_ONE); }
// inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) // inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
// { cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, N_ONE, S_ZERO, y, N_ONE); } // { cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, N_ONE, S_ZERO, y, N_ONE); }
// y = d A^T x // y = d A^T x
inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
{ dgemv_(JOB_STR+1, &m, &n, &d, A, &m, x, &N_ONE, &D_ZERO, y, &N_ONE); } { dgemv_(JOB_STR+1, &m, &n, &d, A, &m, x, &N_ONE, &D_ZERO, y, &N_ONE); }
inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
{ sgemv_(JOB_STR+1, &m, &n, &d, A, &m, x, &N_ONE, &S_ZERO, y, &N_ONE); } { sgemv_(JOB_STR+1, &m, &n, &d, A, &m, x, &N_ONE, &S_ZERO, y, &N_ONE); }
inline void c_gemtv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) inline void c_gemtv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
{ cgemv_(JOB_STR+1, &m, &n, d, A, &m, x, &N_ONE, C_ZERO, y, &N_ONE); } { cgemv_(JOB_STR+1, &m, &n, d, A, &m, x, &N_ONE, C_ZERO, y, &N_ONE); }
inline void c_gemtv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) inline void c_gemtv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
{ zgemv_(JOB_STR+1, &m, &n, d, A, &m, x, &N_ONE, Z_ZERO, y, &N_ONE); } { zgemv_(JOB_STR+1, &m, &n, d, A, &m, x, &N_ONE, Z_ZERO, y, &N_ONE); }
inline void c_gemhv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) inline void c_gemhv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
{ cgemv_(JOB_STR+7, &m, &n, d, A, &m, x, &N_ONE, C_ZERO, y, &N_ONE); } // hermitian transposed { cgemv_(JOB_STR+7, &m, &n, d, A, &m, x, &N_ONE, C_ZERO, y, &N_ONE); } // hermitian transposed
inline void c_gemhv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) inline void c_gemhv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
{ zgemv_(JOB_STR+7, &m, &n, d, A, &m, x, &N_ONE, Z_ZERO, y, &N_ONE); } // hermitian transposed { zgemv_(JOB_STR+7, &m, &n, d, A, &m, x, &N_ONE, Z_ZERO, y, &N_ONE); } // hermitian transposed
// // y += d A^T x // // y += d A^T x
// inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) // inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
// { cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, N_ONE, D_ONE, y, N_ONE); } // { cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, N_ONE, D_ONE, y, N_ONE); }
// inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) // inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
// { cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, N_ONE, S_ONE, y, N_ONE); } // { cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, N_ONE, S_ONE, y, N_ONE); }
// y += d A^T x // y += d A^T x
inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
{ dgemv_(JOB_STR+1, &m, &n, &d, A, &m, x, &N_ONE, &D_ONE, y, &N_ONE); } { dgemv_(JOB_STR+1, &m, &n, &d, A, &m, x, &N_ONE, &D_ONE, y, &N_ONE); }
inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
{ sgemv_(JOB_STR+1, &m, &n, &d, A, &m, x, &N_ONE, &S_ONE, y, &N_ONE); } { sgemv_(JOB_STR+1, &m, &n, &d, A, &m, x, &N_ONE, &S_ONE, y, &N_ONE); }
inline void c_gemtva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) inline void c_gemtva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
{ cgemv_(JOB_STR+1, &m, &n, d, A, &m, x, &N_ONE, C_ONE, y, &N_ONE); } { cgemv_(JOB_STR+1, &m, &n, d, A, &m, x, &N_ONE, C_ONE, y, &N_ONE); }
inline void c_gemtva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) inline void c_gemtva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
{ zgemv_(JOB_STR+1, &m, &n, d, A, &m, x, &N_ONE, Z_ONE, y, &N_ONE); } { zgemv_(JOB_STR+1, &m, &n, d, A, &m, x, &N_ONE, Z_ONE, y, &N_ONE); }
inline void c_gemhva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) inline void c_gemhva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
{ cgemv_(JOB_STR+7, &m, &n, d, A, &m, x, &N_ONE, C_ONE, y, &N_ONE); } // hermitian transposed { cgemv_(JOB_STR+7, &m, &n, d, A, &m, x, &N_ONE, C_ONE, y, &N_ONE); } // hermitian transposed
inline void c_gemhva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) inline void c_gemhva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
{ zgemv_(JOB_STR+7, &m, &n, d, A, &m, x, &N_ONE, Z_ONE, y, &N_ONE); } // hermitian transposed { zgemv_(JOB_STR+7, &m, &n, d, A, &m, x, &N_ONE, Z_ONE, y, &N_ONE); } // hermitian transposed
...@@ -265,14 +265,14 @@ namespace FBlas { ...@@ -265,14 +265,14 @@ namespace FBlas {
inline void gemm(unsigned m, unsigned p, unsigned n, float d, inline void gemm(unsigned m, unsigned p, unsigned n, float d,
float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC)
{ sgemm_(JOB_STR, JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &S_ZERO, C, &ldC); } { sgemm_(JOB_STR, JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &S_ZERO, C, &ldC); }
inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const float* d, inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const float* d,
float* A, const unsigned ldA, float* B, const unsigned ldB, float* C, const unsigned ldC) float* A, const unsigned ldA, float* B, const unsigned ldB, float* C, const unsigned ldC)
{ {
cgemm_(JOB_STR, JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, C_ZERO, C, &ldC); } cgemm_(JOB_STR, JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, C_ZERO, C, &ldC); }
inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const double* d, inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const double* d,
double* A, const unsigned ldA, double* B, const unsigned ldB, double* C, const unsigned ldC) double* A, const unsigned ldA, double* B, const unsigned ldB, double* C, const unsigned ldC)
{ {
zgemm_(JOB_STR, JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, Z_ZERO, C, &ldC); } zgemm_(JOB_STR, JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, Z_ZERO, C, &ldC); }
// C += d A B, A is m x p, B is p x n // C += d A B, A is m x p, B is p x n
inline void gemma(unsigned m, unsigned p, unsigned n, double d, inline void gemma(unsigned m, unsigned p, unsigned n, double d,
...@@ -281,12 +281,12 @@ namespace FBlas { ...@@ -281,12 +281,12 @@ namespace FBlas {
inline void gemma(unsigned m, unsigned p, unsigned n, float d, inline void gemma(unsigned m, unsigned p, unsigned n, float d,
float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC)
{ sgemm_(JOB_STR, JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &S_ONE, C, &ldC); } { sgemm_(JOB_STR, JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &S_ONE, C, &ldC); }
inline void c_gemma(unsigned m, unsigned p, unsigned n, float* d, inline void c_gemma(unsigned m, unsigned p, unsigned n, float* d,
float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC)
{ cgemm_(JOB_STR, JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, C_ONE, C, &ldC); } { cgemm_(JOB_STR, JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, C_ONE, C, &ldC); }
inline void c_gemma(unsigned m, unsigned p, unsigned n, double* d, inline void c_gemma(unsigned m, unsigned p, unsigned n, double* d,
double* A, unsigned ldA, double* B, unsigned ldB, double* C, unsigned ldC) double* A, unsigned ldA, double* B, unsigned ldB, double* C, unsigned ldC)
{ zgemm_(JOB_STR, JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, Z_ONE, C, &ldC); } { zgemm_(JOB_STR, JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, Z_ONE, C, &ldC); }
// C = d A^T B, A is m x p, B is m x n // C = d A^T B, A is m x p, B is m x n
inline void gemtm(unsigned m, unsigned p, unsigned n, double d, inline void gemtm(unsigned m, unsigned p, unsigned n, double d,
...@@ -295,18 +295,18 @@ namespace FBlas { ...@@ -295,18 +295,18 @@ namespace FBlas {
inline void gemtm(unsigned m, unsigned p, unsigned n, float d, inline void gemtm(unsigned m, unsigned p, unsigned n, float d,
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ sgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &S_ZERO, C, &ldC); } { sgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &S_ZERO, C, &ldC); }
inline void c_gemtm(unsigned m, unsigned p, unsigned n, float* d, inline void c_gemtm(unsigned m, unsigned p, unsigned n, float* d,
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ cgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, C_ZERO, C, &ldC); } { cgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, C_ZERO, C, &ldC); }
inline void c_gemtm(unsigned m, unsigned p, unsigned n, double* d, inline void c_gemtm(unsigned m, unsigned p, unsigned n, double* d,
double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
{ zgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, Z_ZERO, C, &ldC); } { zgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, Z_ZERO, C, &ldC); }
inline void c_gemhm(unsigned m, unsigned p, unsigned n, float* d, // hermitialn transposed inline void c_gemhm(unsigned m, unsigned p, unsigned n, float* d, // hermitialn transposed
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ cgemm_(JOB_STR+7, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, C_ZERO, C, &ldC); } { cgemm_(JOB_STR+7, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, C_ZERO, C, &ldC); }
inline void c_gemhm(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed inline void c_gemhm(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed
double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
{ zgemm_(JOB_STR+7, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, Z_ZERO, C, &ldC); } { zgemm_(JOB_STR+7, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, Z_ZERO, C, &ldC); }
// C += d A^T B, A is m x p, B is m x n // C += d A^T B, A is m x p, B is m x n
inline void gemtma(unsigned m, unsigned p, unsigned n, double d, inline void gemtma(unsigned m, unsigned p, unsigned n, double d,
...@@ -315,19 +315,39 @@ namespace FBlas { ...@@ -315,19 +315,39 @@ namespace FBlas {
inline void gemtma(unsigned m, unsigned p, unsigned n, float d, inline void gemtma(unsigned m, unsigned p, unsigned n, float d,
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ sgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &S_ONE, C, &ldC); } { sgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &S_ONE, C, &ldC); }
inline void c_gemtma(unsigned m, unsigned p, unsigned n, float* d, inline void c_gemtma(unsigned m, unsigned p, unsigned n, float* d,
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ cgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, C_ONE, C, &ldC); } { cgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, C_ONE, C, &ldC); }
inline void c_gemtma(unsigned m, unsigned p, unsigned n, double* d, inline void c_gemtma(unsigned m, unsigned p, unsigned n, double* d,
double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
{ zgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, Z_ONE, C, &ldC); } { zgemm_(JOB_STR+1, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, Z_ONE, C, &ldC); }
inline void c_gemhma(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed inline void c_gemhma(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ cgemm_(JOB_STR+7, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, C_ONE, C, &ldC); } { cgemm_(JOB_STR+7, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, C_ONE, C, &ldC); }
inline void c_gemhma(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed inline void c_gemhma(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed
double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
{ zgemm_(JOB_STR+7, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, Z_ONE, C, &ldC); } { zgemm_(JOB_STR+7, JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, Z_ONE, C, &ldC); }
// C = d A B^T, A is m x p, B is n x p
inline void gemmt(unsigned m, unsigned p, unsigned n, double d,
double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
{ dgemm_(JOB_STR, JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &D_ZERO, C, &ldC); }
inline void gemmt(unsigned m, unsigned p, unsigned n, float d,
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ sgemm_(JOB_STR, JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &S_ZERO, C, &ldC); }
inline void c_gemmt(unsigned m, unsigned p, unsigned n, float d,
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ cgemm_(JOB_STR, JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, C_ZERO, C, &ldC); }
inline void c_gemmt(unsigned m, unsigned p, unsigned n, double d,
double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
{ zgemm_(JOB_STR, JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, Z_ZERO, C, &ldC); }
inline void c_gemmh(unsigned m, unsigned p, unsigned n, float d, // hermitian transposed
float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
{ cgemm_(JOB_STR, JOB_STR+7, &m, &n, &p, &d, A, &ldA, B, &ldB, C_ZERO, C, &ldC); }
inline void c_gemmh(unsigned m, unsigned p, unsigned n, double d, // hermitian transposed
double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
{ zgemm_(JOB_STR, JOB_STR+7, &m, &n, &p, &d, A, &ldA, B, &ldB, Z_ZERO, C, &ldC); }
......
...@@ -78,8 +78,8 @@ int main(int argc, char* argv[]) ...@@ -78,8 +78,8 @@ int main(int argc, char* argv[])
const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2); const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2);
const unsigned int NbThreads = FParameters::getValue(argc, argv, "-t", 1); const unsigned int NbThreads = FParameters::getValue(argc, argv, "-t", 1);
const unsigned int ORDER = 9; const unsigned int ORDER = 7;
const FReal epsilon = FReal(1e-9); const FReal epsilon = FReal(1e-7);
// set threads // set threads
omp_set_num_threads(NbThreads); omp_set_num_threads(NbThreads);
......
...@@ -93,7 +93,7 @@ int main(int, char **){ ...@@ -93,7 +93,7 @@ int main(int, char **){
// Leaf size // Leaf size
FReal width = 3.723; FReal width = FReal(3.723);
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
LeafClass X; LeafClass X;
......
...@@ -36,10 +36,11 @@ ...@@ -36,10 +36,11 @@
#include "../../Src/Kernels/Chebyshev/FChebParticle.hpp" #include "../../Src/Kernels/Chebyshev/FChebParticle.hpp"
#include "../../Src/Kernels/Chebyshev/FChebLeaf.hpp" #include "../../Src/Kernels/Chebyshev/FChebLeaf.hpp"
#include "../../Src/Kernels/Chebyshev/FChebInterpolator.hpp" #include "../../Src/Kernels/Chebyshev/FChebInterpolator.hpp"
#include "../../Src/Kernels/Chebyshev/FChebM2LHandler.hpp"
#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp" #include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
void applyM2M(FReal *const S, FReal *const w, const unsigned int n, FReal *const W, const unsigned int N)
{ FBlas::gemtva(n, N, FReal(1.), S, w, W); }
FReal computeL2norm(unsigned int N, FReal *const u, FReal *const v) FReal computeL2norm(unsigned int N, FReal *const u, FReal *const v)
...@@ -97,7 +98,7 @@ int main(int argc, char* argv[]) ...@@ -97,7 +98,7 @@ int main(int argc, char* argv[])
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
LeafClass X; LeafClass X;
FPoint cx(0., 0., 0.); FPoint cx(0., 0., 0.);
const long M = 10000; const long M = 1000;
std::cout << "Fill the leaf X of width " << width std::cout << "Fill the leaf X of width " << width
<< " centered at cx=[" << cx.getX() << "," << cx.getY() << "," << cx.getZ() << " centered at cx=[" << cx.getX() << "," << cx.getY() << "," << cx.getZ()
<< "] with M=" << M << " target particles" << std::endl; << "] with M=" << M << " target particles" << std::endl;
...@@ -116,7 +117,7 @@ int main(int argc, char* argv[]) ...@@ -116,7 +117,7 @@ int main(int argc, char* argv[])
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
LeafClass Y; LeafClass Y;
FPoint cy(FReal(2.)*width, 0., 0.); FPoint cy(FReal(2.)*width, 0., 0.);
const long N = 10000; const long N = 1000;
std::cout << "Fill the leaf Y of width " << width std::cout << "Fill the leaf Y of width " << width
<< " centered at cy=[" << cy.getX() << "," << cy.getY() << "," << cy.getZ() << " centered at cy=[" << cy.getX() << "," << cy.getY() << "," << cy.getZ()
<< "] with N=" << N << " target particles" << std::endl; << "] with N=" << N << " target particles" << std::endl;
...@@ -133,43 +134,25 @@ int main(int argc, char* argv[]) ...@@ -133,43 +134,25 @@ int main(int argc, char* argv[])
} }
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// approximative computation // approximative computation
const unsigned int ORDER = 5; const unsigned int ORDER = 10;
const FReal epsilon = FParameters::getValue(argc, argv, "-eps", 1e-5);
const unsigned int nnodes = TensorTraits<ORDER>::nnodes; const unsigned int nnodes = TensorTraits<ORDER>::nnodes;
typedef FChebInterpolator<ORDER> InterpolatorClass; typedef FChebInterpolator<ORDER> InterpolatorClass;
InterpolatorClass S; InterpolatorClass S;
// set compressed M2L operators std::cout << "\nCompute interactions approximatively, ORDER = " << ORDER << std::endl;
FChebM2LHandler<ORDER,MatrixKernelClass> M2L(epsilon); FReal W[nnodes]; // multipole expansion
//M2L.ComputeAndCompressAndSet(); // <- precompute M2L operators FReal F[nnodes]; // local expansion
M2L.ReadFromBinaryFileAndSet(); // <- read precomputed M2L operators from binary file for (unsigned int n=0; n<nnodes; ++n)
std::cout << "\nCompute interactions approximatively, ACC = (" << ORDER << ", " << epsilon << ") ..." << std::endl;
time.tic();
FReal W[nnodes * 2]; // multipole expansion
FReal F[nnodes * 2]; // local expansion
for (unsigned int n=0; n<nnodes*2; ++n)
W[n] = F[n] = FReal(0.); W[n] = F[n] = FReal(0.);
// Anterpolate: W_n = \sum_j^N S(y_j,\bar y_n) * w_j // Anterpolate: W_n = \sum_j^N S(y_j,\bar y_n) * w_j
time.tic();
S.applyP2M(cy, width, W, Y.getSrc()); // the multipole expansions are set to 0 in S.applyP2M S.applyP2M(cy, width, W, Y.getSrc()); // the multipole expansions are set to 0 in S.applyP2M
std::cout << "P2M done in " << time.tacAndElapsed() << "s" << std::endl;
// M2L (compressed)
const int diffx = int((cy.getX()-cx.getX()) / width);
const int diffy = int((cy.getY()-cx.getY()) / width);
const int diffz = int((cy.getZ()-cx.getZ()) / width);
const int transfer[3] = {diffx, diffy, diffz};
M2L.applyB(W, W+nnodes);
M2L.applyC(transfer, width, W+nnodes, F+nnodes);
M2L.applyU(F+nnodes, F);
//for (unsigned int n=0; n<nnodes; ++n) F[n] *= MatrixKernel.getScaleFactor(width);
/*
// M2L (direct) // M2L (direct)
FPoint rootsX[nnodes], rootsY[nnodes]; FPoint rootsX[nnodes], rootsY[nnodes];
FChebTensor<ORDER>::setRoots(cx, width, rootsX); FChebTensor<ORDER>::setRoots(cx, width, rootsX);
...@@ -179,13 +162,13 @@ int main(int argc, char* argv[]) ...@@ -179,13 +162,13 @@ int main(int argc, char* argv[])
for (unsigned int j=0; j<nnodes; ++j) for (unsigned int j=0; j<nnodes; ++j)
F[i] += MatrixKernel.evaluate(rootsX[i], rootsY[j]) * W[j]; F[i] += MatrixKernel.evaluate(rootsX[i], rootsY[j]) * W[j];
} }
*/
// Interpolate f_i = \sum_m^L S(x_i,\bar x_m) * F_m // Interpolate f_i = \sum_m^L S(x_i,\bar x_m) * F_m
S.applyL2PTotal(cx, width, F, X.getTargets()); time.tic();
//S.applyL2PTotal(cx, width, F, X.getTargets());
S.applyL2P(cx, width, F, X.getTargets());
std::cout << "L2P done in " << time.tacAndElapsed() << "s" << std::endl;
time.tac();
std::cout << "Done in " << time.elapsed() << "sec." << std::endl;
// ----------------------------------------------------- // -----------------------------------------------------
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
...@@ -221,6 +204,11 @@ int main(int argc, char* argv[]) ...@@ -221,6 +204,11 @@ int main(int argc, char* argv[])
iterX.gotoNext(); iterX.gotoNext();
} }
// for (unsigned int i=0; i<1; ++i)
// std::cout << f[i] << "\t" << approx_f[i] << "\t" << approx_f[i]/f[i] << std::endl;
std::cout << "\nRelative L2 error = " << computeL2norm( M, f, approx_f) << std::endl; std::cout << "\nRelative L2 error = " << computeL2norm( M, f, approx_f) << std::endl;
std::cout << "Relative Lmax error = " << computeINFnorm(M, f, approx_f) << "\n" << std::endl; std::cout << "Relative Lmax error = " << computeINFnorm(M, f, approx_f) << "\n" << std::endl;
......
...@@ -51,191 +51,337 @@ void applyl2l(FReal *const S, FReal *const F, const unsigned int n, FReal *const ...@@ -51,191 +51,337 @@ void applyl2l(FReal *const S, FReal *const F, const unsigned int n, FReal *const
/** /**
* In this file we show how to use octree * In this file we show how to use octree
*/ */
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
const unsigned int ORDER = 2; const unsigned int ORDER = 10;
const unsigned int nnodes = TensorTraits<ORDER>::nnodes; const unsigned int nnodes = TensorTraits<ORDER>::nnodes;
FPoint X[nnodes]; FPoint X[nnodes];
FChebTensor<ORDER>::setRoots(FPoint(0.,0.,0.), FReal(2.), X); FChebInterpolator<ORDER> Interpolator;
FPoint x[nnodes];
// const FPoint cx(-.5, -.5, -.5); {
// const FPoint cx(-.5, -.5, .5); FChebTensor<ORDER>::setRoots(FPoint(0.,0.,0.), FReal(2.), X);
// const FPoint cx(-.5, .5, -.5); FPoint x[nnodes];
// const FPoint cx(-.5, .5, .5);
// const FPoint cx( .5, -.5, -.5); // const FPoint cx(-.5, -.5, -.5);
// const FPoint cx( .5, -.5, .5); // const FPoint cx(-.5, -.5, .5);
// const FPoint cx( .5, .5, -.5); // const FPoint cx(-.5, .5, -.5);
const FPoint cx( .5, .5, .5); // const FPoint cx(-.5, .5, .5);
const FReal wx(1.); // const FPoint cx( .5, -.5, -.5);
FChebTensor<ORDER>::setRoots(cx, wx, x); // const FPoint cx( .5, -.5, .5);
// const FPoint cx( .5, .5, -.5);
FReal w[nnodes], f[nnodes]; const FPoint cx( .5, .5, .5);
for (unsigned int n=0; n<nnodes; ++n) { const FReal wx(1.);
w[n] = f[n] = FReal(n); FChebTensor<ORDER>::setRoots(cx, wx, x);
// std::cout << w[n] << "\t" << X[n] << "\t" << x[n] << std::endl;
} FReal w[nnodes], f[nnodes];
for (unsigned int n=0; n<nnodes; ++n) {
w[n] = f[n] = FReal(n);
// std::cout << w[n] << "\t" << X[n] << "\t" << x[n] << std::endl;
}
FReal coords[3][ORDER]; FReal coords[3][ORDER];
FChebTensor<ORDER>::setChebyshevRoots(cx, wx, coords); FChebTensor<ORDER>::setChebyshevRoots(cx, wx, coords);
// for (unsigned int n=0; n<ORDER; ++n) { // for (unsigned int n=0; n<ORDER; ++n) {
// std::cout << coords[0][n] << "\t" // std::cout << coords[0][n] << "\t"
// << coords[1][n] << "\t" // << coords[1][n] << "\t"
// << coords[2][n] << std::endl; // << coords[2][n] << std::endl;
// } // }
FReal S[3][ORDER*ORDER]; FReal S[3][ORDER*ORDER];
FChebInterpolator<ORDER> Interpolator; Interpolator.assembleInterpolator(ORDER, coords[0], S[0]);
Interpolator.assembleInterpolator(ORDER, coords[0], S[0]); Interpolator.assembleInterpolator(ORDER, coords[1], S[1]);
Interpolator.assembleInterpolator(ORDER, coords[1], S[1]); Interpolator.assembleInterpolator(ORDER, coords[2], S[2]);
Interpolator.assembleInterpolator(ORDER, coords[2], S[2]);
FReal Skron[nnodes * nnodes];
Interpolator.assembleInterpolator(nnodes, x, Skron);
// std::cout << std::endl;
// for (unsigned int i=0; i<ORDER; ++i) {
// for (unsigned int j=0; j<ORDER; ++j)
// std::cout << S[0][j*ORDER + i] << " "; FReal W0[nnodes];
// std::cout << std::endl; for (unsigned int i=0; i<nnodes; ++i) W0[i] = FReal(0.);
// } applyM2M(Skron, w, nnodes, W0, nnodes);
//
// std::cout << std::endl;
// for (unsigned int i=0; i<ORDER; ++i) { FReal F0[nnodes];
// for (unsigned int j=0; j<ORDER; ++j) for (unsigned int i=0; i<nnodes; ++i) F0[i] = FReal(0.);
// std::cout << S[1][j*ORDER + i] << " "; applyL2L(Skron, f, nnodes, F0, nnodes);
// std::cout << std::endl;
// }
//
// std::cout << std::endl;
// for (unsigned int i=0; i<ORDER; ++i) { unsigned int perm[3][nnodes];
// for (unsigned int j=0; j<ORDER; ++j) for (unsigned int i=0; i<ORDER; ++i) {
// std::cout << S[2][j*ORDER + i] << " "; for (unsigned int j=0; j<ORDER; ++j) {
// std::cout << std::endl; for (unsigned int k=0; k<ORDER; ++k) {
// } const unsigned int index = k*ORDER*ORDER + j*ORDER + i;
perm[0][index] = k*ORDER*ORDER + j*ORDER + i;
perm[1][index] = i*ORDER*ORDER + k*ORDER + j;
FReal Skron[nnodes * nnodes]; perm[2][index] = j*ORDER*ORDER + i*ORDER + k;
Interpolator.assembleInterpolator(nnodes, x, Skron); }
// std::cout << std::endl;
// for (unsigned int i=0; i<nnodes; ++i) {
// for (unsigned int j=0; j<nnodes; ++j)
// std::cout << Skron[j*nnodes + i] << " ";
// std::cout << std::endl;
// }
FReal W0[nnodes]; FBlas::setzero(nnodes, W0);
applyM2M(Skron, w, nnodes, W0, nnodes);
for (unsigned int n=0; n<nnodes; ++n)
std::cout << w[n] << "\t" << W0[n] << std::endl;
FReal F0[nnodes]; FBlas::setzero(nnodes, F0);
applyL2L(Skron, f, nnodes, F0, nnodes);
for (unsigned int n=0; n<nnodes; ++n)
std::cout << f[n] << "\t" << F0[n] << std::endl;
// std::cout << std::endl;
// for (unsigned int i=0; i<ORDER; ++i) {
// for (unsigned int j=0; j<ORDER*ORDER; ++j)
// std::cout << w[j*ORDER + i] << " ";
// std::cout << std::endl;
// }
unsigned int perm[3][nnodes];
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int k=0; k<ORDER; ++k) {
const unsigned int index = k*ORDER*ORDER + j*ORDER + i;
perm[0][index] = k*ORDER*ORDER + j*ORDER + i;
perm[1][index] = i*ORDER*ORDER + k*ORDER + j;
perm[2][index] = j*ORDER*ORDER + i*ORDER + k;
} }
} }
}
// for (unsigned int n=0; n<nnodes; ++n) // for (unsigned int n=0; n<nnodes; ++n)
// std::cout << perm[0][n] << "\t" << perm[1][n] << "\t" << perm[2][n] << std::endl; // std::cout << perm[0][n] << "\t" << perm[1][n] << "\t" << perm[2][n] << std::endl;
FReal W[nnodes]; FBlas::setzero(nnodes, W); FReal W[nnodes];
applym2m(S[0], w, ORDER, W, ORDER); for (unsigned int i=0; i<nnodes; ++i) W[i] = FReal(0.);
// std::cout << std::endl; applym2m(S[0], w, ORDER, W, ORDER);
// for (unsigned int i=0; i<ORDER; ++i) { for (unsigned int n=0; n<nnodes; ++n) w[n] = W[perm[1][n]];
// for (unsigned int j=0; j<ORDER*ORDER; ++j) applym2m(S[2], w, ORDER, W, ORDER);
// std::cout << W[j*ORDER + i] << " "; for (unsigned int n=0; n<nnodes; ++n) w[perm[1][n]] = W[perm[2][n]];
// std::cout << std::endl; applym2m(S[1], w, ORDER, W, ORDER);
// } for (unsigned int n=0; n<nnodes; ++n) w[perm[2][n]] = W[n];
// std::cout << std::endl; FReal m2m_error(0.);
for (unsigned int n=0; n<nnodes; ++n) {
for (unsigned int n=0; n<nnodes; ++n) //std::cout << n << "\t" << w[n] << " - " << W0[n] << " = " << w[n]-W0[n] << std::endl;
w[n] = W[perm[1][n]]; m2m_error += w[n] - W0[n];
applym2m(S[2], w, ORDER, W, ORDER); }
// for (unsigned int i=0; i<ORDER; ++i) {
// for (unsigned int j=0; j<ORDER*ORDER; ++j)
// std::cout << W[j*ORDER + i] << " ";
// std::cout << std::endl;
// }
// std::cout << std::endl;
for (unsigned int n=0; n<nnodes; ++n)
w[perm[1][n]] = W[perm[2][n]];
applym2m(S[1], w, ORDER, W, ORDER);
// for (unsigned int i=0; i<ORDER; ++i) {
// for (unsigned int j=0; j<ORDER*ORDER; ++j)
// std::cout << W[j*ORDER + i] << " ";
// std::cout << std::endl;
// }
// std::cout << std::endl;
for (unsigned int n=0; n<nnodes; ++n)
w[perm[2][n]] = W[n];
// for (unsigned int i=0; i<ORDER; ++i) {
// for (unsigned int j=0; j<ORDER*ORDER; ++j)
// std::cout << w[j*ORDER + i] << " ";
// std::cout << std::endl;
// }
// std::cout << std::endl;
std::cout << std::endl; std::cout << "ERROR M2M = " << m2m_error << std::endl;
FReal error(0.);
for (unsigned int n=0; n<nnodes; ++n) {
std::cout << n << "\t" << w[n] << " - " << W0[n] << " = " << w[n]-W0[n] << std::endl;
error += w[n] - W0[n];
}
std::cout << "\nERROR = " << error << std::endl << std::endl;
FReal F[nnodes];
for (unsigned int i=0; i<nnodes; ++i) F[i] = FReal(0.);
applyl2l(S[0], f, ORDER, F, ORDER);
for (unsigned int n=0; n<nnodes; ++n) f[n] = F[perm[1][n]];
applyl2l(S[2], f, ORDER, F, ORDER);
for (unsigned int n=0; n<nnodes; ++n) f[perm[1][n]] = F[perm[2][n]];
applyl2l(S[1], f, ORDER, F, ORDER);
for (unsigned int n=0; n<nnodes; ++n) f[perm[2][n]] = F[n];
FReal F[nnodes]; FBlas::setzero(nnodes, F); FReal l2l_error(0.);
applyl2l(S[0], f, ORDER, F, ORDER); for (unsigned int n=0; n<nnodes; ++n) {
//std::cout << n << "\t" << f[n] << " - " << F0[n] << " = " << f[n]-F0[n] << std::endl;
l2l_error += f[n] - F0[n];
}
for (unsigned int n=0; n<nnodes; ++n) std::cout << "ERROR L2L = " << l2l_error << std::endl;
f[n] = F[perm[1][n]];
applyl2l(S[2], f, ORDER, F, ORDER); }
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// P2M /////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
std::cout << "\n--------------------------------------\n"
<< "-- P2M -------------------------------\n"
<< "--------------------------------------" << std::endl;
const unsigned int M = 10;
FReal points[3][M];
FReal weights[M];
FPoint lp[M];
FReal equivW[nnodes];
{ ////////////////////////////////////////////////////////
const FReal FRandMax = FReal(RAND_MAX);
for(unsigned int p=0; p<M; ++p){
points[0][p] = (FReal(rand())/FRandMax - FReal(.5)) * FReal(2.);
points[1][p] = (FReal(rand())/FRandMax - FReal(.5)) * FReal(2.);
points[2][p] = (FReal(rand())/FRandMax - FReal(.5)) * FReal(2.);
weights[p] = FReal(rand())/FRandMax;
//std::cout << points[0][p] << "\t"
// << points[1][p] << "\t"
// << points[2][p] << "\t"
// << weights[p] << std::endl;
lp[p].setX(points[0][p]);
lp[p].setY(points[1][p]);
lp[p].setZ(points[2][p]);
//std::cout << lp[p] << std::endl;
}
} ////////////////////////////////////////////////////////
for (unsigned int n=0; n<nnodes; ++n)
f[perm[1][n]] = F[perm[2][n]]; {
applyl2l(S[1], f, ORDER, F, ORDER); for(unsigned int i=0; i<nnodes; ++i) equivW[i] = FReal(0.);
FReal Snorm[M * nnodes];
Interpolator.assembleInterpolator(M, lp, Snorm);
applyM2M(Snorm, weights, M, equivW, nnodes);
}
for (unsigned int n=0; n<nnodes; ++n) FReal W1;
f[perm[2][n]] = F[n]; FReal W2[3][ ORDER-1];
FReal W4[3][(ORDER-1)*(ORDER-1)];
FReal W8[ (ORDER-1)*(ORDER-1)*(ORDER-1)];
{ ////////////////////////////////////////////////////////
W1 = FReal(0.);
for(unsigned int i=0; i<ORDER-1; ++i)
W2[0][i] = W2[1][i] = W2[2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i)
W4[0][i] = W4[1][i] = W4[2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i)
W8[i] = FReal(0.);
// loop over source particles
for (unsigned int p=0; p<M; ++p) {
FReal T_of_x[3][ORDER];
T_of_x[0][0] = FReal(1.); T_of_x[0][1] = points[0][p];
T_of_x[1][0] = FReal(1.); T_of_x[1][1] = points[1][p];
T_of_x[2][0] = FReal(1.); T_of_x[2][1] = points[2][p];
const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop
const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop
const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop
for (unsigned int j=2; j<ORDER; ++j) {
T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops
T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops
T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops
}
W1 += weights[p]; // 1 flop
for (unsigned int i=1; i<ORDER; ++i) {
const FReal wx = weights[p] * T_of_x[0][i]; // 1 flop
const FReal wy = weights[p] * T_of_x[1][i]; // 1 flop
const FReal wz = weights[p] * T_of_x[2][i]; // 1 flop
W2[0][i-1] += wx; // 1 flop
W2[1][i-1] += wy; // 1 flop
W2[2][i-1] += wz; // 1 flop
for (unsigned int j=1; j<ORDER; ++j) {
const FReal wxy = wx * T_of_x[1][j]; // 1 flop
const FReal wxz = wx * T_of_x[2][j]; // 1 flop
const FReal wyz = wy * T_of_x[2][j]; // 1 flop
W4[0][(j-1)*(ORDER-1) + (i-1)] += wxy; // 1 flop
W4[1][(j-1)*(ORDER-1) + (i-1)] += wxz; // 1 flop
W4[2][(j-1)*(ORDER-1) + (i-1)] += wyz; // 1 flop
for (unsigned int k=1; k<ORDER; ++k) {
const FReal wxyz = wxy * T_of_x[2][k]; // 1 flop
W8[(k-1)*(ORDER-1)*(ORDER-1) + (j-1)*(ORDER-1) + (i-1)] += wxyz; // 1 flop
} // flops: (ORDER-1) * 2
} // flops: (ORDER-1) * (6 + (ORDER-1) * 2)
} // flops: (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2))
} // flops: M * (3 + (ORDER-2) * 6 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2)))
} ////////////////////////////////////////////////////////
FReal F2[3][ORDER];
FReal F4[3][ORDER*ORDER];
FReal F8[ ORDER*ORDER*ORDER];
{ ////////////////////////////////////////////////////////
//for(unsigned int i=0; i<ORDER; ++i)
// F2[0][i] = F2[1][i] = F2[2][i] = FReal(0.);
for(unsigned int i=0; i<ORDER*ORDER; ++i)
F4[0][i] = F4[1][i] = F4[2][i] = FReal(0.);
for(unsigned int i=0; i<ORDER*ORDER*ORDER; ++i)
F8[i] = FReal(0.);
FReal T_of_y[ORDER * (ORDER-1)];
for (unsigned int o=1; o<ORDER; ++o)
for (unsigned int j=0; j<ORDER; ++j)
T_of_y[(o-1)*ORDER + j] = FReal(FChebRoots<ORDER>::T(o, FReal(FChebRoots<ORDER>::roots[j])));
FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[0], F2[0]);
FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[1], F2[1]);
FBlas::gemv(ORDER, ORDER-1, FReal(1.), T_of_y, W2[2], F2[2]);
FReal C[ORDER * (ORDER-1)];
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[0], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[0], ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[1], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[1], ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, W4[2], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), T_of_y, ORDER, C, ORDER-1, F4[2], ORDER);
FReal D[ORDER * (ORDER-1) * (ORDER-1)];
FBlas::gemm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), T_of_y, ORDER, W8, ORDER-1, D, ORDER);
FReal E[(ORDER-1) * (ORDER-1) * ORDER];
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int m=0; m<ORDER-1; ++m) {
for (unsigned int n=0; n<ORDER-1; ++n) {
const unsigned int a = n*(ORDER-1)*ORDER + m*ORDER + i;
const unsigned int b = i*(ORDER-1)*(ORDER-1) + n*(ORDER-1) + m;
E[b] = D[a];
}
}
}
FReal F[ORDER * (ORDER-1) * ORDER];
FBlas::gemm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), T_of_y, ORDER, E, ORDER-1, F, ORDER);
FReal G[(ORDER-1) * ORDER * ORDER];
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int n=0; n<ORDER-1; ++n) {
const unsigned int a = i*(ORDER-1)*ORDER + n*ORDER + j;
const unsigned int b = j*ORDER*(ORDER-1) + i*(ORDER-1) + n;
G[b] = F[a];
}
}
}
FReal H[ORDER * ORDER * ORDER];
FBlas::gemm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), T_of_y, ORDER, G, ORDER-1, H, ORDER);
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int k=0; k<ORDER; ++k) {
const unsigned int a = j*ORDER*ORDER + i*ORDER + k;
const unsigned int b = j*ORDER*ORDER + k*ORDER + i;
F8[b] = H[a];
}
}
}
//for (unsigned int l=0; l<ORDER-1; ++l)
// for (unsigned int i=0; i<ORDER; ++i) {
// //F2[0][i] += T_of_y[l*ORDER + i] * W2[0][l];
// //F2[1][i] += T_of_y[l*ORDER + i] * W2[1][l];
// //F2[2][i] += T_of_y[l*ORDER + i] * W2[2][l];
//
// for (unsigned int m=0; m<ORDER-1; ++m)
// for (unsigned int j=0; j<ORDER; ++j) {
// //F4[0][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[0][m*(ORDER-1) + l];
// //F4[1][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[1][m*(ORDER-1) + l];
// //F4[2][j*ORDER + i] += T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * W4[2][m*(ORDER-1) + l];
//
// for (unsigned int n=0; n<ORDER-1; ++n)
// for (unsigned int k=0; k<ORDER; ++k)
// //F8[k*ORDER*ORDER + j*ORDER + i] +=
// // T_of_y[l*ORDER + i] * T_of_y[m*ORDER + j] * T_of_y[n*ORDER + k] *
// // W8[n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l];
// }
// }
} ////////////////////////////////////////////////////////
FReal W[nnodes];
{
//for (unsigned int i=0; i<nnodes; ++i) W[i] = FReal(0.);
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int k=0; k<ORDER; ++k) {
const unsigned int idx = k*ORDER*ORDER + j*ORDER + i;
W[idx] = (W1 +
FReal(2.) * (F2[0][i] + F2[1][j] + F2[2][k]) +
FReal(4.) * (F4[0][j*ORDER+i] + F4[1][k*ORDER+i] + F4[2][k*ORDER+j]) +
FReal(8.) * F8[idx]) / (ORDER*ORDER*ORDER);
}
}
}
}
std::cout << std::endl; std::cout << std::endl;
FReal ferror(0.); FReal p2m_error(0.);
for (unsigned int n=0; n<nnodes; ++n) { for (unsigned int i=0; i<nnodes; ++i) {
std::cout << n << "\t" << f[n] << " - " << F0[n] << " = " << f[n]-F0[n] << std::endl; p2m_error += W[i] - equivW[i];
ferror += f[n] - F0[n]; //std::cout << W[i] - equivW[i] << std::endl;
} }
std::cout << "ERROR P2M = " << p2m_error << std::endl;
std::cout << "\nERROR = " << ferror << std::endl << std::endl;
return 0; return 0;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment