Commit 834d15a0 authored by messner's avatar messner
Browse files

optimized P2M and L2P (for potential and forces) by using recurrence

representation of chebyshev polynomials and by unroling loops


git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/scalfmm/scalfmm/trunk@425 2616d619-271b-44dc-8df4-d4a8f33a7222
parent e7c67b81
......@@ -146,50 +146,7 @@ public:
void applyP2M(const F3DPosition& center,
const FReal width,
FReal *const multipoleExpansion,
const ContainerClass *const sourceParticles) const
{
// setup global to local mapping
const map_glob_loc map(center, width);
F3DPosition localPosition;
FReal T_of_x[ORDER][3];
// set all multipole expansions to zero
for (unsigned int n=0; n<nnodes; ++n) multipoleExpansion[n] = FReal(0.);
// loop: source particles
typename ContainerClass::ConstBasicIterator iter(*sourceParticles);
while(iter.hasNotFinished()){
// map global position to [-1,1]
map(iter.data().getPosition(), localPosition);
// get source value
const FReal sourceValue = iter.data().getPhysicalValue();
// evaluate chebyshev polynomials of source particle: T_o(x_i)
for (unsigned int o=1; o<ORDER; ++o) {
T_of_x[o][0] = BasisType::T(o, localPosition.getX());
T_of_x[o][1] = BasisType::T(o, localPosition.getY());
T_of_x[o][2] = BasisType::T(o, localPosition.getZ());
}
// anterpolate
for (unsigned int n=0; n<nnodes; ++n) {
//multipoleExpansion[n] = FReal(0.);
FReal S = FReal(1.);
for (unsigned int d=0; d<3; ++d) {
const unsigned int j = node_ids[n][d];
FReal S_d = FReal(1.) / ORDER;
for (unsigned int o=1; o<ORDER; ++o)
S_d += FReal(2.) / ORDER * T_of_x[o][d] * T_of_roots[o][j];
S *= S_d;
}
multipoleExpansion[n] += S * sourceValue;
}
iter.gotoNext();
} // end loop: source particles
}
const ContainerClass *const sourceParticles) const;
......@@ -200,47 +157,7 @@ public:
void applyL2P(const F3DPosition& center,
const FReal width,
const FReal *const localExpansion,
ContainerClass *const localParticles) const
{
// setup local to global mapping
const map_glob_loc map(center, width);
F3DPosition localPosition;
FReal T_of_x[ORDER][3];
typename ContainerClass::BasicIterator iter(*localParticles);
while(iter.hasNotFinished()){
// map global position to [-1,1]
map(iter.data().getPosition(), localPosition);
// get target value
FReal targetValue = iter.data().getPotential();
// evaluate chebyshev polynomials of source particle: T_o(x_i)
for (unsigned int o=1; o<ORDER; ++o) {
T_of_x[o][0] = BasisType::T(o, localPosition.getX());
T_of_x[o][1] = BasisType::T(o, localPosition.getY());
T_of_x[o][2] = BasisType::T(o, localPosition.getZ());
}
// interpolate and increment target value
for (unsigned int n=0; n<nnodes; ++n) {
FReal S = FReal(1.);
for (unsigned int d=0; d<3; ++d) {
const unsigned int j = node_ids[n][d];
FReal S_d = FReal(1.) / ORDER;
for (unsigned int o=1; o<ORDER; ++o)
S_d += FReal(2.) / ORDER * T_of_x[o][d] * T_of_roots[o][j];
S *= S_d;
}
targetValue += S * localExpansion[n];
}
iter.data().setPotential(targetValue);
iter.gotoNext();
}
}
ContainerClass *const localParticles) const;
/**
......@@ -250,70 +167,7 @@ public:
void applyL2PGradient(const F3DPosition& center,
const FReal width,
const FReal *const localExpansion,
ContainerClass *const localParticles) const
{
// setup local to global mapping
const map_glob_loc map(center, width);
F3DPosition Jacobian;
map.computeJacobian(Jacobian);
const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()};
F3DPosition localPosition;
FReal T_of_x[ORDER][3];
FReal U_of_x[ORDER][3];
typename ContainerClass::BasicIterator iter(*localParticles);
while(iter.hasNotFinished()){
// map global position to [-1,1]
map(iter.data().getPosition(), localPosition);
// get target value
FReal forces[3] = {iter.data().getForces().getX(),
iter.data().getForces().getY(),
iter.data().getForces().getZ()};
// evaluate chebyshev polynomials of source particle
for (unsigned int o=1; o<ORDER; ++o) {
// T_o(x_i)
T_of_x[o][0] = BasisType::T(o, localPosition.getX());
T_of_x[o][1] = BasisType::T(o, localPosition.getY());
T_of_x[o][2] = BasisType::T(o, localPosition.getZ());
// T_o(x_i)
U_of_x[o][0] = BasisType::U(o, localPosition.getX());
U_of_x[o][1] = BasisType::U(o, localPosition.getY());
U_of_x[o][2] = BasisType::U(o, localPosition.getZ());
}
// apply P
for (unsigned int n=0; n<nnodes; ++n) {
for (unsigned int i=0; i<3; ++i) {
FReal P = FReal(1.);
for (unsigned int d=0; d<3; ++d) {
const unsigned int j = node_ids[n][d];
FReal P_d;
if (d==i) {
P_d = 0.;
for (unsigned int o=1; o<ORDER; ++o)
P_d += 2. / ORDER * U_of_x[o][d] * T_of_roots[o][j] * jacobian[d];
} else {
P_d = 1. / ORDER;
for (unsigned int o=1; o<ORDER; ++o)
P_d += 2. / ORDER * T_of_x[o][d] * T_of_roots[o][j];
}
P *= P_d;
}
// the minus sign comes due to the \f$- \nabla_x K(x,y) = \nabla_y K(x,y) = \bar K(x,y)\f$
forces[i] -= P * localExpansion[n];
}
}
iter.data().setForces(forces[0] * iter.data().getPhysicalValue(),
forces[1] * iter.data().getPhysicalValue(),
forces[2] * iter.data().getPhysicalValue());
// increment iterator
iter.gotoNext();
}
}
ContainerClass *const localParticles) const;
void applyM2M(const unsigned int ChildIndex,
......@@ -338,4 +192,250 @@ public:
/**
* Particle to moment: application of \f$S_\ell(y,\bar y_n)\f$
* (anterpolation, it is the transposed interpolation)
*/
template <int ORDER>
template <class ContainerClass>
inline void FChebInterpolator<ORDER>::applyP2M(const F3DPosition& center,
const FReal width,
FReal *const multipoleExpansion,
const ContainerClass *const sourceParticles) const
{
// set all multipole expansions to zero
FBlas::setzero(nnodes, multipoleExpansion);
// allocate stuff
const map_glob_loc map(center, width);
F3DPosition localPosition;
FReal T_of_x[ORDER][3];
FReal S[3];
// loop over source particles
typename ContainerClass::ConstBasicIterator iter(*sourceParticles);
while(iter.hasNotFinished()){
// map global position to [-1,1]
map(iter.data().getPosition(), localPosition);
// evaluate chebyshev polynomials of source particle: T_o(x_i)
T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX();
T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY();
T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ();
for (unsigned int o=2; o<ORDER; ++o) {
T_of_x[o][0] = FReal(2.) * localPosition.getX() * T_of_x[o-1][0] - T_of_x[o-2][0];
T_of_x[o][1] = FReal(2.) * localPosition.getY() * T_of_x[o-1][1] - T_of_x[o-2][1];
T_of_x[o][2] = FReal(2.) * localPosition.getZ() * T_of_x[o-1][2] - T_of_x[o-2][2];
}
// anterpolate
const FReal sourceValue = iter.data().getPhysicalValue();
for (unsigned int n=0; n<nnodes; ++n) {
const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
S[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
S[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
S[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
for (unsigned int o=2; o<ORDER; ++o) {
S[0] += T_of_x[o][0] * T_of_roots[o][j[0]];
S[1] += T_of_x[o][1] * T_of_roots[o][j[1]];
S[2] += T_of_x[o][2] * T_of_roots[o][j[2]];
}
// gather contributions
S[0] *= FReal(2.); S[0] += FReal(1.);
S[1] *= FReal(2.); S[1] += FReal(1.);
S[2] *= FReal(2.); S[2] += FReal(1.);
multipoleExpansion[n] += FReal(1.) / nnodes * S[0] * S[1] * S[2] * sourceValue;
}
// increment source iterator
iter.gotoNext();
}
}
/**
* Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
*/
template <int ORDER>
template <class ContainerClass>
inline void FChebInterpolator<ORDER>::applyL2P(const F3DPosition& center,
const FReal width,
const FReal *const localExpansion,
ContainerClass *const localParticles) const
{
// allocate stuff
const map_glob_loc map(center, width);
F3DPosition localPosition;
FReal T_of_x[ORDER][3];
FReal S[3];
typename ContainerClass::BasicIterator iter(*localParticles);
while(iter.hasNotFinished()){
// map global position to [-1,1]
map(iter.data().getPosition(), localPosition);
// evaluate chebyshev polynomials of source particle: T_o(x_i)
T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX();
T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY();
T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ();
for (unsigned int o=2; o<ORDER; ++o) {
T_of_x[o][0] = FReal(2.) * localPosition.getX() * T_of_x[o-1][0] - T_of_x[o-2][0];
T_of_x[o][1] = FReal(2.) * localPosition.getY() * T_of_x[o-1][1] - T_of_x[o-2][1];
T_of_x[o][2] = FReal(2.) * localPosition.getZ() * T_of_x[o-1][2] - T_of_x[o-2][2];
}
// interpolate and increment target value
FReal targetValue = iter.data().getPotential();
for (unsigned int n=0; n<nnodes; ++n) {
const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
S[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
S[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
S[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
for (unsigned int o=2; o<ORDER; ++o) {
S[0] += T_of_x[o][0] * T_of_roots[o][j[0]];
S[1] += T_of_x[o][1] * T_of_roots[o][j[1]];
S[2] += T_of_x[o][2] * T_of_roots[o][j[2]];
}
// gather contributions
S[0] *= FReal(2.); S[0] += FReal(1.);
S[1] *= FReal(2.); S[1] += FReal(1.);
S[2] *= FReal(2.); S[2] += FReal(1.);
targetValue += FReal(1.) / nnodes * S[0] * S[1] * S[2] * localExpansion[n];
}
// set potential
iter.data().setPotential(targetValue);
// increment target iterator
iter.gotoNext();
}
}
/**
* Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
*/
template <int ORDER>
template <class ContainerClass>
inline void FChebInterpolator<ORDER>::applyL2PGradient(const F3DPosition& center,
const FReal width,
const FReal *const localExpansion,
ContainerClass *const localParticles) const
{
// setup local to global mapping
const map_glob_loc map(center, width);
F3DPosition Jacobian;
map.computeJacobian(Jacobian);
const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()};
F3DPosition localPosition;
FReal T_of_x[ORDER][3];
FReal U_of_x[ORDER][3];
FReal P[3];
typename ContainerClass::BasicIterator iter(*localParticles);
while(iter.hasNotFinished()){
// map global position to [-1,1]
map(iter.data().getPosition(), localPosition);
// evaluate chebyshev polynomials of source particle
// T_0(x_i) and T_1(x_i)
T_of_x[0][0] = FReal(1.); T_of_x[1][0] = localPosition.getX();
T_of_x[0][1] = FReal(1.); T_of_x[1][1] = localPosition.getY();
T_of_x[0][2] = FReal(1.); T_of_x[1][2] = localPosition.getZ();
// U_0(x_i) and U_1(x_i)
U_of_x[0][0] = FReal(1.); U_of_x[1][0] = localPosition.getX() * FReal(2.);
U_of_x[0][1] = FReal(1.); U_of_x[1][1] = localPosition.getY() * FReal(2.);
U_of_x[0][2] = FReal(1.); U_of_x[1][2] = localPosition.getZ() * FReal(2.);
for (unsigned int o=2; o<ORDER; ++o) {
// T_o(x_i)
T_of_x[o][0] = FReal(2.)*localPosition.getX()*T_of_x[o-1][0] - T_of_x[o-2][0];
T_of_x[o][1] = FReal(2.)*localPosition.getY()*T_of_x[o-1][1] - T_of_x[o-2][1];
T_of_x[o][2] = FReal(2.)*localPosition.getZ()*T_of_x[o-1][2] - T_of_x[o-2][2];
// U_o(x_i)
U_of_x[o][0] = FReal(2.)*localPosition.getX()*U_of_x[o-1][0] - U_of_x[o-2][0];
U_of_x[o][1] = FReal(2.)*localPosition.getY()*U_of_x[o-1][1] - U_of_x[o-2][1];
U_of_x[o][2] = FReal(2.)*localPosition.getZ()*U_of_x[o-1][2] - U_of_x[o-2][2];
}
// scale, because dT_o/dx = oU_{o-1}
for (unsigned int o=2; o<ORDER; ++o) {
U_of_x[o-1][0] *= FReal(o);
U_of_x[o-1][1] *= FReal(o);
U_of_x[o-1][2] *= FReal(o);
}
// apply P and increment forces
FReal forces[3] = {FReal(0.), FReal(0.), FReal(0.)};
for (unsigned int n=0; n<nnodes; ++n) {
// tensor indices of chebyshev nodes
const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
// f0 component //////////////////////////////////////
P[0] = U_of_x[0][0] * T_of_roots[1][j[0]];
P[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
P[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
for (unsigned int o=2; o<ORDER; ++o) {
P[0] += U_of_x[o-1][0] * T_of_roots[o][j[0]];
P[1] += T_of_x[o ][1] * T_of_roots[o][j[1]];
P[2] += T_of_x[o ][2] * T_of_roots[o][j[2]];
}
P[0] *= FReal(2.);
P[1] *= FReal(2.); P[1] += FReal(1.);
P[2] *= FReal(2.); P[2] += FReal(1.);
forces[0] -= P[0] * P[1] * P[2] * localExpansion[n];
// f1 component //////////////////////////////////////
P[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
P[1] = U_of_x[0][1] * T_of_roots[1][j[1]];
P[2] = T_of_x[1][2] * T_of_roots[1][j[2]];
for (unsigned int o=2; o<ORDER; ++o) {
P[0] += T_of_x[o ][0] * T_of_roots[o][j[0]];
P[1] += U_of_x[o-1][1] * T_of_roots[o][j[1]];
P[2] += T_of_x[o ][2] * T_of_roots[o][j[2]];
}
P[0] *= FReal(2.); P[0] += FReal(1.);
P[1] *= FReal(2.);
P[2] *= FReal(2.); P[2] += FReal(1.);
forces[1] -= P[0] * P[1] * P[2] * localExpansion[n];
// f2 component //////////////////////////////////////
P[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
P[1] = T_of_x[1][1] * T_of_roots[1][j[1]];
P[2] = U_of_x[0][2] * T_of_roots[1][j[2]];
for (unsigned int o=2; o<ORDER; ++o) {
P[0] += T_of_x[o ][0] * T_of_roots[o][j[0]];
P[1] += T_of_x[o ][1] * T_of_roots[o][j[1]];
P[2] += U_of_x[o-1][2] * T_of_roots[o][j[2]];
}
P[0] *= FReal(2.); P[0] += FReal(1.);
P[1] *= FReal(2.); P[1] += FReal(1.);
P[2] *= FReal(2.);
forces[2] -= P[0] * P[1] * P[2] * localExpansion[n];
}
// scale forces
forces[0] *= jacobian[0] / nnodes;
forces[1] *= jacobian[1] / nnodes;
forces[2] *= jacobian[2] / nnodes;
// set computed forces
iter.data().incForces(forces[0] * iter.data().getPhysicalValue(),
forces[1] * iter.data().getPhysicalValue(),
forces[2] * iter.data().getPhysicalValue());
// increment iterator
iter.gotoNext();
}
}
#endif
......@@ -79,12 +79,12 @@ int main(int, char **){
////////////////////////////////////////////////////////////////////
LeafClass X;
F3DPosition cx(0., 0., 0.);
const unsigned long M = 10000;
const unsigned long M = 20000;
std::cout << "Fill the leaf X of width " << width
<< " centered at cx=" << cx << " with M=" << M << " target particles" << std::endl;
{
FChebParticle particle;
for(long i=0; i<M; ++i){
for(unsigned long i=0; i<M; ++i){
FReal x = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getX();
FReal y = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getY();
FReal z = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getZ();
......@@ -98,12 +98,12 @@ int main(int, char **){
////////////////////////////////////////////////////////////////////
LeafClass Y;
F3DPosition cy(FReal(2.)*width, 0., 0.);
const unsigned long N = 10000;
const unsigned long N = 20000;
std::cout << "Fill the leaf Y of width " << width
<< " centered at cy=" << cy << " with N=" << N << " target particles" << std::endl;
{
FChebParticle particle;
for(long i=0; i<N; ++i){
for(unsigned long i=0; i<N; ++i){
FReal x = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getX();
FReal y = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getY();
FReal z = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getZ();
......@@ -117,19 +117,22 @@ int main(int, char **){
////////////////////////////////////////////////////////////////////
// approximative computation
const unsigned int ORDER = 8;
const unsigned int ORDER = 10;
const unsigned int nnodes = TensorTraits<ORDER>::nnodes;
typedef FChebInterpolator<ORDER> InterpolatorClass;
InterpolatorClass S;
std::cout << "\nCompute interactions approximatively, interpolation order = " << ORDER << " ..."
<< std::endl;
time.tic();
std::cout << "\nCompute interactions approximatively, interpolation order = " << ORDER << " ..." << std::endl;
std::cout << "\nP2M ... " << std::flush;
time.tic();
// Anterpolate: W_n = \sum_j^N S(y_j,\bar y_n) * w_j
FReal W[nnodes]; // multipole expansion
S.applyP2M(cy, width, W, Y.getSrc()); // the multipole expansions are set to 0 in S.applyP2M
std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;
std::cout << "M2L ... " << std::flush;
time.tic();
// Multipole to local: F_m = \sum_n^L K(\bar x_m, \bar y_n) * W_n
F3DPosition rootsX[nnodes], rootsY[nnodes];
FChebTensor<ORDER>::setRoots(cx, width, rootsX);
......@@ -141,18 +144,21 @@ int main(int, char **){
for (unsigned int j=0; j<nnodes; ++j)
F[i] += MatrixKernel.evaluate(rootsX[i], rootsY[j]) * W[j];
}
std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;
std::cout << "L2P (potential) ... " << std::flush;
time.tic();
// Interpolate p_i = \sum_m^L S(x_i,\bar x_m) * F_m
S.applyL2P(cx, width, F, X.getTargets());
std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;
std::cout << "L2P (forces) ... " << std::flush;
time.tic();
// Interpolate f_i = \sum_m^L P(x_i,\bar x_m) * F_m
S.applyL2PGradient(cx, width, F, X.getTargets());
std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;
time.tac();
std::cout << "Done in " << time.elapsed() << "sec." << std::endl;
// -----------------------------------------------------
////////////////////////////////////////////////////////////////////
// direct computation
std::cout << "Compute interactions directly ..." << std::endl;
......@@ -228,7 +234,7 @@ int main(int, char **){
delete [] p;
delete [] approx_f;
delete [] f;
return 0;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment