Commit 0f163780 authored by Matthias Messner's avatar Matthias Messner

merged individual files from starpu_dev to master branch

parent 95b3fe18
......@@ -48,6 +48,12 @@ public:
data[2] = inZ;
}
explicit FTreeCoordinate(const int inPosition[3]) {
data[0] = inPosition[0];
data[1] = inPosition[1];
data[2] = inPosition[2];
}
/**
* Copy constructor
* @param other the source class to copy
......
......@@ -60,7 +60,6 @@ protected:
BoxCorner.getZ() + (FReal(Coordinate.getZ()) + FReal(.5)) * BoxWidthLeaf);
}
public:
/**
* The constructor initializes all constant attributes and it reads the
......@@ -80,6 +79,9 @@ public:
/* empty */
}
const InterpolatorClass *const getPtrToInterpolator() const
{ return Interpolator.getPtr(); }
virtual void P2M(CellClass* const LeafCell,
const ContainerClass* const SourceParticles) = 0;
......@@ -193,12 +195,12 @@ public:
private:
void directInteraction(ParticleClass& Target, const ParticleClass& Source) const // 21 overall flops
void directInteraction(ParticleClass& Target, const ParticleClass& Source) const // 34 overall flops
{
FPoint xy(Source.getPosition() - Target.getPosition()); // 3 flops
const FReal one_over_r = FReal(1.) / FMath::Sqrt(xy.getX()*xy.getX() +
xy.getY()*xy.getY() +
xy.getZ()*xy.getZ()); // 1 + 1 + 1 + 5 = 8 flops
xy.getZ()*xy.getZ()); // 1 + 15 + 5 = 21 flops
const FReal wt = Target.getPhysicalValue();
const FReal ws = Source.getPhysicalValue();
// potential
......@@ -208,12 +210,12 @@ private:
Target.incForces(xy.getX(), xy.getY(), xy.getZ()); // 3 flops
}
void directInteractionMutual(ParticleClass& Target, ParticleClass& Source) const // 26 overall flops
void directInteractionMutual(ParticleClass& Target, ParticleClass& Source) const // 39 overall flops
{
FPoint xy(Source.getPosition() - Target.getPosition()); // 3 flops
const FReal one_over_r = FReal(1.) / FMath::Sqrt(xy.getX()*xy.getX() +
xy.getY()*xy.getY() +
xy.getZ()*xy.getZ()); // 1 + 1 + 1 + 5 = 8 flops
xy.getZ()*xy.getZ()); // 1 + 15 + 5 = 21 flops
const FReal wt = Target.getPhysicalValue();
const FReal ws = Source.getPhysicalValue();
// potential
......
......@@ -37,7 +37,7 @@ class FChebFlopsSymKernel
: public FAbstractKernels<ParticleClass, CellClass, ContainerClass>
{
enum {nnodes = TensorTraits<ORDER>::nnodes};
public:
/// Handler to deal with all symmetries: Stores permutation indices and
/// vectors to reduce 343 different interactions to 16 only.
struct SymmetryHandler;
......@@ -46,31 +46,41 @@ class FChebFlopsSymKernel
const FSmartPointer<MatrixKernelClass,FSmartPointerMemory> MatrixKernel;
const FSmartPointer<SymmetryHandler, FSmartPointerMemory> SymHandler;
/// tree height
const unsigned int inTreeHeight;
// count permuted local and multipole expansions
unsigned int* countExp;
unsigned long long flopsP2M, flopsM2M, flopsM2L, flopsL2L, flopsL2P, flopsP2P;
unsigned long long *flopsPerLevelM2M, *flopsPerLevelM2L, *flopsPerLevelL2L;
// start flop counters
unsigned int countFlopsP2MorL2P(const unsigned int N) const
{ return 1 + N * (nnodes*(6*ORDER-2) + 6*ORDER + 6); }
//{ return N*nnodes*(15*ORDER-19) + ORDER*(3*ORDER-6) + 2; }
//{ return N * (nnodes*(24*ORDER-26) - 1); }
unsigned int countFlopsM2MorL2L() const
{ return nnodes * (nnodes*2 - 1); }
//unsigned int countFlopsL2PGradient(const unsigned int N) const
//{ return 3 * N * (nnodes*(17*ORDER-21) - 1); }
unsigned int countFlopsL2PTotal(const unsigned int N) const
{ return 7 + N * (nnodes*(12*ORDER-1) + 15*ORDER + 2); }
//{ return N * (nnodes*2 -1 ) + countFlopsL2PGradient(N); }
{ return 3 * nnodes * (2*ORDER-1); }
unsigned int countFlopsM2L(const unsigned int nexp, const unsigned int rank) const
{ return nexp * (4*nnodes*rank - rank - nnodes); }
unsigned int countFlopsP2P() const
{ return 21; }
{ return 34; }
unsigned int countFlopsP2Pmutual() const
{ return 26; }
{ return 39; }
unsigned int countFlopsP2M(const unsigned int N) const {
const unsigned first = N * (18 + (ORDER-2) * 6 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2)));
const unsigned W2 = 3 * ORDER*(2*(ORDER-1)-1);
const unsigned W4 = 3 * (ORDER*(ORDER-1)*(2*(ORDER-1)-1) + ORDER*ORDER*(2*(ORDER-1)-1));
const unsigned W8 = 3 * (2*(ORDER-1)-1) * (ORDER*(ORDER-1)*(ORDER-1) + ORDER*ORDER*(ORDER-1) + nnodes);
return first + W2 + W4 + W8 + nnodes*11;
}
unsigned int countFlopsL2PTotal(const unsigned int N) const {
const unsigned W0 = nnodes;
const unsigned W2 = 3 * (ORDER-1)*ORDER*ORDER * 2*ORDER;
const unsigned W4 = 3 * ORDER*(ORDER-1)*(ORDER-1) * 2*ORDER;
const unsigned W8 = (ORDER-1)*(ORDER-1)*(ORDER-1) * (2*ORDER-1);
const unsigned second = N * (38 + (ORDER-2)*15 + (ORDER-1)*((ORDER-1) * (27 + (ORDER-1) * 16))) + 6;
return W0 + W2 + W4 + W8 + second;
}
// end flop counters
public:
......@@ -79,14 +89,22 @@ public:
* precomputed and compressed M2L operators from a binary file (an
* runtime_error is thrown if the required file is not valid).
*/
FChebFlopsSymKernel(const int inTreeHeight,
FChebFlopsSymKernel(const int _inTreeHeight,
const FPoint& inBoxCenter,
const FReal inBoxWidth,
const FReal Epsilon)
: MatrixKernel(new MatrixKernelClass()),
SymHandler(new SymmetryHandler(MatrixKernel.getPtr(), Epsilon)),
flopsP2M(0), flopsM2M(0), flopsM2L(0), flopsL2L(0), flopsL2P(0), flopsP2P(0)
{ countExp = new unsigned int [343]; }
SymHandler(new SymmetryHandler(MatrixKernel.getPtr(), Epsilon)), inTreeHeight(_inTreeHeight),
flopsP2M(0), flopsM2M(0), flopsM2L(0), flopsL2L(0), flopsL2P(0), flopsP2P(0),
flopsPerLevelM2M(NULL), flopsPerLevelM2L(NULL), flopsPerLevelL2L(NULL)
{
countExp = new unsigned int [343];
flopsPerLevelM2M = new unsigned long long [inTreeHeight];
flopsPerLevelM2L = new unsigned long long [inTreeHeight];
flopsPerLevelL2L = new unsigned long long [inTreeHeight];
for (unsigned int level = 0; level<inTreeHeight; ++level)
flopsPerLevelM2M[level] = flopsPerLevelM2L[level] = flopsPerLevelL2L[level] = 0;
}
......@@ -110,26 +128,48 @@ public:
<< "\n- Flops for L2L = " << flopsL2L
<< "\n- Flops for L2P = " << flopsL2P
<< "\n- Flops for P2P = " << flopsP2P
<< "\n- Overall Flops = " << flopsP2M + flopsM2M + flopsM2L + flopsL2L + flopsL2P + flopsP2P
<< "\n==================================================\n"
<< std::endl;
std::cout << "\n=================================================="
<< "\n- Flops for M2M" << std::endl;
for (unsigned int level=0; level<inTreeHeight; ++level)
std::cout << " |- at level " << level << " flops = " << flopsPerLevelM2M[level] << std::endl;
std::cout << "=================================================="
<< "\n- Flops for M2L" << std::endl;
for (unsigned int level=0; level<inTreeHeight; ++level)
std::cout << " |- at level " << level << " flops = " << flopsPerLevelM2L[level] << std::endl;
std::cout << "=================================================="
<< "\n- Flops for L2L" << std::endl;
for (unsigned int level=0; level<inTreeHeight; ++level)
std::cout << " |- at level " << level << " flops = " << flopsPerLevelL2L[level] << std::endl;
std::cout << "==================================================" << std::endl;
if (flopsPerLevelM2M) delete [] flopsPerLevelM2M;
if (flopsPerLevelM2L) delete [] flopsPerLevelM2L;
if (flopsPerLevelL2L) delete [] flopsPerLevelL2L;
}
void P2M(CellClass* const /* not needed */, const ContainerClass* const SourceParticles)
{
flopsP2M += countFlopsP2MorL2P(SourceParticles->getSize());
flopsP2M += countFlopsP2M(SourceParticles->getSize());
}
void M2M(CellClass* const FRestrict /* not needed */,
const CellClass*const FRestrict *const FRestrict ChildCells,
const int /* not needed */)
const int TreeLevel)
{
unsigned int flops = 0;
for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex)
if (ChildCells[ChildIndex]) flopsM2M += countFlopsM2MorL2L();
if (ChildCells[ChildIndex]) flops += countFlopsM2MorL2L();
flopsM2M += flops;
flopsPerLevelM2M[TreeLevel] += flops;
}
......@@ -138,24 +178,30 @@ public:
void M2L(CellClass* const FRestrict /* not needed */,
const CellClass* SourceCells[343],
const int /* not needed */,
const int /* not needed*/)
const int TreeLevel)
{
unsigned int flops = 0;
// count how ofter each of the 16 interactions is used
memset(countExp, 0, sizeof(int) * 343);
for (unsigned int idx=0; idx<343; ++idx)
if (SourceCells[idx]) countExp[SymHandler->pindices[idx]]++;
// multiply (mat-mat-mul)
for (unsigned int pidx=0; pidx<343; ++pidx)
if (countExp[pidx]) flopsM2L += countFlopsM2L(countExp[pidx], SymHandler->LowRank[pidx]) + countExp[pidx]*nnodes;
if (countExp[pidx]) flops += countFlopsM2L(countExp[pidx], SymHandler->LowRank[pidx]) + countExp[pidx]*nnodes;
flopsM2L += flops;
flopsPerLevelM2L[TreeLevel] += flops;
}
void L2L(const CellClass* const FRestrict /* not needed */,
CellClass* FRestrict *const FRestrict ChildCells,
const int /* not needed */)
const int TreeLevel)
{
unsigned int flops = 0;
for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex)
if (ChildCells[ChildIndex]) flopsL2L += countFlopsM2MorL2L() + nnodes;
if (ChildCells[ChildIndex]) flops += countFlopsM2MorL2L() + nnodes;
flopsL2L += flops;
flopsPerLevelL2L[TreeLevel] += flops;
}
......
......@@ -412,8 +412,12 @@ public:
const FReal *const *const getChildParentInterpolator() const
{ return ChildParentInterpolator; }
const unsigned int *const getPermutationsM2ML2L(unsigned int i) const
{ return perm[i]; }
......@@ -487,16 +491,19 @@ public:
FReal *const ParentExpansion) const
{
FReal Exp[nnodes], PermExp[nnodes];
// ORDER*ORDER*ORDER * (2*ORDER-1)
FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
ChildParentInterpolator[ChildIndex], ORDER,
const_cast<FReal*>(ChildExpansion), ORDER, PermExp, ORDER);
for (unsigned int n=0; n<nnodes; ++n) Exp[n] = PermExp[perm[1][n]];
// ORDER*ORDER*ORDER * (2*ORDER-1)
FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
ChildParentInterpolator[ChildIndex] + 2 * ORDER*ORDER, ORDER,
Exp, ORDER, PermExp, ORDER);
for (unsigned int n=0; n<nnodes; ++n) Exp[perm[1][n]] = PermExp[perm[2][n]];
// ORDER*ORDER*ORDER * (2*ORDER-1)
FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
ChildParentInterpolator[ChildIndex] + 1 * ORDER*ORDER, ORDER,
Exp, ORDER, PermExp, ORDER);
......@@ -510,24 +517,26 @@ public:
FReal *const ChildExpansion) const
{
FReal Exp[nnodes], PermExp[nnodes];
// ORDER*ORDER*ORDER * (2*ORDER-1)
FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
ChildParentInterpolator[ChildIndex], ORDER,
const_cast<FReal*>(ParentExpansion), ORDER, PermExp, ORDER);
for (unsigned int n=0; n<nnodes; ++n) Exp[n] = PermExp[perm[1][n]];
// ORDER*ORDER*ORDER * (2*ORDER-1)
FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
ChildParentInterpolator[ChildIndex] + 2 * ORDER*ORDER, ORDER,
Exp, ORDER, PermExp, ORDER);
for (unsigned int n=0; n<nnodes; ++n) Exp[perm[1][n]] = PermExp[perm[2][n]];
// ORDER*ORDER*ORDER * (2*ORDER-1)
FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
ChildParentInterpolator[ChildIndex] + 1 * ORDER*ORDER, ORDER,
Exp, ORDER, PermExp, ORDER);
for (unsigned int n=0; n<nnodes; ++n) ChildExpansion[perm[2][n]] += PermExp[n];
}
// total flops count: 3 * ORDER*ORDER*ORDER * (2*ORDER-1)
};
......@@ -1124,40 +1133,40 @@ inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center,
// IMPORTANT: NOT CHANGE ORDER OF COMPUTATIONS!!! ////////////////
//////////////////////////////////////////////////////////////////
// W2[0] /////////////////
// W2[0] ///////////////// (ORDER-1)*ORDER*ORDER * 2*ORDER
FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER,
const_cast<FReal*>(localExpansion), ORDER, F2, ORDER-1);
for (unsigned int l=0; l<ORDER-1; ++l) { W2[0][l] = F2[l];
for (unsigned int j=1; j<ORDER*ORDER; ++j) W2[0][l] += F2[j*(ORDER-1) + l]; }
// W4[0] /////////////////
// W4[0] ///////////////// ORDER*(ORDER-1)*(ORDER-1) + 2*ORDER
perm5.permute(F2, F4);
FBlas::gemtm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1);
for (unsigned int l=0; l<ORDER-1; ++l)
for (unsigned int m=0; m<ORDER-1; ++m) { W4[0][m*(ORDER-1)+l] = G4[l*ORDER*(ORDER-1) + m];
for (unsigned int k=1; k<ORDER; ++k) W4[0][m*(ORDER-1)+l] += G4[l*ORDER*(ORDER-1) + k*(ORDER-1) + m]; }
// W8 ////////////////////
// W8 //////////////////// (ORDER-1)*(ORDER-1)*(ORDER-1) * (2*ORDER-1)
perm8.permute(G4, G8);
FReal F8[(ORDER-1)*(ORDER-1)*(ORDER-1)];
FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, G8, ORDER, F8, ORDER-1);
perm9.permute(F8, W8);
// W4[1] /////////////////
// W4[1] ///////////////// ORDER*(ORDER-1)*(ORDER-1) + 2*ORDER
perm6.permute(F2, F4);
FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1);
for (unsigned int l=0; l<ORDER-1; ++l)
for (unsigned int n=0; n<ORDER-1; ++n) { W4[1][n*(ORDER-1)+l] = G4[l*(ORDER-1) + n];
for (unsigned int j=1; j<ORDER; ++j) W4[1][n*(ORDER-1)+l] += G4[j*(ORDER-1)*(ORDER-1) + l*(ORDER-1) + n]; }
// W2[1] /////////////////
// W2[1] ///////////////// (ORDER-1)*ORDER*ORDER * 2*ORDER
perm3.permute(localExpansion, lE);
FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, lE, ORDER, F2, ORDER-1);
for (unsigned int i=0; i<ORDER-1; ++i) { W2[1][i] = F2[i];
for (unsigned int j=1; j<ORDER*ORDER; ++j) W2[1][i] += F2[j*(ORDER-1) + i]; }
// W4[2] /////////////////
// W4[2] ///////////////// ORDER*(ORDER-1)*(ORDER-1) + 2*ORDER
perm7.permute(F2, F4);
FBlas::gemtm(ORDER, ORDER-1, (ORDER-1)*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, F4, ORDER, G4, ORDER-1);
for (unsigned int m=0; m<ORDER-1; ++m)
for (unsigned int n=0; n<ORDER-1; ++n) { W4[2][n*(ORDER-1)+m] = G4[m*ORDER*(ORDER-1) + n];
for (unsigned int i=1; i<ORDER; ++i) W4[2][n*(ORDER-1)+m] += G4[m*ORDER*(ORDER-1) + i*(ORDER-1) + n]; }
// W2[2] /////////////////
// W2[2] ///////////////// (ORDER-1)*ORDER*ORDER * 2*ORDER
perm4.permute(localExpansion, lE);
FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, lE, ORDER, F2, ORDER-1);
for (unsigned int i=0; i<ORDER-1; ++i) { W2[2][i] = F2[i];
......@@ -1198,15 +1207,13 @@ inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center,
U_of_x[1][j] = y2 * U_of_x[1][j-1] - U_of_x[1][j-2]; // 2 flops
U_of_x[2][j] = z2 * U_of_x[2][j-1] - U_of_x[2][j-2]; // 2 flops
}
// scale, because dT_j/dx = jU_{j-1}
for (unsigned int j=2; j<ORDER; ++j) {
U_of_x[0][j-1] *= FReal(j); // 1 flops
U_of_x[1][j-1] *= FReal(j); // 1 flops
U_of_x[2][j-1] *= FReal(j); // 1 flops
}
}
} // 3 + (ORDER-2)*15
// apply P and increment forces
FReal potential = FReal(0.);
......@@ -1233,19 +1240,19 @@ inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center,
f4[3] += T_of_x[0][l] * U_of_x[2][m-1] * w4[1] + T_of_x[1][l] * U_of_x[2][m-1] * w4[2]; // 6 flops
for (unsigned int n=1; n<ORDER; ++n) {
const FReal w8 = W8[(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)];
f8[0] += T_of_x[0][l] * T_of_x[1][m] * T_of_x[2][n] * w8;
f8[1] += U_of_x[0][l-1] * T_of_x[1][m] * T_of_x[2][n] * w8;
f8[2] += T_of_x[0][l] * U_of_x[1][m-1] * T_of_x[2][n] * w8;
f8[3] += T_of_x[0][l] * T_of_x[1][m] * U_of_x[2][n-1] * w8;
} // ORDER * 4 flops
} // ORDER * (9 + ORDER * 4) flops
} // ORDER * (ORDER * (9 + ORDER * 4)) flops
f8[0] += T_of_x[0][l] * T_of_x[1][m] * T_of_x[2][n] * w8; // 4 flops
f8[1] += U_of_x[0][l-1] * T_of_x[1][m] * T_of_x[2][n] * w8; // 4 flops
f8[2] += T_of_x[0][l] * U_of_x[1][m-1] * T_of_x[2][n] * w8; // 4 flops
f8[3] += T_of_x[0][l] * T_of_x[1][m] * U_of_x[2][n-1] * w8; // 4 flops
} // (ORDER-1) * 16 flops
} // (ORDER-1) * (27 + (ORDER-1) * 16) flops
} // (ORDER-1) * ((ORDER-1) * (27 + (ORDER-1) * 16)) flops
}
potential = (f1 + FReal(2.)*f2[0] + FReal(4.)*f4[0] + FReal(8.)*f8[0]) / nnodes; // 7 flops
forces[0] = ( FReal(2.)*f2[1] + FReal(4.)*f4[1] + FReal(8.)*f8[1]) * jacobian[0] / nnodes; // 6 flops
forces[1] = ( FReal(2.)*f2[2] + FReal(4.)*f4[2] + FReal(8.)*f8[2]) * jacobian[1] / nnodes; // 6 flops
forces[2] = ( FReal(2.)*f2[3] + FReal(4.)*f4[3] + FReal(8.)*f8[3]) * jacobian[2] / nnodes; // 6 flops
} // 7 + ORDER * (ORDER * (9 + ORDER * 4)) flops
forces[0] = ( FReal(2.)*f2[1] + FReal(4.)*f4[1] + FReal(8.)*f8[1]) * jacobian[0] / nnodes; // 7 flops
forces[1] = ( FReal(2.)*f2[2] + FReal(4.)*f4[2] + FReal(8.)*f8[2]) * jacobian[1] / nnodes; // 7 flops
forces[2] = ( FReal(2.)*f2[3] + FReal(4.)*f4[3] + FReal(8.)*f8[3]) * jacobian[2] / nnodes; // 7 flops
} // 28 + (ORDER-1) * ((ORDER-1) * (27 + (ORDER-1) * 16)) flops
// set computed potential
iter.data().incPotential(potential); // 1 flop
......@@ -1257,7 +1264,7 @@ inline void FChebInterpolator<ORDER>::applyL2PTotal(const FPoint& center,
// increment target iterator
iter.gotoNext();
} // N * (7 + ORDER * (ORDER * (9 + ORDER * 4))) flops
} // N * (38 + (ORDER-2)*15 + (ORDER-1)*((ORDER-1) * (27 + (ORDER-1) * 16))) + 6 flops
}
......
......@@ -8,7 +8,7 @@
#include "./FAbstractChebKernel.hpp"
#include "./FChebInterpolator.hpp"
#include "./FChebSymmetries.hpp"
#include "./FChebSymM2LHandler.hpp"
class FTreeCoordinate;
......@@ -35,16 +35,12 @@ template <class ParticleClass, class CellClass, class ContainerClass, class Matr
class FChebSymKernel
: public FAbstractChebKernel<ParticleClass, CellClass, ContainerClass, MatrixKernelClass, ORDER>
{
typedef FAbstractChebKernel<ParticleClass, CellClass, ContainerClass, MatrixKernelClass, ORDER>
AbstractBaseClass;
typedef FAbstractChebKernel<ParticleClass, CellClass, ContainerClass, MatrixKernelClass, ORDER> AbstractBaseClass;
typedef SymmetryHandler<ORDER> SymmetryHandlerClass;
enum {nnodes = AbstractBaseClass::nnodes};
/// Handler to deal with all symmetries: Stores permutation indices and
/// vectors to reduce 343 different interactions to 16 only.
struct SymmetryHandler;
/// Needed for handling all symmetries
const FSmartPointer<SymmetryHandler,FSmartPointerMemory> SymHandler;
const FSmartPointer<SymmetryHandlerClass,FSmartPointerMemory> SymHandler;
// permuted local and multipole expansions
FReal** Loc;
......@@ -74,8 +70,8 @@ class FChebSymKernel
for (int k=0; k<=j; ++k) {
const unsigned int idx = (i+3)*7*7 + (j+3)*7 + (k+3);
assert(Mul[idx]==NULL || Loc[idx]==NULL);
Mul[idx] = new FReal [30 * nnodes];
Loc[idx] = new FReal [30 * nnodes];
Mul[idx] = new FReal [24 * nnodes];
Loc[idx] = new FReal [24 * nnodes];
}
}
......@@ -92,7 +88,7 @@ public:
const FReal inBoxWidth,
const FReal Epsilon)
: AbstractBaseClass(inTreeHeight, inBoxCenter, inBoxWidth),
SymHandler(new SymmetryHandler(AbstractBaseClass::MatrixKernel.getPtr(), Epsilon)),
SymHandler(new SymmetryHandlerClass(AbstractBaseClass::MatrixKernel.getPtr(), Epsilon)),
Loc(NULL), Mul(NULL), countExp(NULL)
{
this->allocateMemoryForPermutedExpansions();
......@@ -124,6 +120,10 @@ public:
}
const SymmetryHandlerClass *const getPtrToSymHandler() const
{ return SymHandler.getPtr(); }
void P2M(CellClass* const LeafCell,
const ContainerClass* const SourceParticles)
......@@ -174,15 +174,17 @@ public:
}
// multiply (mat-mat-mul)
FReal Compressed [nnodes * 30];
FReal Compressed [nnodes * 24];
const FReal CellWidth(AbstractBaseClass::BoxWidth / FReal(FMath::pow(2, TreeLevel)));
const FReal scale(AbstractBaseClass::MatrixKernel->getScaleFactor(CellWidth));
for (unsigned int pidx=0; pidx<343; ++pidx) {
const unsigned int count = countExp[pidx];
if (count) {
const unsigned int rank = SymHandler->LowRank[pidx];
// rank * count * (2*nnodes-1) flops
FBlas::gemtm(nnodes, rank, count, FReal(1.),
SymHandler->K[pidx]+rank*nnodes, nnodes, Mul[pidx], nnodes, Compressed, rank);
// nnodes *count * (2*rank-1) flops
FBlas::gemm( nnodes, rank, count, scale,
SymHandler->K[pidx], nnodes, Compressed, rank, Loc[pidx], nnodes);
}
......@@ -197,6 +199,7 @@ public:
const unsigned int count = (countExp[pidx])++;
const FReal *const loc = Loc[pidx] + count*nnodes;
const unsigned int *const pvec = SymHandler->pvectors[idx];
// nnodes flops
for (unsigned int n=0; n<nnodes; ++n)
LocalExpansion[n] += loc[pvec[n]];
}
......@@ -345,137 +348,6 @@ public:
/**
* Handler to deal with all symmetries: Stores permutation indices and vectors
* to reduce 343 different interactions to 16 only.
*/
template <class ParticleClass,
class CellClass,
class ContainerClass,
class MatrixKernelClass,
int ORDER>
struct FChebSymKernel<ParticleClass, CellClass, ContainerClass, MatrixKernelClass, ORDER>
::SymmetryHandler
{
// M2L operators
FReal* K[343];
int LowRank[343];
// permutation vectors and permutated indices
unsigned int pvectors[343][nnodes];
unsigned int pindices[343];
/** Constructor */
SymmetryHandler(const MatrixKernelClass *const MatrixKernel, const double Epsilon)
{
// init all 343 item to zero, because effectively only 16 exist
for (unsigned int t=0; t<343; ++t) {
K[t] = NULL;
LowRank[t] = 0;
}
// set permutation vector and indices
const FChebSymmetries<ORDER> Symmetries;
for (int i=-3; i<=3; ++i)
for (int j=-3; j<=3; ++j)
for (int k=-3; k<=3; ++k)
if (abs(i)>1 || abs(j)>1 || abs(k)>1) {
const unsigned int idx = ((i+3) * 7 + (j+3)) * 7 + (k+3);
pindices[idx] = Symmetries.getPermutationArrayAndIndex(i,j,k, pvectors[idx]);
}
// precompute 16 M2L operators
this->precomputeSVD(MatrixKernel, Epsilon);
}
/** Destructor */
~SymmetryHandler()
{
for (unsigned int t=0; t<343; ++t)
if (K[ t]!=NULL) delete [] K[ t];
}
private:
void precomputeSVD(const MatrixKernelClass *const MatrixKernel, const double Epsilon)
{
// interpolation points of source (Y) and target (X) cell
FPoint X[nnodes], Y[nnodes];
// set roots of target cell (X)
FChebTensor<ORDER>::setRoots(FPoint(0.,0.,0.), FReal(2.), X);
// temporary matrix
FReal* U = new FReal [nnodes*nnodes];
// needed for the SVD
const unsigned int LWORK = 2 * (3*nnodes + nnodes);
FReal *const WORK = new FReal [LWORK];
FReal *const VT = new FReal [nnodes*nnodes];
FReal *const S = new FReal [nnodes];
unsigned int counter = 0;
for (int i=2; i<=3; ++i) {
for (int j=0; j<=i; ++j) {
for (int k=0; k<=j; ++k) {
// assemble matrix
const FPoint cy(FReal(2.*i), FReal(2.*j), FReal(2.*k));
FChebTensor<ORDER>::setRoots(cy, FReal(2.), Y);
for (unsigned int n=0; n<nnodes; ++n)
for (unsigned int m=0; m<nnodes; ++m)
U[n*nnodes + m] = MatrixKernel->evaluate(X[m], Y[n]);
// applying weights ////////////////////////////////////////
FReal weights[nnodes];
FChebTensor<ORDER>::setRootOfWeights(weights);
for (unsigned int n=0; n<nnodes; ++n) {
FBlas::scal(nnodes, weights[n], U + n, nnodes); // scale rows
FBlas::scal(nnodes, weights[n], U + n * nnodes); // scale cols
}
//////////////////////////////////////////////////////////
// truncated singular value decomposition of matrix
const unsigned int info = FBlas::gesvd(nnodes, nnodes, U, S, VT, nnodes, LWORK, WORK);
if (info!=0) throw std::runtime_error("SVD did not converge with " + info);
const unsigned int rank = getRank<ORDER>(S, Epsilon);