Commit 6e0dfd84 authored by BLANCHARD Pierre's avatar BLANCHARD Pierre
Browse files

Fixed multiRhs utest for ChebFMM.

parent 94a96644
......@@ -45,7 +45,7 @@ class FAbstractChebKernel : public FAbstractKernels< CellClass, ContainerClass>
{
protected:
enum {nnodes = TensorTraits<ORDER>::nnodes};
typedef FChebInterpolator<ORDER,MatrixKernelClass> InterpolatorClass;
typedef FChebInterpolator<ORDER,MatrixKernelClass,NVALS> InterpolatorClass;
/// Needed for P2M, M2M, L2L and L2P operators
const FSmartPointer<InterpolatorClass,FSmartPointerMemory> Interpolator;
......
......@@ -41,14 +41,15 @@
* of the size of the vectorial interpolators. Default is the scalar
* matrix kernel class of type ONE_OVER_R (NRHS=NLHS=1).
*/
template <int ORDER, class MatrixKernelClass = struct FInterpMatrixKernelR>
template <int ORDER, class MatrixKernelClass = struct FInterpMatrixKernelR, int NVALS = 1>
class FChebInterpolator : FNoCopyable
{
// compile time constants and types
enum {nnodes = TensorTraits<ORDER>::nnodes,
nRhs = MatrixKernelClass::NRHS,
nLhs = MatrixKernelClass::NLHS,
nPV = MatrixKernelClass::NPV};
nPV = MatrixKernelClass::NPV,
nVals = NVALS};
typedef FChebRoots< ORDER> BasisType;
typedef FChebTensor<ORDER> TensorType;
......@@ -635,9 +636,9 @@ public:
* Particle to moment: application of \f$S_\ell(y,\bar y_n)\f$
* (anterpolation, it is the transposed interpolation)
*/
template <int ORDER, class MatrixKernelClass>
template <int ORDER, class MatrixKernelClass, int NVALS>
template <class ContainerClass>
inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyP2M(const FPoint& center,
inline void FChebInterpolator<ORDER,MatrixKernelClass,NVALS>::applyP2M(const FPoint& center,
const FReal width,
FReal *const multipoleExpansion,
const ContainerClass *const inParticles) const
......@@ -648,20 +649,19 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyP2M(const FPoint& c
FPoint localPosition;
// Init W
FReal W1[nRhs];
FReal W2[nRhs][3][ ORDER-1];
FReal W4[nRhs][3][(ORDER-1)*(ORDER-1)];
FReal W8[nRhs][ (ORDER-1)*(ORDER-1)*(ORDER-1)];
for(int idxRhs = 0 ; idxRhs < nRhs ; ++idxRhs){
W1[idxRhs] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1); ++i) W2[idxRhs][0][i] = W2[idxRhs][1][i] = W2[idxRhs][2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i) W4[idxRhs][0][i] = W4[idxRhs][1][i] = W4[idxRhs][2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i) W8[idxRhs][i] = FReal(0.);
FReal W1[nVals*nRhs];
FReal W2[nVals*nRhs][3][ ORDER-1];
FReal W4[nVals*nRhs][3][(ORDER-1)*(ORDER-1)];
FReal W8[nVals*nRhs][ (ORDER-1)*(ORDER-1)*(ORDER-1)];
for(int idxMul = 0 ; idxMul < nVals*nRhs ; ++idxMul){
W1[idxMul] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1); ++i) W2[idxMul][0][i] = W2[idxMul][1][i] = W2[idxMul][2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i) W4[idxMul][0][i] = W4[idxMul][1][i] = W4[idxMul][2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i) W8[idxMul][i] = FReal(0.);
}
// loop over source particles
// const FReal*const physicalValues = inParticles->getPhysicalValues();
const FReal*const positionsX = inParticles->getPositions()[0];
const FReal*const positionsY = inParticles->getPositions()[1];
const FReal*const positionsZ = inParticles->getPositions()[2];
......@@ -682,86 +682,93 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyP2M(const FPoint& c
T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops
T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops
}
for(int idxRhs = 0 ; idxRhs < nRhs ; ++idxRhs){
const FReal*const physicalValues = inParticles->getPhysicalValues(idxRhs);
const FReal weight = physicalValues[idxPart];
W1[idxRhs] += weight; // 1 flop
for (unsigned int i=1; i<ORDER; ++i) {
const FReal wx = weight * T_of_x[0][i]; // 1 flop
const FReal wy = weight * T_of_x[1][i]; // 1 flop
const FReal wz = weight * T_of_x[2][i]; // 1 flop
W2[idxRhs][0][i-1] += wx; // 1 flop
W2[idxRhs][1][i-1] += wy; // 1 flop
W2[idxRhs][2][i-1] += wz; // 1 flop
for (unsigned int j=1; j<ORDER; ++j) {
const FReal wxy = wx * T_of_x[1][j]; // 1 flop
const FReal wxz = wx * T_of_x[2][j]; // 1 flop
const FReal wyz = wy * T_of_x[2][j]; // 1 flop
W4[idxRhs][0][(j-1)*(ORDER-1) + (i-1)] += wxy; // 1 flop
W4[idxRhs][1][(j-1)*(ORDER-1) + (i-1)] += wxz; // 1 flop
W4[idxRhs][2][(j-1)*(ORDER-1) + (i-1)] += wyz; // 1 flop
for (unsigned int k=1; k<ORDER; ++k) {
const FReal wxyz = wxy * T_of_x[2][k]; // 1 flop
W8[idxRhs][(k-1)*(ORDER-1)*(ORDER-1) + (j-1)*(ORDER-1) + (i-1)] += wxyz; // 1 flop
} // flops: (ORDER-1) * 2
} // flops: (ORDER-1) * (6 + (ORDER-1) * 2)
} // flops: (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2))
} // flops: ... * NRHS
for(int idxVals = 0 ; idxVals < nVals ; ++idxVals){
for(int idxRhs = 0 ; idxRhs < nRhs ; ++idxRhs){
const int idxMul = idxVals*nRhs+idxRhs;
const FReal*const physicalValues = inParticles->getPhysicalValues(idxVals,idxRhs);
const FReal weight = physicalValues[idxPart];
W1[idxMul] += weight; // 1 flop
for (unsigned int i=1; i<ORDER; ++i) {
const FReal wx = weight * T_of_x[0][i]; // 1 flop
const FReal wy = weight * T_of_x[1][i]; // 1 flop
const FReal wz = weight * T_of_x[2][i]; // 1 flop
W2[idxMul][0][i-1] += wx; // 1 flop
W2[idxMul][1][i-1] += wy; // 1 flop
W2[idxMul][2][i-1] += wz; // 1 flop
for (unsigned int j=1; j<ORDER; ++j) {
const FReal wxy = wx * T_of_x[1][j]; // 1 flop
const FReal wxz = wx * T_of_x[2][j]; // 1 flop
const FReal wyz = wy * T_of_x[2][j]; // 1 flop
W4[idxMul][0][(j-1)*(ORDER-1) + (i-1)] += wxy; // 1 flop
W4[idxMul][1][(j-1)*(ORDER-1) + (i-1)] += wxz; // 1 flop
W4[idxMul][2][(j-1)*(ORDER-1) + (i-1)] += wyz; // 1 flop
for (unsigned int k=1; k<ORDER; ++k) {
const FReal wxyz = wxy * T_of_x[2][k]; // 1 flop
W8[idxMul][(k-1)*(ORDER-1)*(ORDER-1) + (j-1)*(ORDER-1) + (i-1)] += wxyz; // 1 flop
} // flops: (ORDER-1) * 2
} // flops: (ORDER-1) * (6 + (ORDER-1) * 2)
} // flops: (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2))
} // flops: ... * NRHS
} // flops: ... * NVALS
} // flops: N * (18 + (ORDER-2) * 6 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2)))
////////////////////////////////////////////////////////////////////
for(int idxRhs = 0 ; idxRhs < nRhs ; ++idxRhs){
for(int idxMul = 0 ; idxMul < nVals*nRhs ; ++idxMul){
// loop over interpolation points
FReal F2[3][ORDER];
FReal F4[3][ORDER*ORDER];
FReal F8[ ORDER*ORDER*ORDER];
{
// compute W2: 3 * ORDER*(2*(ORDER-1)-1) flops
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxRhs][0], F2[0]);
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxRhs][1], F2[1]);
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxRhs][2], F2[2]);
// compute W4: 3 * [ORDER*(ORDER-1)*(2*(ORDER-1)-1) + ORDER*ORDER*(2*(ORDER-1)-1)]
FReal C[ORDER * (ORDER-1)];
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxRhs][0], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[0], ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxRhs][1], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[1], ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxRhs][2], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[2], ORDER);
// compute W8: 3 * (2*(ORDER-1)-1) * [ORDER*(ORDER-1)*(ORDER-1) + ORDER*ORDER*(ORDER-1) + ORDER*ORDER*ORDER]
FReal D[ORDER * (ORDER-1) * (ORDER-1)];
FBlas::gemm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, W8[idxRhs], ORDER-1, D, ORDER);
FReal E[(ORDER-1) * (ORDER-1) * ORDER];
for (unsigned int s=0; s<perm0.size; ++s) E[perm0.mni[s]] = D[perm0.imn[s]];
FReal F[ORDER * (ORDER-1) * ORDER];
FBlas::gemm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, E, ORDER-1, F, ORDER);
FReal G[(ORDER-1) * ORDER * ORDER];
for (unsigned int s=0; s<perm1.size; ++s) G[perm1.nij[s]] = F[perm1.jni[s]];
FReal H[ORDER * ORDER * ORDER];
FBlas::gemm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, G, ORDER-1, H, ORDER);
for (unsigned int s=0; s<perm2.size; ++s) F8[perm2.ijk[s]] = H[perm2.kij[s]];
}
// loop over interpolation points
FReal F2[3][ORDER];
FReal F4[3][ORDER*ORDER];
FReal F8[ ORDER*ORDER*ORDER];
{
// compute W2: 3 * ORDER*(2*(ORDER-1)-1) flops
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxMul][0], F2[0]);
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxMul][1], F2[1]);
FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxMul][2], F2[2]);
// compute W4: 3 * [ORDER*(ORDER-1)*(2*(ORDER-1)-1) + ORDER*ORDER*(2*(ORDER-1)-1)]
FReal C[ORDER * (ORDER-1)];
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxMul][0], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[0], ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxMul][1], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[1], ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxMul][2], ORDER-1, C, ORDER);
FBlas::gemmt(ORDER, ORDER-1, ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, C, ORDER, F4[2], ORDER);
// compute W8: 3 * (2*(ORDER-1)-1) * [ORDER*(ORDER-1)*(ORDER-1) + ORDER*ORDER*(ORDER-1) + ORDER*ORDER*ORDER]
FReal D[ORDER * (ORDER-1) * (ORDER-1)];
FBlas::gemm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, W8[idxMul], ORDER-1, D, ORDER);
FReal E[(ORDER-1) * (ORDER-1) * ORDER];
for (unsigned int s=0; s<perm0.size; ++s) E[perm0.mni[s]] = D[perm0.imn[s]];
FReal F[ORDER * (ORDER-1) * ORDER];
FBlas::gemm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, E, ORDER-1, F, ORDER);
FReal G[(ORDER-1) * ORDER * ORDER];
for (unsigned int s=0; s<perm1.size; ++s) G[perm1.nij[s]] = F[perm1.jni[s]];
FReal H[ORDER * ORDER * ORDER];
FBlas::gemm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, G, ORDER-1, H, ORDER);
for (unsigned int s=0; s<perm2.size; ++s) F8[perm2.ijk[s]] = H[perm2.kij[s]];
}
// assemble multipole expansions
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int k=0; k<ORDER; ++k) {
const unsigned int idx = k*ORDER*ORDER + j*ORDER + i;
multipoleExpansion[2*idxRhs*nnodes + idx] += (W1[idxRhs] +
FReal(2.) * (F2[0][i] + F2[1][j] + F2[2][k]) +
FReal(4.) * (F4[0][j*ORDER+i] + F4[1][k*ORDER+i] + F4[2][k*ORDER+j]) +
FReal(8.) * F8[idx]) / nnodes; // 11 * ORDER*ORDER*ORDER flops
// assemble multipole expansions
for (unsigned int i=0; i<ORDER; ++i) {
for (unsigned int j=0; j<ORDER; ++j) {
for (unsigned int k=0; k<ORDER; ++k) {
const unsigned int idx = k*ORDER*ORDER + j*ORDER + i;
multipoleExpansion[2*idxMul*nnodes + idx] += (W1[idxMul] +
FReal(2.) * (F2[0][i] + F2[1][j] + F2[2][k]) +
FReal(4.) * (F4[0][j*ORDER+i] + F4[1][k*ORDER+i] + F4[2][k*ORDER+j]) +
FReal(8.) * F8[idx]) / nnodes; // 11 * ORDER*ORDER*ORDER flops
}
}
}
}
} // NRHS
} // NVALS*NRHS
}
......@@ -836,55 +843,62 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyP2M(const FPoint& c
/**
* Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
*/
template <int ORDER, class MatrixKernelClass>
template <int ORDER, class MatrixKernelClass, int NVALS>
template <class ContainerClass>
inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2P(const FPoint& center,
inline void FChebInterpolator<ORDER,MatrixKernelClass,NVALS>::applyL2P(const FPoint& center,
const FReal width,
const FReal *const localExpansion,
ContainerClass *const inParticles) const
{
FReal f1[nLhs];
FReal W2[nLhs][3][ ORDER-1];
FReal W4[nLhs][3][(ORDER-1)*(ORDER-1)];
FReal W8[nLhs][ (ORDER-1)*(ORDER-1)*(ORDER-1)];
{
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs){
// sum over interpolation points
f1[idxLhs] = FReal(0.);
for(unsigned int i=0; i<ORDER-1; ++i) W2[idxLhs][0][i] = W2[idxLhs][1][i] = W2[idxLhs][2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i) W4[idxLhs][0][i] = W4[idxLhs][1][i] = W4[idxLhs][2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i) W8[idxLhs][i] = FReal(0.);
FReal f1[nVals*nLhs];
FReal W2[nVals*nLhs][3][ ORDER-1];
FReal W4[nVals*nLhs][3][(ORDER-1)*(ORDER-1)];
FReal W8[nVals*nLhs][ (ORDER-1)*(ORDER-1)*(ORDER-1)];
{
for (unsigned int idx=0; idx<nnodes; ++idx) {
const unsigned int i = node_ids[idx][0];
const unsigned int j = node_ids[idx][1];
const unsigned int k = node_ids[idx][2];
for(int idxVals = 0 ; idxVals < nVals ; ++idxVals){
f1[idxLhs] += localExpansion[2*idxLhs*nnodes + idx]; // 1 flop
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs){
for (unsigned int l=0; l<ORDER-1; ++l) {
const FReal wx = T[l*ORDER+i] * localExpansion[2*idxLhs*nnodes + idx]; // 1 flops
const FReal wy = T[l*ORDER+j] * localExpansion[2*idxLhs*nnodes + idx]; // 1 flops
const FReal wz = T[l*ORDER+k] * localExpansion[2*idxLhs*nnodes + idx]; // 1 flops
W2[idxLhs][0][l] += wx; // 1 flops
W2[idxLhs][1][l] += wy; // 1 flops
W2[idxLhs][2][l] += wz; // 1 flops
for (unsigned int m=0; m<ORDER-1; ++m) {
const FReal wxy = wx * T[m*ORDER + j]; // 1 flops
const FReal wxz = wx * T[m*ORDER + k]; // 1 flops
const FReal wyz = wy * T[m*ORDER + k]; // 1 flops
W4[idxLhs][0][m*(ORDER-1)+l] += wxy; // 1 flops
W4[idxLhs][1][m*(ORDER-1)+l] += wxz; // 1 flops
W4[idxLhs][2][m*(ORDER-1)+l] += wyz; // 1 flops
for (unsigned int n=0; n<ORDER-1; ++n) {
const FReal wxyz = wxy * T[n*ORDER + k]; // 1 flops
W8[idxLhs][n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l] += wxyz; // 1 flops
} // (ORDER-1) * 2 flops
} // (ORDER-1) * (6 + (ORDER-1)*2) flops
} // (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2)) flops
} // ORDER*ORDER*ORDER * (1 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2))) flops
} // NLHS
const int idxLoc = idxVals*nLhs+idxLhs;
// sum over interpolation points
f1[idxLoc] = FReal(0.);
for(unsigned int i=0; i<ORDER-1; ++i) W2[idxLoc][0][i] = W2[idxLoc][1][i] = W2[idxLoc][2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i) W4[idxLoc][0][i] = W4[idxLoc][1][i] = W4[idxLoc][2][i] = FReal(0.);
for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i) W8[idxLoc][i] = FReal(0.);
for (unsigned int idx=0; idx<nnodes; ++idx) {
const unsigned int i = node_ids[idx][0];
const unsigned int j = node_ids[idx][1];
const unsigned int k = node_ids[idx][2];
f1[idxLoc] += localExpansion[2*idxLoc*nnodes + idx]; // 1 flop
for (unsigned int l=0; l<ORDER-1; ++l) {
const FReal wx = T[l*ORDER+i] * localExpansion[2*idxLoc*nnodes + idx]; // 1 flops
const FReal wy = T[l*ORDER+j] * localExpansion[2*idxLoc*nnodes + idx]; // 1 flops
const FReal wz = T[l*ORDER+k] * localExpansion[2*idxLoc*nnodes + idx]; // 1 flops
W2[idxLoc][0][l] += wx; // 1 flops
W2[idxLoc][1][l] += wy; // 1 flops
W2[idxLoc][2][l] += wz; // 1 flops
for (unsigned int m=0; m<ORDER-1; ++m) {
const FReal wxy = wx * T[m*ORDER + j]; // 1 flops
const FReal wxz = wx * T[m*ORDER + k]; // 1 flops
const FReal wyz = wy * T[m*ORDER + k]; // 1 flops
W4[idxLoc][0][m*(ORDER-1)+l] += wxy; // 1 flops
W4[idxLoc][1][m*(ORDER-1)+l] += wxz; // 1 flops
W4[idxLoc][2][m*(ORDER-1)+l] += wyz; // 1 flops
for (unsigned int n=0; n<ORDER-1; ++n) {
const FReal wxyz = wxy * T[n*ORDER + k]; // 1 flops
W8[idxLoc][n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l] += wxyz; // 1 flops
} // (ORDER-1) * 2 flops
} // (ORDER-1) * (6 + (ORDER-1)*2) flops
} // (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2)) flops
} // ORDER*ORDER*ORDER * (1 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2))) flops
} // NLHS
} // NVALS
}
......@@ -892,14 +906,10 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2P(const FPoint& c
const map_glob_loc map(center, width);
FPoint localPosition;
//const FReal*const physicalValues = inParticles->getPhysicalValues();
// get particles position
const FReal*const positionsX = inParticles->getPositions()[0];
const FReal*const positionsY = inParticles->getPositions()[1];
const FReal*const positionsZ = inParticles->getPositions()[2];
//FReal*const forcesX = inParticles->getForcesX();
//FReal*const forcesY = inParticles->getForcesY();
//FReal*const forcesZ = inParticles->getForcesZ();
// FReal*const potentials = inParticles->getPotentials();
for(int idxPart = 0 ; idxPart < inParticles->getNbParticles() ; ++ idxPart){
......@@ -921,47 +931,58 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2P(const FPoint& c
}
}
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs){
// distribution over potential components:
// We sum the multidim contribution of PhysValue
// This was originally done at M2L step but moved here
// because their storage is required by the force computation.
// In fact : f_{ik}(x)=w_j(x) \nabla_{x_i} K_{ij}(x,y)w_j(y))
const unsigned int idxPot = idxLhs / nPV;
// loop over multipole expansions
for(int idxVals = 0 ; idxVals < nVals ; ++idxVals){
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs){
FReal*const potentials = inParticles->getPotentials(idxPot);
const int idxLoc = idxVals*nLhs+idxLhs;
// interpolate and increment target value
FReal targetValue = potentials[idxPart];
{
FReal f2, f4, f8;
// distribution over potential components:
// We sum the multidim contribution of PhysValue
// This was originally done at M2L step but moved here
// because their storage is required by the force computation.
// In fact : f_{ik}(x)=w_j(x) \nabla_{x_i} K_{ij}(x,y)w_j(y))
const unsigned int idxPot = idxLhs / nPV;
FReal*const potentials = inParticles->getPotentials(idxVals,idxPot);
// interpolate and increment target value
FReal targetValue = potentials[idxPart];
{
f2 = f4 = f8 = FReal(0.);
for (unsigned int l=1; l<ORDER; ++l) {
f2 +=
T_of_x[0][l] * W2[idxLhs][0][l-1] +
T_of_x[1][l] * W2[idxLhs][1][l-1] +
T_of_x[2][l] * W2[idxLhs][2][l-1]; // 6 flops
for (unsigned int m=1; m<ORDER; ++m) {
f4 +=
T_of_x[0][l] * T_of_x[1][m] * W4[idxLhs][0][(m-1)*(ORDER-1)+(l-1)] +
T_of_x[0][l] * T_of_x[2][m] * W4[idxLhs][1][(m-1)*(ORDER-1)+(l-1)] +
T_of_x[1][l] * T_of_x[2][m] * W4[idxLhs][2][(m-1)*(ORDER-1)+(l-1)]; // 9 flops
for (unsigned int n=1; n<ORDER; ++n) {
f8 +=
T_of_x[0][l] * T_of_x[1][m] * T_of_x[2][n] *
W8[idxLhs][(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)];
} // ORDER * 4 flops
} // ORDER * (9 + ORDER * 4) flops
} // ORDER * (ORDER * (9 + ORDER * 4)) flops
}
targetValue = (f1[idxLhs] + FReal(2.)*f2 + FReal(4.)*f4 + FReal(8.)*f8) / nnodes; // 7 flops
} // 7 + ORDER * (ORDER * (9 + ORDER * 4)) flops
// set potential
potentials[idxPart] += (targetValue);
} // NLHS
FReal f2, f4, f8;
{
f2 = f4 = f8 = FReal(0.);
for (unsigned int l=1; l<ORDER; ++l) {
f2 +=
T_of_x[0][l] * W2[idxLoc][0][l-1] +
T_of_x[1][l] * W2[idxLoc][1][l-1] +
T_of_x[2][l] * W2[idxLoc][2][l-1]; // 6 flops
for (unsigned int m=1; m<ORDER; ++m) {
f4 +=
T_of_x[0][l] * T_of_x[1][m] * W4[idxLoc][0][(m-1)*(ORDER-1)+(l-1)] +
T_of_x[0][l] * T_of_x[2][m] * W4[idxLoc][1][(m-1)*(ORDER-1)+(l-1)] +
T_of_x[1][l] * T_of_x[2][m] * W4[idxLoc][2][(m-1)*(ORDER-1)+(l-1)]; // 9 flops
for (unsigned int n=1; n<ORDER; ++n) {
f8 +=
T_of_x[0][l] * T_of_x[1][m] * T_of_x[2][n] *
W8[idxLoc][(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)];
} // ORDER * 4 flops
} // ORDER * (9 + ORDER * 4) flops
} // ORDER * (ORDER * (9 + ORDER * 4)) flops
}
targetValue = (f1[idxLoc] + FReal(2.)*f2 + FReal(4.)*f4 + FReal(8.)*f8) / nnodes; // 7 flops
} // 7 + ORDER * (ORDER * (9 + ORDER * 4)) flops
// set potential
potentials[idxPart] += (targetValue);
} // NLHS
}// NVALS
} // N * (7 + ORDER * (ORDER * (9 + ORDER * 4))) flops
}
......@@ -1057,9 +1078,9 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2P(const FPoint& c
/**
* Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
*/
template <int ORDER, class MatrixKernelClass>
template <int ORDER, class MatrixKernelClass, int NVALS>
template <class ContainerClass>
inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2PGradient(const FPoint& center,
inline void FChebInterpolator<ORDER,MatrixKernelClass,NVALS>::applyL2PGradient(const FPoint& center,
const FReal width,
const FReal *const localExpansion,
ContainerClass *const inParticles) const
......@@ -1078,14 +1099,9 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2PGradient(const F
FReal U_of_x[ORDER][3];
FReal P[3];
// const FReal*const physicalValues = inParticles->getPhysicalValues();
const FReal*const positionsX = inParticles->getPositions()[0];
const FReal*const positionsY = inParticles->getPositions()[1];
const FReal*const positionsZ = inParticles->getPositions()[2];
// FReal*const forcesX = inParticles->getForcesX();
// FReal*const forcesY = inParticles->getForcesY();
// FReal*const forcesZ = inParticles->getForcesZ();
//FReal*const potentials = inParticles->getPotentials();
for(int idxPart = 0 ; idxPart < inParticles->getNbParticles() ; ++ idxPart){
......@@ -1120,10 +1136,10 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2PGradient(const F
}
// apply P and increment forces
FReal forces[nLhs][3];
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs)
for (unsigned int i=0; i<3; ++i)
forces[idxLhs][i] = FReal(0.);
FReal forces[nVals*nLhs][3];
for(int idxLoc = 0 ; idxLoc < nVals*nLhs ; ++idxLoc)
for (unsigned int i=0; i<3; ++i)
forces[idxLoc][i] = FReal(0.);
for (unsigned int n=0; n<nnodes; ++n) {
......@@ -1142,8 +1158,8 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2PGradient(const F
P[0] *= FReal(2.);
P[1] *= FReal(2.); P[1] += FReal(1.);
P[2] *= FReal(2.); P[2] += FReal(1.);
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs)
forces[idxLhs][0] += P[0] * P[1] * P[2] * localExpansion[2*idxLhs*nnodes + n];
for(int idxLoc = 0 ; idxLoc < nVals*nLhs ; ++idxLoc)
forces[idxLoc][0] += P[0] * P[1] * P[2] * localExpansion[2*idxLoc*nnodes + n];
// f1 component //////////////////////////////////////
P[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
......@@ -1157,8 +1173,8 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2PGradient(const F
P[0] *= FReal(2.); P[0] += FReal(1.);
P[1] *= FReal(2.);
P[2] *= FReal(2.); P[2] += FReal(1.);
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs)
forces[idxLhs][1] += P[0] * P[1] * P[2] * localExpansion[2*idxLhs*nnodes + n];
for(int idxLoc = 0 ; idxLoc < nVals*nLhs ; ++idxLoc)
forces[idxLoc][1] += P[0] * P[1] * P[2] * localExpansion[2*idxLoc*nnodes + n];
// f2 component //////////////////////////////////////
P[0] = T_of_x[1][0] * T_of_roots[1][j[0]];
......@@ -1172,31 +1188,37 @@ inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2PGradient(const F
P[0] *= FReal(2.); P[0] += FReal(1.);
P[1] *= FReal(2.); P[1] += FReal(1.);
P[2] *= FReal(2.);
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs)
forces[idxLhs][2] += P[0] * P[1] * P[2] * localExpansion[2*idxLhs*nnodes + n];
for(int idxLoc = 0 ; idxLoc < nVals*nLhs ; ++idxLoc)
forces[idxLoc][2] += P[0] * P[1] * P[2] * localExpansion[2*idxLoc*nnodes + n];
}
for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs){
const unsigned int idxPot = idxLhs / nPV;
const unsigned int idxPV = idxLhs % nPV;
// scale forces
forces[idxLhs][0] *= jacobian[0] / nnodes;
forces[idxLhs][1] *= jacobian[1] / nnodes;
forces[idxLhs][2] *= jacobian[2] / nnodes;
// get pointers to PhysValues and force components
const FReal*const physicalValues = inParticles->getPhysicalValues(idxPV);
FReal*const forcesX = inParticles->getForcesX(idxPot);
FReal*const forcesY = inParticles->getForcesY(idxPot);
FReal*const forcesZ = inParticles->getForcesZ(idxPot);