Commit 67c21775 authored by BLANCHARD Pierre's avatar BLANCHARD Pierre

Implementation of Symmetric Uniform Kernel, optimized evaluation of derivative...

Implementation of Symmetric Uniform Kernel, optimized evaluation of derivative of Lagrange polynomials.
parent bbeed7db
......@@ -25,7 +25,7 @@
#include "../../Components/FAbstractKernels.hpp"
#include "./FChebInterpolator.hpp"
#include "./FChebSymmetries.hpp"
#include "../Interpolation/FInterpSymmetries.hpp"
class FTreeCoordinate;
......@@ -322,7 +322,7 @@ struct FChebFlopsSymKernel<CellClass, ContainerClass, MatrixKernelClass, ORDER>
}
// set permutation vector and indices
const FChebSymmetries<ORDER> Symmetries;
const FInterpSymmetries<ORDER> Symmetries;
for (int i=-3; i<=3; ++i)
for (int j=-3; j<=3; ++j)
for (int k=-3; k<=3; ++k)
......
......@@ -21,7 +21,7 @@
#include "../../Utils/FBlas.hpp"
#include "./FChebTensor.hpp"
#include "./FChebSymmetries.hpp"
#include "../Interpolation/FInterpSymmetries.hpp"
#include "./FChebM2LHandler.hpp"
/**
......@@ -539,7 +539,7 @@ public:
}
// set permutation vector and indices
const FChebSymmetries<ORDER> Symmetries;
const FInterpSymmetries<ORDER> Symmetries;
for (int i=-3; i<=3; ++i)
for (int j=-3; j<=3; ++j)
for (int k=-3; k<=3; ++k) {
......@@ -620,7 +620,7 @@ public:
// set permutation vector and indices
const FChebSymmetries<ORDER> Symmetries;
const FInterpSymmetries<ORDER> Symmetries;
for (int i=-3; i<=3; ++i)
for (int j=-3; j<=3; ++j)
for (int k=-3; k<=3; ++k) {
......
......@@ -13,8 +13,8 @@
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FCHEBSYMMETRIES_HPP
#define FCHEBSYMMETRIES_HPP
#ifndef FINTERPSYMMETRIES_HPP
#define FINTERPSYMMETRIES_HPP
#include <climits>
......@@ -25,12 +25,12 @@
*/
/**
* @class FChebSymmetries
* @class FInterpSymmetries
*
* The class @p FChebSymmetries exploits all symmetries
* The class @p FInterpSymmetries exploits all symmetries
*/
template <int ORDER>
class FChebSymmetries
class FInterpSymmetries
{
enum {nnodes = ORDER*ORDER*ORDER};
......@@ -49,11 +49,11 @@ class FChebSymmetries
return (sk | sj | si);
}
public:
/** Constructor */
FChebSymmetries()
FInterpSymmetries()
{
// permutations for 8 quadrants
unsigned int quads[8][nnodes];
......
......@@ -45,8 +45,6 @@ class FUnifInterpolator : FNoCopyable
typedef FUnifRoots< ORDER> BasisType;
typedef FUnifTensor<ORDER> TensorType;
// FReal T_of_roots[ORDER][ORDER];
// FReal T[ORDER * (ORDER-1)];
unsigned int node_ids[nnodes][3];
FReal* ChildParentInterpolator[8];
......@@ -56,12 +54,13 @@ class FUnifInterpolator : FNoCopyable
////////////////////////////////////////////////////////////////////
// PB: use improved version of M2M/L2L
/**
* Initialize the child - parent - interpolator, it is basically the matrix
* S which is precomputed and reused for all M2M and L2L operations, ie for
* all non leaf inter/anterpolations.
*/
/*
void initM2MandL2L()
{
FPoint ParentRoots[nnodes], ChildRoots[nnodes];
......@@ -86,6 +85,7 @@ class FUnifInterpolator : FNoCopyable
assembleInterpolator(nnodes, ChildRoots, ChildParentInterpolator[child]);
}
}
*/
/**
* Initialize the child - parent - interpolator, it is basically the matrix
......@@ -141,22 +141,10 @@ public:
*/
explicit FUnifInterpolator()
{
// // initialize chebyshev polynomials of root nodes: T_o(x_j)
// for (unsigned int o=1; o<ORDER; ++o)
// for (unsigned int j=0; j<ORDER; ++j)
// T_of_roots[o][j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
//
// // initialize chebyshev polynomials of root nodes: T_o(x_j)
// for (unsigned int o=1; o<ORDER; ++o)
// for (unsigned int j=0; j<ORDER; ++j)
// T[(o-1)*ORDER + j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
// initialize root node ids
TensorType::setNodeIds(node_ids);
// initialize interpolation operator for non M2M and L2L (non leaf
// operations)
// initialize interpolation operator for M2M and L2L (non leaf operations)
//this -> initM2MandL2L(); // non tensor-product interpolation
this -> initTensorM2MandL2L(); // tensor-product interpolation
}
......@@ -278,7 +266,7 @@ public:
const FReal *const localExpansion,
ContainerClass *const localParticles) const;
// PB: ORDER^6 version of applyM2M/L2L
/*
void applyM2M(const unsigned int ChildIndex,
const FReal *const ChildExpansion,
......@@ -299,9 +287,7 @@ public:
}
*/
// PB: improvement of applyM2M/L2L can be preserved (TOFACTO)
// since relative position of child and parent interpolation is remains unchanged
// PB: improved version of applyM2M/L2L also applies to Lagrange interpolation
void applyM2M(const unsigned int ChildIndex,
const FReal *const ChildExpansion,
FReal *const ParentExpansion) const
......@@ -506,10 +492,10 @@ inline void FUnifInterpolator<ORDER>::applyL2PGradient(const FPoint& center,
// evaluate Lagrange polynomials of source particle
for (unsigned int o=0; o<ORDER; ++o) {
L_of_x[o][0] = BasisType::L(o, localPosition.getX()); // 3 * ORDER*(ORDER-1) flops PB: TODO confirm
L_of_x[o][0] = BasisType::L(o, localPosition.getX()); // 3 * ORDER*(ORDER-1) flops
L_of_x[o][1] = BasisType::L(o, localPosition.getY()); // 3 * ORDER*(ORDER-1) flops
L_of_x[o][2] = BasisType::L(o, localPosition.getZ()); // 3 * ORDER*(ORDER-1) flops
dL_of_x[o][0] = BasisType::dL(o, localPosition.getX()); // TODO verify 3 * ORDER*(ORDER-1) flops PB: TODO confirm
dL_of_x[o][0] = BasisType::dL(o, localPosition.getX()); // TODO verify 3 * ORDER*(ORDER-1) flops
dL_of_x[o][1] = BasisType::dL(o, localPosition.getY()); // TODO verify 3 * ORDER*(ORDER-1) flops
dL_of_x[o][2] = BasisType::dL(o, localPosition.getZ()); // TODO verify 3 * ORDER*(ORDER-1) flops
}
......
......@@ -96,7 +96,7 @@ public:
{
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
// 1) apply Sy
FBlas::scal(AbstractBaseClass::nnodes*2, FReal(0.), ParentCell->getMultipole(idxRhs));
FBlas::scal(AbstractBaseClass::nnodes, FReal(0.), ParentCell->getMultipole(idxRhs));
for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex){
if (ChildCells[ChildIndex]){
AbstractBaseClass::Interpolator->applyM2M(ChildIndex, ChildCells[ChildIndex]->getMultipole(idxRhs),
......@@ -170,7 +170,6 @@ public:
const int /*TreeLevel*/)
{
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
// 1) Apply Inverse Discete Fourier Transform
M2LHandler->unapplyZeroPaddingAndDFT(ParentCell->getTransformedLocal(idxRhs),
const_cast<CellClass*>(ParentCell)->getLocal(idxRhs));
......
......@@ -64,7 +64,7 @@ class FUnifM2LHandler : FNoCopyable
unsigned int opt_rc;
typedef FUnifTensor<ORDER> TensorType;
unsigned int node_diff[nnodes*nnodes]; // PB: used in applyC to get id=i-j
unsigned int node_diff[nnodes*nnodes];
// FDft Dft; // Direct Discrete Fourier Transformator
FFft Dft; // Fast Discrete Fourier Transformator
......@@ -222,18 +222,12 @@ public:
FReal Py[rc];
FBlas::setzero(rc,Py);
FComplexe tmpFY[rc]; // not mandatory?
// Apply Zero Padding
for (unsigned int i=0; i<nnodes; ++i)
Py[node_diff[i*nnodes]]=y[i];
// Apply forward Discrete Fourier Transform
Dft.applyDFT(Py,tmpFY);
// not mandatory?
for (unsigned int j=0; j<rc; ++j) // could be opt_rc
FY[j]+=tmpFY[j];
Dft.applyDFT(Py,FY);
}
......
......@@ -63,14 +63,14 @@ struct FUnifRoots : FNoCopyable
x = (x < FReal(-1.) ? FReal(-1.) : x);
}
FReal tmpL=FReal(1.);
FReal L=FReal(1.);
for(unsigned int m=0;m<order;++m){
if(m!=n)
tmpL *= (x-FUnifRoots<order>::roots[m])/(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[m]);
L *= (x-FUnifRoots<order>::roots[m])/(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[m]);
}
return FReal(tmpL);
return FReal(L);
}
......@@ -90,26 +90,22 @@ struct FUnifRoots : FNoCopyable
x = (x < FReal(-1.) ? FReal(-1.) : x);
}
FReal tmpdL;
FReal dL=FReal(0.);
// optimized variant
FReal NdL=FReal(0.);// init numerator
FReal DdL=FReal(1.);// init denominator
FReal tmpNdL;
for(unsigned int p=0;p<order;++p){
if(p!=n){
tmpdL=1./(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[p]);
for(unsigned int m=0;m<order;++m){
if(m!=n && m!=p)
tmpdL *= (x-FUnifRoots<order>::roots[m])/(FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[m]);
}// m
dL+=tmpdL;
tmpNdL=FReal(1.);
for(unsigned int m=0;m<order;++m)
if(m!=n && m!=p)
tmpNdL*=x-FUnifRoots<order>::roots[m];
NdL+=tmpNdL;
DdL*=FUnifRoots<order>::roots[n]-FUnifRoots<order>::roots[p];
}//endif
}// p
return FReal(dL);
return FReal(NdL/DdL);
}
};
......
// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FUNIFSYMKERNEL_HPP
#define FUNIFSYMKERNEL_HPP
#include "../../Utils/FGlobal.hpp"
#include "../../Utils/FTrace.hpp"
#include "../../Utils/FSmartPointer.hpp"
// Originally in M2LHandler but transferred to the kernel for the symmetric version
#include "../../Utils/FDft.hpp" // PB: for FFT
#include "../../Utils/FComplexe.hpp"
#include "./FUnifTensor.hpp" // PB: for node_diff
//
#include "./FAbstractUnifKernel.hpp"
#include "./FUnifInterpolator.hpp"
#include "./FUnifSymM2LHandler.hpp"
class FTreeCoordinate;
// for verbosity only!!!
//#define COUNT_BLOCKED_INTERACTIONS
// if timings should be logged
#define LOG_TIMINGS
/**
* @author Pierre Blanchard (pierre.blanchard@inria.fr)
* @class FUnifSymKernel
* @brief
* Please read the license
*
* This kernels implement the Lagrange interpolation based FMM operators
* exploiting the symmetries in the far-field. It implements all interfaces
* (P2P, P2M, M2M, M2L, L2L, L2P) which are required by the FFmmAlgorithm and
* FFmmAlgorithmThread.
*
* @tparam CellClass Type of cell
* @tparam ContainerClass Type of container to store particles
* @tparam MatrixKernelClass Type of matrix kernel function
* @tparam ORDER interpolation order
*/
template < class CellClass, class ContainerClass, class MatrixKernelClass, int ORDER, int NVALS = 1>
class FUnifSymKernel
: public FAbstractUnifKernel<CellClass, ContainerClass, MatrixKernelClass, ORDER, NVALS>
{
typedef FAbstractUnifKernel<CellClass, ContainerClass, MatrixKernelClass, ORDER, NVALS> AbstractBaseClass;
typedef FUnifSymM2LHandler<ORDER, MatrixKernelClass::Type> SymM2LHandlerClass;
enum {nnodes = AbstractBaseClass::nnodes};
/// Needed for handling all symmetries
const FSmartPointer<SymM2LHandlerClass,FSmartPointerMemory> SymM2LHandler;
// permuted local and multipole expansions
FReal** Loc;
FReal** Mul;
unsigned int* countExp;
// transformed expansions
FComplexe** TLoc;
FComplexe** TMul;
static const unsigned int rc = (2*ORDER-1)*(2*ORDER-1)*(2*ORDER-1);
static const unsigned int opt_rc = rc/2+1;
typedef FUnifTensor<ORDER> TensorType;
unsigned int node_diff[nnodes*nnodes];
// FDft Dft; // Direct Discrete Fourier Transformator
FFft Dft; // Fast Discrete Fourier Transformator
/**
* Allocate memory for storing locally permuted mulipole and local expansions
*/
void allocateMemoryForPermutedExpansions()
{
assert(Loc==NULL && Mul==NULL && countExp==NULL);
Loc = new FReal* [343];
Mul = new FReal* [343];
TLoc = new FComplexe* [343];
TMul = new FComplexe* [343];
countExp = new unsigned int [343];
// set all 343 to NULL
for (unsigned int idx=0; idx<343; ++idx) {
Mul[idx] = Loc[idx] = NULL;
TMul[idx] = TLoc[idx] = NULL;
}
// init only 16 of 343 possible translations due to symmetries
for (int i=2; i<=3; ++i)
for (int j=0; j<=i; ++j)
for (int k=0; k<=j; ++k) {
const unsigned int idx = (i+3)*7*7 + (j+3)*7 + (k+3);
assert(Mul[idx]==NULL || Loc[idx]==NULL);
Mul[idx] = new FReal [24 * nnodes];
Loc[idx] = new FReal [24 * nnodes];
TMul[idx] = new FComplexe [24 * rc];
TLoc[idx] = new FComplexe [24 * rc];
}
}
#ifdef LOG_TIMINGS
FTic time;
FReal t_m2l_1, t_m2l_2, t_m2l_3;
#endif
public:
/**
* The constructor initializes all constant attributes and it reads the
* precomputed and compressed M2L operators from a binary file (an
* runtime_error is thrown if the required file is not valid).
*/
FUnifSymKernel(const int inTreeHeight,
const FPoint& inBoxCenter,
const FReal inBoxWidth)
: AbstractBaseClass(inTreeHeight, inBoxCenter, inBoxWidth),
SymM2LHandler(new SymM2LHandlerClass(AbstractBaseClass::MatrixKernel.getPtr(), inBoxWidth, inTreeHeight)),
Loc(NULL), Mul(NULL), countExp(NULL),
TLoc(NULL), TMul(NULL),
Dft(rc) // initialize Discrete Fourier Transformator,
{
this->allocateMemoryForPermutedExpansions();
// initialize root node ids
TensorType::setNodeIdsDiff(node_diff);
#ifdef LOG_TIMINGS
t_m2l_1 = FReal(0.);
t_m2l_2 = FReal(0.);
t_m2l_3 = FReal(0.);
#endif
}
/** Copy constructor */
FUnifSymKernel(const FUnifSymKernel& other)
: AbstractBaseClass(other),
SymM2LHandler(other.SymM2LHandler),
Loc(NULL), Mul(NULL), countExp(NULL),
TLoc(NULL), TMul(NULL)
{
this->allocateMemoryForPermutedExpansions();
}
/** Destructor */
~FUnifSymKernel()
{
for (unsigned int t=0; t<343; ++t) {
if (Loc[t]!=NULL) delete [] Loc[t];
if (Mul[t]!=NULL) delete [] Mul[t];
if (TLoc[t]!=NULL) delete [] TLoc[t];
if (TMul[t]!=NULL) delete [] TMul[t];
}
if (Loc!=NULL) delete [] Loc;
if (Mul!=NULL) delete [] Mul;
if (countExp!=NULL) delete [] countExp;
if (TLoc!=NULL) delete [] TLoc;
if (TMul!=NULL) delete [] TMul;
#ifdef LOG_TIMINGS
std::cout << "- Permutation+Pad+Transfo took " << t_m2l_1 << "s"
<< "\n- Apply M2L in Fourier space took " << t_m2l_2 << "s"
<< "\n- Unpermutation+Untransfo+Unpad took " << t_m2l_3 << "s"
<< std::endl;
#endif
}
const SymM2LHandlerClass *const getPtrToSymM2LHandler() const
{ return SymM2LHandler.getPtr(); }
void P2M(CellClass* const LeafCell,
const ContainerClass* const SourceParticles)
{
const FPoint LeafCellCenter(AbstractBaseClass::getLeafCellCenter(LeafCell->getCoordinate()));
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
// 1) apply Sy
AbstractBaseClass::Interpolator->applyP2M(LeafCellCenter, AbstractBaseClass::BoxWidthLeaf,
LeafCell->getMultipole(idxRhs), SourceParticles);
// // 2) apply Discrete Fourier Transform
// SymM2LHandler->applyZeroPaddingAndDFT(LeafCell->getMultipole(idxRhs),
// LeafCell->getTransformedMultipole(idxRhs));
}
}
void M2M(CellClass* const FRestrict ParentCell,
const CellClass*const FRestrict *const FRestrict ChildCells,
const int /*TreeLevel*/)
{
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
// apply Sy
FBlas::scal(nnodes, FReal(0.), ParentCell->getMultipole(idxRhs));
for (unsigned int ChildIndex=0; ChildIndex < 8; ++ChildIndex){
if (ChildCells[ChildIndex]){
AbstractBaseClass::Interpolator->applyM2M(ChildIndex, ChildCells[ChildIndex]->getMultipole(idxRhs), ParentCell->getMultipole(idxRhs));
}
}
// // 2) Apply Discete Fourier Transform
// SymM2LHandler->applyZeroPaddingAndDFT(ParentCell->getMultipole(idxRhs),
// ParentCell->getTransformedMultipole(idxRhs));
}
}
void M2L(CellClass* const FRestrict TargetCell,
const CellClass* SourceCells[343],
const int /*NumSourceCells*/,
const int TreeLevel)
{
#ifdef LOG_TIMINGS
time.tic();
#endif
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
// permute and copy multipole expansion
memset(countExp, 0, sizeof(int) * 343);
for (unsigned int idx=0; idx<343; ++idx) {
if (SourceCells[idx]) {
const unsigned int pidx = SymM2LHandler->pindices[idx];
const unsigned int count = (countExp[pidx])++;
FReal *const mul = Mul[pidx] + count*nnodes;
const unsigned int *const pvec = SymM2LHandler->pvectors[idx];
const FReal *const MultiExp = SourceCells[idx]->getMultipole(idxRhs);
// explicit loop unrolling
for (unsigned int n=0; n<nnodes/4 * 4; n+=4) {
mul[pvec[n ]] = MultiExp[n];
mul[pvec[n+1]] = MultiExp[n+1];
mul[pvec[n+2]] = MultiExp[n+2];
mul[pvec[n+3]] = MultiExp[n+3];
}
for (unsigned int n=nnodes/4 * 4; n<nnodes; ++n){
mul[pvec[n]] = MultiExp[n];
}
// transform permuted expansion
FComplexe *const tmul = TMul[pidx] + count*rc;
///////////////////////////////////////////
FReal pmul[rc];
FBlas::setzero(rc,pmul);
// Apply Zero Padding
for (unsigned int i=0; i<nnodes; ++i)
pmul[node_diff[i*nnodes]]=mul[i];
// Apply forward Discrete Fourier Transform
Dft.applyDFT(pmul,tmul);
///////////////////////////////////////////
}
}
#ifdef LOG_TIMINGS
t_m2l_1 += time.tacAndElapsed();
#endif
#ifdef COUNT_BLOCKED_INTERACTIONS ////////////////////////////////////
unsigned int count_lidx = 0;
unsigned int count_interactions = 0;
for (unsigned int idx=0; idx<343; ++idx)
count_interactions += countExp[idx];
if (count_interactions==189) {
for (unsigned int idx=0; idx<343; ++idx) {
if (countExp[idx])
std::cout << "gidx = " << idx << " gives lidx = " << count_lidx++ << " and has "
<< countExp[idx] << " interactions" << std::endl;
}
std::cout << std::endl;
}
#endif ///////////////////////////////////////////////////////////////
#ifdef LOG_TIMINGS
time.tic();
#endif
// multiply (mat-mat-mul)
const FReal scale = AbstractBaseClass::MatrixKernel->getScaleFactor(AbstractBaseClass::BoxWidth, TreeLevel);
FComplexe tmpTLoc;
for (unsigned int pidx=0; pidx<343; ++pidx) {
const unsigned int count = countExp[pidx];
if (count) {
FComplexe *const tmpK=const_cast<FComplexe*>(SymM2LHandler->getK(TreeLevel, pidx));
for (unsigned int i=0; i<count; ++i){
//unsigned int i=count;
FComplexe *const ttmul = TMul[pidx] + i*rc;
FComplexe *const ttloc = TLoc[pidx] + i*rc;
// Perform entrywise product manually
for (unsigned int j=0; j<opt_rc; ++j){
tmpTLoc=tmpK[j];
tmpTLoc*=ttmul[j];
tmpTLoc*=scale;
ttloc[j]=tmpTLoc;
}
}
// // rank * count * (2*nnodes-1) flops
// FBlas::gemtm(nnodes, rank, count, FReal(1.),
// const_cast<FReal*>(SymM2LHandler->getK(TreeLevel, pidx))+rank*nnodes,
// nnodes, Mul[pidx], nnodes, Compressed, rank);
// // nnodes *count * (2*rank-1) flops
// FBlas::gemm( nnodes, rank, count, scale,
// const_cast<FReal*>(SymM2LHandler->getK(TreeLevel, pidx)),
// nnodes, Compressed, rank, Loc[pidx], nnodes);
}
}
#ifdef LOG_TIMINGS
t_m2l_2 += time.tacAndElapsed();
#endif
#ifdef LOG_TIMINGS
time.tic();
#endif
// permute and add contribution to local expansions
FReal *const LocalExpansion = TargetCell->getLocal(idxRhs);
memset(countExp, 0, sizeof(int) * 343);
for (unsigned int idx=0; idx<343; ++idx) {
if (SourceCells[idx]) {
const unsigned int pidx = SymM2LHandler->pindices[idx];
const unsigned int count = (countExp[pidx])++;
FReal *const loc = Loc[pidx] + count*nnodes;
const FComplexe *const tloc = TLoc[pidx] + count*rc;
///////////////////////////////////////////
FReal ploc[rc];
FBlas::setzero(rc,ploc);
// Apply forward Discrete Fourier Transform
Dft.applyIDFT(tloc,ploc);
// Unapply Zero Padding
for (unsigned int j=0; j<nnodes; ++j)
loc[j]=ploc[node_diff[nnodes-j-1]];
//loc[j]+=ploc[node_diff[nnodes-j-1]];
///////////////////////////////////////////
const unsigned int *const pvec = SymM2LHandler->pvectors[idx];
/*
// no loop unrolling
for (unsigned int n=0; n<nnodes; ++n)
LocalExpansion[n] += loc[pvec[n]];
*/
// explicit loop unrolling
for (unsigned int n=0; n<nnodes/4 * 4; n+=4) {
LocalExpansion[n ] += loc[pvec[n ]];
LocalExpansion[n+1] += loc[pvec[n+1]];
LocalExpansion[n+2] += loc[pvec[n+2]];
LocalExpansion[n+3] += loc[pvec[n+3]];
}
for (unsigned int n=nnodes/4 * 4; n<nnodes; ++n)
LocalExpansion[n] += loc[pvec[n]];
}
}