FChebInterpolator.hpp 72.5 KB
Newer Older
BRAMAS Berenger's avatar
BRAMAS Berenger committed
1
// ===================================================================================
COULAUD Olivier's avatar
COULAUD Olivier committed
2
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Berenger Bramas, Matthias Messner
3 4 5 6
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
7 8
// abiding by the rules of distribution of free software.
//
9 10 11 12
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public and CeCILL-C Licenses for more details.
13
// "http://www.cecill.info".
14
// "http://www.gnu.org/licenses".
BRAMAS Berenger's avatar
BRAMAS Berenger committed
15
// ===================================================================================
16 17 18 19
#ifndef FCHEBINTERPOLATOR_HPP
#define FCHEBINTERPOLATOR_HPP


20 21 22 23
#include "../Interpolation/FInterpMapping.hpp"
#include "../Interpolation/FInterpMatrixKernel.hpp" //PB
#include "FChebTensor.hpp"
#include "FChebRoots.hpp"
24

25
#include "Utils/FBlas.hpp"
26

27 28 29 30 31 32 33 34 35


/**
 * @author Matthias Messner (matthias.matthias@inria.fr)
 * Please read the license
 */

/**
 * @class FChebInterpolator
36
 *
37
 * The class @p FChebInterpolator defines the anterpolation (M2M) and
38
 * interpolation (L2L) concerning operations.
39 40 41
 *
 * PB: MatrixKernelClass is passed as template in order to inform interpolator
 * of the size of the vectorial interpolators. Default is the scalar
42
 * matrix kernel class of type ONE_OVER_R (NRHS=NLHS=1).
43
 */
44
template <int ORDER, class MatrixKernelClass = struct FInterpMatrixKernelR>
45 46 47
class FChebInterpolator : FNoCopyable
{
  // compile time constants and types
48 49
  enum {nnodes = TensorTraits<ORDER>::nnodes,
        nRhs = MatrixKernelClass::NRHS,
50 51
        nLhs = MatrixKernelClass::NLHS,
        nPV = MatrixKernelClass::NPV};
52 53 54
  typedef FChebRoots< ORDER>  BasisType;
  typedef FChebTensor<ORDER> TensorType;

55 56
protected: // PB for OptiDis

57
  FReal T_of_roots[ORDER][ORDER];
58
  FReal T[ORDER * (ORDER-1)];
59 60
  unsigned int node_ids[nnodes][3];

61
  // 8 Non-leaf (i.e. M2M/L2L) interpolators
62 63
  // x1 per level if box is extended
  // only 1 is required for all levels if extension is 0
64 65 66 67 68 69 70 71 72
  FReal*** ChildParentInterpolator;

  // Tree height (needed by M2M/L2L if cell width is extended)
  const int TreeHeight;
  // Root cell width (only used by M2M/L2L)
  const FReal RootCellWidth;
  // Cell width extension (only used by M2M/L2L, kernel handles extension for P2M/L2P)
  const FReal CellWidthExtension;

73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273

    // permutations (only needed in the tensor product interpolation case)
    unsigned int perm[3][nnodes];

    ////////////////////////////////////////////////////////////////////
    // needed for P2M
    struct IMN2MNI {
        enum {size = ORDER * (ORDER-1) * (ORDER-1)};
        unsigned int imn[size], mni[size];
        IMN2MNI() {
            unsigned int counter = 0;
            for (unsigned int i=0; i<ORDER; ++i) {
                for (unsigned int m=0; m<ORDER-1; ++m) {
                    for (unsigned int n=0; n<ORDER-1; ++n) {
                        imn[counter] = n*(ORDER-1)*ORDER + m*ORDER + i;
                        mni[counter] = i*(ORDER-1)*(ORDER-1) + n*(ORDER-1) + m;
                        counter++;
                    }
                }
            }
        }
    } perm0;

    struct JNI2NIJ {
        enum {size = ORDER * ORDER * (ORDER-1)};
        unsigned int jni[size], nij[size];
        JNI2NIJ() {
            unsigned int counter = 0;
            for (unsigned int i=0; i<ORDER; ++i) {
                for (unsigned int j=0; j<ORDER; ++j) {
                    for (unsigned int n=0; n<ORDER-1; ++n) {
                        jni[counter] = i*(ORDER-1)*ORDER + n*ORDER + j;
                        nij[counter] = j*ORDER*(ORDER-1) + i*(ORDER-1) + n;
                        counter++;
                    }
                }
            }
        }
    } perm1;

    struct KIJ2IJK {
        enum {size = ORDER * ORDER * ORDER};
        unsigned int kij[size], ijk[size];
        KIJ2IJK() {
            unsigned int counter = 0;
            for (unsigned int i=0; i<ORDER; ++i) {
                for (unsigned int j=0; j<ORDER; ++j) {
                    for (unsigned int k=0; k<ORDER; ++k) {
                        kij[counter] = j*ORDER*ORDER + i*ORDER + k;
                        ijk[counter] = k*ORDER*ORDER + j*ORDER + i;
                        counter++;
                    }
                }
            }
        }
    } perm2;
    ////////////////////////////////////////////////////////////////////

    ////////////////////////////////////////////////////////////////////
    // needed for L2P
    struct IJK2JKI {
        enum {size = ORDER * ORDER * ORDER};
        unsigned int ijk[size], jki[size];
        IJK2JKI() {
            unsigned int counter = 0;
            for (unsigned int i=0; i<ORDER; ++i) {
                for (unsigned int j=0; j<ORDER; ++j) {
                    for (unsigned int k=0; k<ORDER; ++k) {
                        ijk[counter] = k*ORDER*ORDER + j*ORDER + i;
                        jki[counter] = i*ORDER*ORDER + k*ORDER + j;
                        counter++;
                    }
                }
            }
        }
        void permute(const FReal *const IN, FReal *const OUT) const
        { for (unsigned int i=0; i<size; ++i) OUT[jki[i]] = IN[ijk[i]]; }
    } perm3;

    struct IJK2KIJ {
        enum {size = ORDER * ORDER * ORDER};
        unsigned int ijk[size], kij[size];
        IJK2KIJ() {
            unsigned int counter = 0;
            for (unsigned int i=0; i<ORDER; ++i) {
                for (unsigned int j=0; j<ORDER; ++j) {
                    for (unsigned int k=0; k<ORDER; ++k) {
                        ijk[counter] = k*ORDER*ORDER + j*ORDER + i;
                        kij[counter] = j*ORDER*ORDER + i*ORDER + k;
                        counter++;
                    }
                }
            }
        }
        void permute(const FReal *const IN, FReal *const OUT) const
        { for (unsigned int i=0; i<size; ++i) OUT[kij[i]] = IN[ijk[i]]; }
    } perm4;

    struct LJK2JKL {
        enum {size = (ORDER-1) * ORDER * ORDER};
        unsigned int ljk[size], jkl[size];
        LJK2JKL() {
            unsigned int counter = 0;
            for (unsigned int l=0; l<ORDER-1; ++l) {
                for (unsigned int j=0; j<ORDER; ++j) {
                    for (unsigned int k=0; k<ORDER; ++k) {
                        ljk[counter] = k*ORDER*(ORDER-1) + j*(ORDER-1) + l;
                        jkl[counter] = l*ORDER*ORDER + k*ORDER + j;
                        counter++;
                    }
                }
            }
        }
        void permute(const FReal *const IN, FReal *const OUT) const
        { for (unsigned int i=0; i<size; ++i) OUT[jkl[i]] = IN[ljk[i]]; }
    } perm5;

    struct LJK2KLJ {
        enum {size = (ORDER-1) * ORDER * ORDER};
        unsigned int ljk[size], klj[size];
        LJK2KLJ() {
            unsigned int counter = 0;
            for (unsigned int l=0; l<ORDER-1; ++l) {
                for (unsigned int j=0; j<ORDER; ++j) {
                    for (unsigned int k=0; k<ORDER; ++k) {
                        ljk[counter] = k*ORDER*(ORDER-1) + j*(ORDER-1) + l;
                        klj[counter] = j*(ORDER-1)*ORDER + l*ORDER + k;
                        counter++;
                    }
                }
            }
        }
        void permute(const FReal *const IN, FReal *const OUT) const
        { for (unsigned int i=0; i<size; ++i) OUT[klj[i]] = IN[ljk[i]]; }
    } perm6;

    struct MKI2KIM {
        enum {size = (ORDER-1) * ORDER * ORDER};
        unsigned int mki[size], kim[size];
        MKI2KIM() {
            unsigned int counter = 0;
            for (unsigned int m=0; m<ORDER-1; ++m) {
                for (unsigned int k=0; k<ORDER; ++k) {
                    for (unsigned int i=0; i<ORDER; ++i) {
                        mki[counter] = i*ORDER*(ORDER-1) + k*(ORDER-1) + m;
                        kim[counter] = m*ORDER*ORDER + i*ORDER + k;
                        counter++;
                    }
                }
            }
        }
        void permute(const FReal *const IN, FReal *const OUT) const
        { for (unsigned int i=0; i<size; ++i) OUT[kim[i]] = IN[mki[i]]; }
    } perm7;

    struct MKL2KLM {
        enum {size = (ORDER-1) * ORDER * (ORDER-1)};
        unsigned int mkl[size], klm[size];
        MKL2KLM() {
            unsigned int counter = 0;
            for (unsigned int m=0; m<ORDER-1; ++m) {
                for (unsigned int k=0; k<ORDER; ++k) {
                    for (unsigned int l=0; l<ORDER-1; ++l) {
                        mkl[counter] = l*ORDER*(ORDER-1) + k*(ORDER-1) + m;
                        klm[counter] = m*(ORDER-1)*ORDER + l*ORDER + k;
                        counter++;
                    }
                }
            }
        }
        void permute(const FReal *const IN, FReal *const OUT) const
        { for (unsigned int i=0; i<size; ++i) OUT[klm[i]] = IN[mkl[i]]; }
    } perm8;

    struct NLM2LMN {
        enum {size = (ORDER-1) * (ORDER-1) * (ORDER-1)};
        unsigned int nlm[size], lmn[size];
        NLM2LMN() {
            unsigned int counter = 0;
            for (unsigned int n=0; n<ORDER-1; ++n) {
                for (unsigned int l=0; l<ORDER-1; ++l) {
                    for (unsigned int m=0; m<ORDER-1; ++m) {
                        nlm[counter] = m*(ORDER-1)*(ORDER-1) + l*(ORDER-1) + n;
                        lmn[counter] = n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l;
                        counter++;
                    }
                }
            }
        }
        void permute(const FReal *const IN, FReal *const OUT) const
        { for (unsigned int i=0; i<size; ++i) OUT[lmn[i]] = IN[nlm[i]]; }
    } perm9;

    ////////////////////////////////////////////////////////////////////



    /**
     * Initialize the child - parent - interpolator, it is basically the matrix
     * S which is precomputed and reused for all M2M and L2L operations, ie for
     * all non leaf inter/anterpolations.
274 275
     * This is a sub-optimal version : complexity p^6.
     * This function handles extended cells.
276
     */
277 278 279
  void initM2MandL2L(const int TreeLevel, const FReal ParentWidth)
  {
    FPoint ChildRoots[nnodes];
280

281
    // Ratio of extended cell widths (definition: child ext / parent ext)
282
    const FReal ExtendedCellRatio =
283
      FReal(FReal(ParentWidth)/FReal(2.) + CellWidthExtension) / FReal(ParentWidth + CellWidthExtension);
284

285 286 287
    // Child cell center and width
    FPoint ChildCenter;
    const FReal ChildWidth(2.*ExtendedCellRatio);
288

289 290
    // loop: child cells
    for (unsigned int child=0; child<8; ++child) {
291

292 293
      // allocate memory
      ChildParentInterpolator[TreeLevel][child] = new FReal [nnodes * nnodes];
294

295 296 297
      // set child info
      FChebTensor<ORDER>::setRelativeChildCenter(child, ChildCenter, ExtendedCellRatio);
      FChebTensor<ORDER>::setRoots(ChildCenter, ChildWidth, ChildRoots);
298

299 300 301 302 303
      // assemble child - parent - interpolator
      assembleInterpolator(nnodes, ChildRoots, ChildParentInterpolator[TreeLevel][child]);
    }
  }

304

305 306 307 308 309 310 311 312 313 314
  /**
   * Initialize the child - parent - interpolator, it is basically the matrix
   * S which is precomputed and reused for all M2M and L2L operations, ie for
   * all non leaf inter/anterpolations.
   * This is a more optimal version : complexity p^4.
   * This function handles extended cells.
   *
   */
  void initTensorM2MandL2L(const int TreeLevel, const FReal ParentWidth)
  {
315
    FReal ChildCoords[3][ORDER];
316
    FPoint ChildCenter;
317

318
    // Ratio of extended cell widths (definition: child ext / parent ext)
319
    const FReal ExtendedCellRatio =
320 321 322 323 324 325 326 327
      FReal(FReal(ParentWidth)/FReal(2.) + CellWidthExtension) / FReal(ParentWidth + CellWidthExtension);

    // Child cell width
    const FReal ChildWidth(2.*ExtendedCellRatio);

    // loop: child cells
    for (unsigned int child=0; child<8; ++child) {

328
      // set child info
329 330 331 332 333 334 335 336
      FChebTensor<ORDER>::setRelativeChildCenter(child, ChildCenter, ExtendedCellRatio);
      FChebTensor<ORDER>::setPolynomialsRoots(ChildCenter, ChildWidth, ChildCoords);
      // allocate memory
      ChildParentInterpolator[TreeLevel][child] = new FReal [3 * ORDER*ORDER];
      assembleInterpolator(ORDER, ChildCoords[0], ChildParentInterpolator[TreeLevel][child]);
      assembleInterpolator(ORDER, ChildCoords[1], ChildParentInterpolator[TreeLevel][child] + 1 * ORDER*ORDER);
      assembleInterpolator(ORDER, ChildCoords[2], ChildParentInterpolator[TreeLevel][child] + 2 * ORDER*ORDER);
    }
337 338


339 340 341 342 343 344 345 346
    // init permutations
    for (unsigned int i=0; i<ORDER; ++i) {
      for (unsigned int j=0; j<ORDER; ++j) {
        for (unsigned int k=0; k<ORDER; ++k) {
          const unsigned int index = k*ORDER*ORDER + j*ORDER + i;
          perm[0][index] = k*ORDER*ORDER + j*ORDER + i;
          perm[1][index] = i*ORDER*ORDER + k*ORDER + j;
          perm[2][index] = j*ORDER*ORDER + i*ORDER + k;
347
        }
348
      }
349
    }
350

351 352
  }

353 354 355


public:
356 357 358
    /**
     * Constructor: Initialize the Chebyshev polynomials at the Chebyshev
     * roots/interpolation point
359 360 361
     *
     * PB: Input parameters ONLY affect the computation of the M2M/L2L ops.
     * These parameters are ONLY required in the context of extended bbox.
362
     * If no M2M/L2L is required then the interpolator can be built with
363
     * the default ctor.
364
     */
365
  explicit FChebInterpolator(const int inTreeHeight=3,
366 367 368
                             const FReal inRootCellWidth=FReal(1.),
                             const FReal inCellWidthExtension=FReal(0.))
  : TreeHeight(inTreeHeight),
369 370
    RootCellWidth(inRootCellWidth),
    CellWidthExtension(inCellWidthExtension)
371 372
    {
        // initialize chebyshev polynomials of root nodes: T_o(x_j)
373 374
    for (unsigned int o=1; o<ORDER; ++o)
      for (unsigned int j=0; j<ORDER; ++j)
messner's avatar
messner committed
375
        T_of_roots[o][j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
376

377
        // initialize chebyshev polynomials of root nodes: T_o(x_j)
378 379 380
    for (unsigned int o=1; o<ORDER; ++o)
      for (unsigned int j=0; j<ORDER; ++j)
        T[(o-1)*ORDER + j] = FReal(BasisType::T(o, FReal(BasisType::roots[j])));
381 382 383 384 385


        // initialize root node ids
        TensorType::setNodeIds(node_ids);

386 387 388 389
        // initialize interpolation operator for M2M/L2L (non leaf operations)

        // allocate 8 arrays per level
        ChildParentInterpolator = new FReal**[TreeHeight];
COULAUD Olivier's avatar
COULAUD Olivier committed
390
        for (unsigned int l=0; l<static_cast<unsigned int>(TreeHeight); ++l){
391 392
          ChildParentInterpolator[l] = new FReal*[8];
          for (unsigned int c=0; c<8; ++c)
393
            ChildParentInterpolator[l][c]=nullptr;
394 395 396 397 398 399
        }

        // Set number of non-leaf ios that actually need to be computed
        unsigned int reducedTreeHeight; // = 2 + nb of computed nl ios
        if(CellWidthExtension==0.) // if no cell extension, then ...
          reducedTreeHeight = 3; // cmp only 1 non-leaf io
400
        else
401 402 403 404 405
          reducedTreeHeight = TreeHeight; // cmp 1 non-leaf io per level

        // Init non-leaf interpolators
        FReal CellWidth = RootCellWidth / FReal(2.); // at level 1
        CellWidth /= FReal(2.);                      // at level 2
406

407
        for (unsigned int l=2; l<reducedTreeHeight; ++l) {
408

409 410 411 412
          //this -> initM2MandL2L(l,CellWidth);     // non tensor-product interpolation
          this -> initTensorM2MandL2L(l,CellWidth); // tensor-product interpolation

          // update cell width
413
          CellWidth /= FReal(2.);                    // at level l+1
414
        }
415 416 417 418 419 420 421 422
    }


    /**
     * Destructor: Delete dynamically allocated memory for M2M and L2L operator
     */
    ~FChebInterpolator()
    {
423 424 425 426 427 428 429 430 431
        for (unsigned int l=0; l<static_cast<unsigned int>(TreeHeight); ++l){
            for (unsigned int child=0; child<8; ++child){
                if(ChildParentInterpolator[l][child] != nullptr){
                    delete [] ChildParentInterpolator[l][child];
                }
            }
            delete[] ChildParentInterpolator[l];
        }
        delete[] ChildParentInterpolator;
432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501
    }


    /**
     * Assembles the interpolator \f$S_\ell\f$ of size \f$N\times
     * \ell^3\f$. Here local points is meant as points whose global coordinates
     * have already been mapped to the reference interval [-1,1].
     *
     * @param[in] NumberOfLocalPoints
     * @param[in] LocalPoints
     * @param[out] Interpolator
     */
    void assembleInterpolator(const unsigned int NumberOfLocalPoints,
                  const FPoint *const LocalPoints,
                  FReal *const Interpolator) const
    {
        // values of chebyshev polynomials of source particle: T_o(x_i)
        FReal T_of_x[ORDER][3];
        // loop: local points (mapped in [-1,1])
        for (unsigned int m=0; m<NumberOfLocalPoints; ++m) {
            // evaluate chebyshev polynomials at local points
            for (unsigned int o=1; o<ORDER; ++o) {
                T_of_x[o][0] = BasisType::T(o, LocalPoints[m].getX());
                T_of_x[o][1] = BasisType::T(o, LocalPoints[m].getY());
                T_of_x[o][2] = BasisType::T(o, LocalPoints[m].getZ());
            }

            // assemble interpolator
            for (unsigned int n=0; n<nnodes; ++n) {
                //Interpolator[n*nnodes + m] = FReal(1.);
                Interpolator[n*NumberOfLocalPoints + m] = FReal(1.);
                for (unsigned int d=0; d<3; ++d) {
                    const unsigned int j = node_ids[n][d];
                    FReal S_d = FReal(1.) / ORDER;
                    for (unsigned int o=1; o<ORDER; ++o)
                        S_d += FReal(2.) / ORDER * T_of_x[o][d] * T_of_roots[o][j];
                    //Interpolator[n*nnodes + m] *= S_d;
                    Interpolator[n*NumberOfLocalPoints + m] *= S_d;
                }

            }

        }

    }


    void assembleInterpolator(const unsigned int M, const FReal *const x, FReal *const S) const
    {
        // values of chebyshev polynomials of source particle: T_o(x_i)
        FReal T_of_x[ORDER];

        // loop: local points (mapped in [-1,1])
        for (unsigned int m=0; m<M; ++m) {
            // evaluate chebyshev polynomials at local points
            for (unsigned int o=1; o<ORDER; ++o)
                T_of_x[o] = BasisType::T(o, x[m]);

            for (unsigned int n=0; n<ORDER; ++n) {
                S[n*M + m] = FReal(1.) / ORDER;
                for (unsigned int o=1; o<ORDER; ++o)
                    S[n*M + m] += FReal(2.) / ORDER * T_of_x[o] * T_of_roots[o][n];
            }

        }

    }



502
    const unsigned int * getPermutationsM2ML2L(unsigned int i) const
503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573
    { return perm[i]; }






    /**
     * Particle to moment: application of \f$S_\ell(y,\bar y_n)\f$
     * (anterpolation, it is the transposed interpolation)
     */
    template <class ContainerClass>
    void applyP2M(const FPoint& center,
                                const FReal width,
                                FReal *const multipoleExpansion,
                                const ContainerClass *const sourceParticles) const;



    /**
     * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
     */
    template <class ContainerClass>
    void applyL2P(const FPoint& center,
                                const FReal width,
                                const FReal *const localExpansion,
                                ContainerClass *const localParticles) const;


    /**
     * Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
     */
    template <class ContainerClass>
    void applyL2PGradient(const FPoint& center,
                                                const FReal width,
                                                const FReal *const localExpansion,
                                                ContainerClass *const localParticles) const;

    /**
     * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ and
     * \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
     */
    template <class ContainerClass>
    void applyL2PTotal(const FPoint& center,
                                         const FReal width,
                                         const FReal *const localExpansion,
                                         ContainerClass *const localParticles) const;


    /*
    void applyM2M(const unsigned int ChildIndex,
                                const FReal *const ChildExpansion,
                                FReal *const ParentExpansion) const
    {
        FBlas::gemtva(nnodes, nnodes, FReal(1.),
                                    ChildParentInterpolator[ChildIndex],
                                    const_cast<FReal*>(ChildExpansion), ParentExpansion);
    }

    void applyL2L(const unsigned int ChildIndex,
                                const FReal *const ParentExpansion,
                                FReal *const ChildExpansion) const
    {
        FBlas::gemva(nnodes, nnodes, FReal(1.),
                                 ChildParentInterpolator[ChildIndex],
                                 const_cast<FReal*>(ParentExpansion), ChildExpansion);
    }
    */


    void applyM2M(const unsigned int ChildIndex,
574 575 576
                  const FReal *const ChildExpansion,
                  FReal *const ParentExpansion,
                  const unsigned int TreeLevel = 2) const
577 578 579 580
    {
        FReal Exp[nnodes], PermExp[nnodes];
        // ORDER*ORDER*ORDER * (2*ORDER-1)
        FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
581
                                 ChildParentInterpolator[TreeLevel][ChildIndex], ORDER,
582 583 584 585 586
                                 const_cast<FReal*>(ChildExpansion), ORDER, PermExp, ORDER);

        for (unsigned int n=0; n<nnodes; ++n)	Exp[n] = PermExp[perm[1][n]];
        // ORDER*ORDER*ORDER * (2*ORDER-1)
        FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
587
                                 ChildParentInterpolator[TreeLevel][ChildIndex] + 2 * ORDER*ORDER, ORDER,
588 589 590 591 592
                                 Exp, ORDER, PermExp, ORDER);

        for (unsigned int n=0; n<nnodes; ++n)	Exp[perm[1][n]] = PermExp[perm[2][n]];
        // ORDER*ORDER*ORDER * (2*ORDER-1)
        FBlas::gemtm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
593
                                 ChildParentInterpolator[TreeLevel][ChildIndex] + 1 * ORDER*ORDER, ORDER,
594 595 596 597 598 599 600
                                 Exp, ORDER, PermExp, ORDER);

        for (unsigned int n=0; n<nnodes; ++n)	ParentExpansion[perm[2][n]] += PermExp[n];
    }


    void applyL2L(const unsigned int ChildIndex,
601 602 603
                  const FReal *const ParentExpansion,
                  FReal *const ChildExpansion,
                  const unsigned int TreeLevel = 2) const
604 605 606 607
    {
        FReal Exp[nnodes], PermExp[nnodes];
        // ORDER*ORDER*ORDER * (2*ORDER-1)
        FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
608
                                ChildParentInterpolator[TreeLevel][ChildIndex], ORDER,
609 610 611 612 613
                                const_cast<FReal*>(ParentExpansion), ORDER, PermExp, ORDER);

        for (unsigned int n=0; n<nnodes; ++n)	Exp[n] = PermExp[perm[1][n]];
        // ORDER*ORDER*ORDER * (2*ORDER-1)
        FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
614
                                ChildParentInterpolator[TreeLevel][ChildIndex] + 2 * ORDER*ORDER, ORDER,
615 616 617 618 619
                                Exp, ORDER, PermExp, ORDER);

        for (unsigned int n=0; n<nnodes; ++n)	Exp[perm[1][n]] = PermExp[perm[2][n]];
        // ORDER*ORDER*ORDER * (2*ORDER-1)
        FBlas::gemm(ORDER, ORDER, ORDER*ORDER, FReal(1.),
620
                                ChildParentInterpolator[TreeLevel][ChildIndex] + 1 * ORDER*ORDER, ORDER,
621 622 623 624 625
                                Exp, ORDER, PermExp, ORDER);

        for (unsigned int n=0; n<nnodes; ++n)	ChildExpansion[perm[2][n]] += PermExp[n];
    }
    // total flops count: 3 * ORDER*ORDER*ORDER * (2*ORDER-1)
626
};
627 628 629



630 631 632 633 634 635 636 637




/**
 * Particle to moment: application of \f$S_\ell(y,\bar y_n)\f$
 * (anterpolation, it is the transposed interpolation)
 */
638
template <int ORDER, class MatrixKernelClass>
639
template <class ContainerClass>
640 641 642 643
inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyP2M(const FPoint& center,
                                                                 const FReal width,
                                                                 FReal *const multipoleExpansion,
                                                                 const ContainerClass *const inParticles) const
644
{
645 646 647 648 649

    // allocate stuff
    const map_glob_loc map(center, width);
    FPoint localPosition;

650
    // Init W
651 652 653 654 655 656 657 658 659 660 661
    FReal W1[nRhs];
    FReal W2[nRhs][3][ ORDER-1];
    FReal W4[nRhs][3][(ORDER-1)*(ORDER-1)];
    FReal W8[nRhs][   (ORDER-1)*(ORDER-1)*(ORDER-1)];
    for(int idxRhs = 0 ; idxRhs < nRhs ; ++idxRhs){
      W1[idxRhs] = FReal(0.);
      for(unsigned int i=0; i<(ORDER-1); ++i) W2[idxRhs][0][i] = W2[idxRhs][1][i] = W2[idxRhs][2][i] = FReal(0.);
      for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i)	W4[idxRhs][0][i] = W4[idxRhs][1][i] = W4[idxRhs][2][i] = FReal(0.);
      for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i)	W8[idxRhs][i] = FReal(0.);
    }

662 663

    // loop over source particles
664
//    const FReal*const physicalValues = inParticles->getPhysicalValues();
665 666 667
    const FReal*const positionsX = inParticles->getPositions()[0];
    const FReal*const positionsY = inParticles->getPositions()[1];
    const FReal*const positionsZ = inParticles->getPositions()[2];
668

669
    for(int idxPart = 0 ; idxPart < inParticles->getNbParticles() ; ++idxPart){
670
        // map global position to [-1,1]
671
        map(FPoint(positionsX[idxPart],positionsY[idxPart],positionsZ[idxPart]), localPosition); // 15 flops
672 673 674 675 676 677 678 679 680 681 682 683 684

        FReal T_of_x[3][ORDER];
        T_of_x[0][0] = FReal(1.); T_of_x[0][1] = localPosition.getX();
        T_of_x[1][0] = FReal(1.); T_of_x[1][1] = localPosition.getY();
        T_of_x[2][0] = FReal(1.); T_of_x[2][1] = localPosition.getZ();
        const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop
        const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop
        const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop
        for (unsigned int j=2; j<ORDER; ++j) {
            T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops
            T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops
            T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops
        }
685 686
        for(int idxRhs = 0 ; idxRhs < nRhs ; ++idxRhs){
          const FReal*const physicalValues = inParticles->getPhysicalValues(idxRhs);
687

688 689 690
          const FReal weight = physicalValues[idxPart];
          W1[idxRhs] += weight; // 1 flop
          for (unsigned int i=1; i<ORDER; ++i) {
691 692 693
            const FReal wx = weight * T_of_x[0][i]; // 1 flop
            const FReal wy = weight * T_of_x[1][i]; // 1 flop
            const FReal wz = weight * T_of_x[2][i]; // 1 flop
694 695 696
            W2[idxRhs][0][i-1] += wx; // 1 flop
            W2[idxRhs][1][i-1] += wy; // 1 flop
            W2[idxRhs][2][i-1] += wz; // 1 flop
697
            for (unsigned int j=1; j<ORDER; ++j) {
698 699 700 701 702 703 704 705 706 707
              const FReal wxy = wx * T_of_x[1][j]; // 1 flop
              const FReal wxz = wx * T_of_x[2][j]; // 1 flop
              const FReal wyz = wy * T_of_x[2][j]; // 1 flop
              W4[idxRhs][0][(j-1)*(ORDER-1) + (i-1)] += wxy; // 1 flop
              W4[idxRhs][1][(j-1)*(ORDER-1) + (i-1)] += wxz; // 1 flop
              W4[idxRhs][2][(j-1)*(ORDER-1) + (i-1)] += wyz; // 1 flop
              for (unsigned int k=1; k<ORDER; ++k) {
                const FReal wxyz = wxy * T_of_x[2][k]; // 1 flop
                W8[idxRhs][(k-1)*(ORDER-1)*(ORDER-1) + (j-1)*(ORDER-1) + (i-1)] += wxyz; // 1 flop
              } // flops: (ORDER-1) * 2
708
            } // flops: (ORDER-1) * (6 + (ORDER-1) * 2)
709
          } // flops: (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2))
710
        } // flops: ... * NRHS
711 712 713 714
    } // flops: N * (18 + (ORDER-2) * 6 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1) * 2)))

    ////////////////////////////////////////////////////////////////////

715 716 717

    for(int idxRhs = 0 ; idxRhs < nRhs ; ++idxRhs){

718 719 720 721 722 723
    // loop over interpolation points
    FReal F2[3][ORDER];
    FReal F4[3][ORDER*ORDER];
    FReal F8[   ORDER*ORDER*ORDER];
    {
        // compute W2: 3 * ORDER*(2*(ORDER-1)-1) flops
724 725 726
        FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxRhs][0], F2[0]);
        FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxRhs][1], F2[1]);
        FBlas::gemv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), W2[idxRhs][2], F2[2]);
727 728 729

        // compute W4: 3 * [ORDER*(ORDER-1)*(2*(ORDER-1)-1) + ORDER*ORDER*(2*(ORDER-1)-1)]
        FReal C[ORDER * (ORDER-1)];
730
        FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxRhs][0], ORDER-1, C,     ORDER);
731
        FBlas::gemmt(ORDER, ORDER-1, ORDER,   FReal(1.), const_cast<FReal*>(T), ORDER, C,     ORDER,   F4[0], ORDER);
732
        FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxRhs][1], ORDER-1, C,     ORDER);
733
        FBlas::gemmt(ORDER, ORDER-1, ORDER,   FReal(1.), const_cast<FReal*>(T), ORDER, C,     ORDER,   F4[1], ORDER);
734
        FBlas::gemmt(ORDER, ORDER-1, ORDER-1, FReal(1.), const_cast<FReal*>(T), ORDER, W4[idxRhs][2], ORDER-1, C,     ORDER);
735 736 737 738
        FBlas::gemmt(ORDER, ORDER-1, ORDER,   FReal(1.), const_cast<FReal*>(T), ORDER, C,     ORDER,   F4[2], ORDER);

        // compute W8: 3 * (2*(ORDER-1)-1) * [ORDER*(ORDER-1)*(ORDER-1) + ORDER*ORDER*(ORDER-1) + ORDER*ORDER*ORDER]
        FReal D[ORDER * (ORDER-1) * (ORDER-1)];
739
        FBlas::gemm(ORDER, ORDER-1, (ORDER-1)*(ORDER-1), FReal(1.),	const_cast<FReal*>(T), ORDER, W8[idxRhs], ORDER-1, D, ORDER);
740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755
        FReal E[(ORDER-1) * (ORDER-1) * ORDER];
        for (unsigned int s=0; s<perm0.size; ++s)	E[perm0.mni[s]] = D[perm0.imn[s]];
        FReal F[ORDER * (ORDER-1) * ORDER];
        FBlas::gemm(ORDER, ORDER-1, ORDER*(ORDER-1), FReal(1.), const_cast<FReal*>(T), ORDER, E, ORDER-1, F, ORDER);
        FReal G[(ORDER-1) * ORDER * ORDER];
        for (unsigned int s=0; s<perm1.size; ++s)	G[perm1.nij[s]] = F[perm1.jni[s]];
        FReal H[ORDER * ORDER * ORDER];
        FBlas::gemm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER, G, ORDER-1, H, ORDER);
        for (unsigned int s=0; s<perm2.size; ++s)	F8[perm2.ijk[s]] = H[perm2.kij[s]];
    }

    // assemble multipole expansions
    for (unsigned int i=0; i<ORDER; ++i) {
        for (unsigned int j=0; j<ORDER; ++j) {
            for (unsigned int k=0; k<ORDER; ++k) {
                const unsigned int idx = k*ORDER*ORDER + j*ORDER + i;
756
                multipoleExpansion[2*idxRhs*nnodes + idx] += (W1[idxRhs] +
757 758 759 760 761 762
                                                                     FReal(2.) * (F2[0][i] + F2[1][j] + F2[2][k]) +
                                                                     FReal(4.) * (F4[0][j*ORDER+i] + F4[1][k*ORDER+i] + F4[2][k*ORDER+j]) +
                                                                     FReal(8.) *  F8[idx]) / nnodes; // 11 * ORDER*ORDER*ORDER flops
            }
        }
    }
763

764 765
    } // NRHS

766 767 768
}


769 770 771 772 773 774 775
///**
// * Particle to moment: application of \f$S_\ell(y,\bar y_n)\f$
// * (anterpolation, it is the transposed interpolation)
// */
//template <int ORDER>
//template <class ContainerClass>
//inline void FChebInterpolator<ORDER>::applyP2M(const FPoint& center,
776 777 778
//                                                                                                                                                                                       const FReal width,
//                                                                                                                                                                                       FReal *const multipoleExpansion,
//                                                                                                                                                                                       const ContainerClass *const sourceParticles) const
779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
//{
//	// set all multipole expansions to zero
//	FBlas::setzero(nnodes, multipoleExpansion);
//
//	// allocate stuff
//	const map_glob_loc map(center, width);
//	FPoint localPosition;
//	FReal T_of_x[ORDER][3];
//	FReal S[3], c1;
//	//
//	FReal xpx,ypy,zpz ;
//	c1 = FReal(8.) / nnodes ; // 1 flop
//	// loop over source particles
//	typename ContainerClass::ConstBasicIterator iter(*sourceParticles);
//	while(iter.hasNotFinished()){
//
//		// map global position to [-1,1]
//		map(iter.data().getPosition(), localPosition); // 15 flops
//
//		// evaluate chebyshev polynomials of source particle: T_o(x_i)
//		T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
//		T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
//		T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
//		xpx = FReal(2.) * localPosition.getX() ; // 1 flop
//		ypy = FReal(2.) * localPosition.getY() ; // 1 flop
//		zpz = FReal(2.) * localPosition.getZ() ; // 1 flop
//
//		for (unsigned int o=2; o<ORDER; ++o) {
//			T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flops
//			T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1];	// 2 flops
//			T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flops
//		} // flops: (ORDER-1) * 6
811
//
812 813 814 815
//		// anterpolate
//		const FReal sourceValue = iter.data().getPhysicalValue();
//		for (unsigned int n=0; n<nnodes; ++n) {
//			const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
816
//			S[0] = FReal(0.5) + T_of_x[1][0] * T_of_roots[1][j[0]]; // 2 flops
817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835
//			S[1] = FReal(0.5) + T_of_x[1][1] * T_of_roots[1][j[1]]; // 2 flops
//			S[2] = FReal(0.5) + T_of_x[1][2] * T_of_roots[1][j[2]]; // 2 flops
//			for (unsigned int o=2; o<ORDER; ++o) {
//				S[0] += T_of_x[o][0] * T_of_roots[o][j[0]]; // 2 flops
//				S[1] += T_of_x[o][1] * T_of_roots[o][j[1]]; // 2 flops
//				S[2] += T_of_x[o][2] * T_of_roots[o][j[2]]; // 2 flops
//			} // flops: (ORDER-2) * 6
//
//			// gather contributions
//			multipoleExpansion[n]	+= c1 *	S[0] * S[1] * S[2] *	sourceValue; // 4 flops
//		} // flops: ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6)
//
//		// increment source iterator
//		iter.gotoNext();
//	} // flops: M * (18 + (ORDER-1) * 6 + ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6))
//}



836 837 838
/**
 * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
 */
839
template <int ORDER, class MatrixKernelClass>
840
template <class ContainerClass>
841
inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2P(const FPoint& center,
842 843 844
                                             const FReal width,
                                             const FReal *const localExpansion,
                                             ContainerClass *const inParticles) const
845
{
846 847 848 849
    FReal f1[nLhs];
    FReal W2[nLhs][3][ ORDER-1];
    FReal W4[nLhs][3][(ORDER-1)*(ORDER-1)];
    FReal W8[nLhs][   (ORDER-1)*(ORDER-1)*(ORDER-1)];
850
    {
851 852 853 854
      for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs){

      // sum over interpolation points
        f1[idxLhs] = FReal(0.);
855
        for(unsigned int i=0; i<ORDER-1; ++i)                      W2[idxLhs][0][i] = W2[idxLhs][1][i] = W2[idxLhs][2][i] = FReal(0.);
856 857
        for(unsigned int i=0; i<(ORDER-1)*(ORDER-1); ++i)        W4[idxLhs][0][i] = W4[idxLhs][1][i] = W4[idxLhs][2][i] = FReal(0.);
        for(unsigned int i=0; i<(ORDER-1)*(ORDER-1)*(ORDER-1); ++i)	W8[idxLhs][i] = FReal(0.);
858 859 860 861 862 863

        for (unsigned int idx=0; idx<nnodes; ++idx) {
            const unsigned int i = node_ids[idx][0];
            const unsigned int j = node_ids[idx][1];
            const unsigned int k = node_ids[idx][2];

864
            f1[idxLhs] += localExpansion[2*idxLhs*nnodes + idx]; // 1 flop
865 866

            for (unsigned int l=0; l<ORDER-1; ++l) {
867 868 869 870 871 872
                const FReal wx = T[l*ORDER+i] * localExpansion[2*idxLhs*nnodes + idx]; // 1 flops
                const FReal wy = T[l*ORDER+j] * localExpansion[2*idxLhs*nnodes + idx]; // 1 flops
                const FReal wz = T[l*ORDER+k] * localExpansion[2*idxLhs*nnodes + idx]; // 1 flops
                W2[idxLhs][0][l] += wx; // 1 flops
                W2[idxLhs][1][l] += wy; // 1 flops
                W2[idxLhs][2][l] += wz; // 1 flops
873 874 875 876
                for (unsigned int m=0; m<ORDER-1; ++m) {
                    const FReal wxy = wx * T[m*ORDER + j]; // 1 flops
                    const FReal wxz = wx * T[m*ORDER + k]; // 1 flops
                    const FReal wyz = wy * T[m*ORDER + k]; // 1 flops
877 878 879
                    W4[idxLhs][0][m*(ORDER-1)+l] += wxy; // 1 flops
                    W4[idxLhs][1][m*(ORDER-1)+l] += wxz; // 1 flops
                    W4[idxLhs][2][m*(ORDER-1)+l] += wyz; // 1 flops
880 881
                    for (unsigned int n=0; n<ORDER-1; ++n) {
                        const FReal wxyz = wxy * T[n*ORDER + k]; // 1 flops
882
                        W8[idxLhs][n*(ORDER-1)*(ORDER-1) + m*(ORDER-1) + l]	+= wxyz; // 1 flops
883 884 885 886
                    } // (ORDER-1) * 2 flops
                } // (ORDER-1) * (6 + (ORDER-1)*2) flops
            } // (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2)) flops
        } // ORDER*ORDER*ORDER * (1 + (ORDER-1) * (6 + (ORDER-1) * (6 + (ORDER-1)*2))) flops
887
      } // NLHS
888 889 890 891 892 893
    }


    // loop over particles
    const map_glob_loc map(center, width);
    FPoint localPosition;
894 895 896 897 898 899 900 901

    //const FReal*const physicalValues = inParticles->getPhysicalValues();
    const FReal*const positionsX = inParticles->getPositions()[0];
    const FReal*const positionsY = inParticles->getPositions()[1];
    const FReal*const positionsZ = inParticles->getPositions()[2];
    //FReal*const forcesX = inParticles->getForcesX();
    //FReal*const forcesY = inParticles->getForcesY();
    //FReal*const forcesZ = inParticles->getForcesZ();
902
//    FReal*const potentials = inParticles->getPotentials();
903 904

    for(int idxPart = 0 ; idxPart < inParticles->getNbParticles() ; ++ idxPart){
905

906 907
      // map global position to [-1,1]
      map(FPoint(positionsX[idxPart],positionsY[idxPart],positionsZ[idxPart]), localPosition); // 15 flops
908

909 910 911 912 913 914 915 916 917 918 919 920
      FReal T_of_x[3][ORDER];
      {
        T_of_x[0][0] = FReal(1.); T_of_x[0][1] = localPosition.getX();
        T_of_x[1][0] = FReal(1.); T_of_x[1][1] = localPosition.getY();
        T_of_x[2][0] = FReal(1.); T_of_x[2][1] = localPosition.getZ();
        const FReal x2 = FReal(2.) * T_of_x[0][1]; // 1 flop
        const FReal y2 = FReal(2.) * T_of_x[1][1]; // 1 flop
        const FReal z2 = FReal(2.) * T_of_x[2][1]; // 1 flop
        for (unsigned int j=2; j<ORDER; ++j) {
          T_of_x[0][j] = x2 * T_of_x[0][j-1] - T_of_x[0][j-2]; // 2 flops
          T_of_x[1][j] = y2 * T_of_x[1][j-1] - T_of_x[1][j-2]; // 2 flops
          T_of_x[2][j] = z2 * T_of_x[2][j-1] - T_of_x[2][j-2]; // 2 flops
921
        }
922 923
      }

924
      for(int idxLhs = 0 ; idxLhs < nLhs ; ++idxLhs){
925 926
        // distribution over potential components:
        // We sum the multidim contribution of PhysValue
927
        // This was originally done at M2L step but moved here
928 929
        // because their storage is required by the force computation.
        // In fact : f_{ik}(x)=w_j(x) \nabla_{x_i} K_{ij}(x,y)w_j(y))
930
        const unsigned int idxPot = idxLhs / nPV;
931 932

        FReal*const potentials = inParticles->getPotentials(idxPot);
933 934

        // interpolate and increment target value
935
        FReal targetValue = potentials[idxPart];
936
        {
937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958
          FReal f2, f4, f8;
          {
            f2 = f4 = f8 = FReal(0.);
            for (unsigned int l=1; l<ORDER; ++l) {
              f2 +=
                T_of_x[0][l] * W2[idxLhs][0][l-1] +
                T_of_x[1][l] * W2[idxLhs][1][l-1] +
                T_of_x[2][l] * W2[idxLhs][2][l-1]; // 6 flops
              for (unsigned int m=1; m<ORDER; ++m) {
                f4 +=
                  T_of_x[0][l] * T_of_x[1][m] * W4[idxLhs][0][(m-1)*(ORDER-1)+(l-1)] +
                  T_of_x[0][l] * T_of_x[2][m] * W4[idxLhs][1][(m-1)*(ORDER-1)+(l-1)] +
                  T_of_x[1][l] * T_of_x[2][m] * W4[idxLhs][2][(m-1)*(ORDER-1)+(l-1)]; // 9 flops
                for (unsigned int n=1; n<ORDER; ++n) {
                  f8 +=
                    T_of_x[0][l] * T_of_x[1][m] * T_of_x[2][n] *
                    W8[idxLhs][(n-1)*(ORDER-1)*(ORDER-1) + (m-1)*(ORDER-1) + (l-1)];
                } // ORDER * 4 flops
              } // ORDER * (9 + ORDER * 4) flops
            } // ORDER * (ORDER * (9 + ORDER * 4)) flops
          }
          targetValue = (f1[idxLhs] + FReal(2.)*f2 + FReal(4.)*f4 + FReal(8.)*f8) / nnodes; // 7 flops
959 960
        } // 7 + ORDER * (ORDER * (9 + ORDER * 4)) flops

961
          // set potential
962
        potentials[idxPart] += (targetValue);
963 964
      } // NLHS
    } // N * (7 + ORDER * (ORDER * (9 + ORDER * 4))) flops
965 966 967
}


968 969 970 971 972 973 974
//	FReal F2[3][ORDER-1];
//	FBlas::gemtv(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T), const_cast<FReal*>(localExpansion), F2[0]);
//	for (unsigned int i=1; i<ORDER*ORDER; ++i)
//		FBlas::gemtva(ORDER, ORDER-1, FReal(1.), const_cast<FReal*>(T),
//									const_cast<FReal*>(localExpansion) + ORDER*i, F2[0]);
//	for (unsigned int i=0; i<ORDER-1; ++i)
//		std::cout << W2[0][i] << "\t" << F2[0][i] << std::endl;
975

976 977
//	FReal F2[(ORDER-1) * ORDER*ORDER];
//	FBlas::gemtm(ORDER, ORDER-1, ORDER*ORDER, FReal(1.), const_cast<FReal*>(T), ORDER,
978
//                                                       const_cast<FReal*>(localExpansion), ORDER, F2, ORDER-1);
979 980 981 982 983
//	FReal F[ORDER-1]; FBlas::setzero(ORDER-1, F);
//	for (unsigned int i=0; i<ORDER-1; ++i)
//		for (unsigned int j=0; j<ORDER*ORDER; ++j) F[i] += F2[j*(ORDER-1) + i];
//	for (unsigned int i=0; i<ORDER-1; ++i)
//		std::cout << W2[0][i] << "\t" << F[i] << std::endl;
984 985


986 987 988 989 990 991
///**
// * Local to particle operation: application of \f$S_\ell(x,\bar x_m)\f$ (interpolation)
// */
//template <int ORDER>
//template <class ContainerClass>
//inline void FChebInterpolator<ORDER>::applyL2P(const FPoint& center,
992 993 994
//                                                                                                                                                                                       const FReal width,
//                                                                                                                                                                                       const FReal *const localExpansion,
//                                                                                                                                                                                       ContainerClass *const localParticles) const
995 996 997 998 999 1000 1001 1002 1003 1004 1005
//{
//	// allocate stuff
//	const map_glob_loc map(center, width);
//	FPoint localPosition;
//	FReal T_of_x[ORDER][3];
//	FReal xpx,ypy,zpz ;
//	FReal S[3],c1;
//	//
//	c1 = FReal(8.) / nnodes ;
//	typename ContainerClass::BasicIterator iter(*localParticles);
//	while(iter.hasNotFinished()){
1006
//
1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033
//		// map global position to [-1,1]
//		map(iter.data().getPosition(), localPosition); // 15 flops
//
//		// evaluate chebyshev polynomials of source particle: T_o(x_i)
//		T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
//		T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
//		T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
//		xpx = FReal(2.) * localPosition.getX() ; // 1 flop
//		ypy = FReal(2.) * localPosition.getY() ; // 1 flop
//		zpz = FReal(2.) * localPosition.getZ() ; // 1 flop
//		for (unsigned int o=2; o<ORDER; ++o) {
//			T_of_x[o][0] = xpx * T_of_x[o-1][0] - T_of_x[o-2][0]; // 2 flop
//			T_of_x[o][1] = ypy * T_of_x[o-1][1] - T_of_x[o-2][1]; // 2 flop
//			T_of_x[o][2] = zpz * T_of_x[o-1][2] - T_of_x[o-2][2]; // 2 flop
//		} // (ORDER-2) * 6 flops
//
//		// interpolate and increment target value
//		FReal targetValue = iter.data().getPotential();
//		for (unsigned int n=0; n<nnodes; ++n) {
//			const unsigned int j[3] = {node_ids[n][0], node_ids[n][1], node_ids[n][2]};
//			S[0] = T_of_x[1][0] * T_of_roots[1][j[0]]; // 1 flops
//			S[1] = T_of_x[1][1] * T_of_roots[1][j[1]]; // 1 flops
//			S[2] = T_of_x[1][2] * T_of_roots[1][j[2]]; // 1 flops
//			for (unsigned int o=2; o<ORDER; ++o) {
//				S[0] += T_of_x[o][0] * T_of_roots[o][j[0]]; // 2 flops
//				S[1] += T_of_x[o][1] * T_of_roots[o][j[1]]; // 2 flops
//				S[2] += T_of_x[o][2] * T_of_roots[o][j[2]]; // 2 flops
1034
//			} // (ORDER-2) * 6 flops
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
//			// gather contributions
//			S[0] += FReal(0.5); // 1 flops
//			S[1] += FReal(0.5); // 1 flops
//			S[2] += FReal(0.5); // 1 flops
//			targetValue	+= S[0] * S[1] * S[2] * localExpansion[n]; // 4 flops
//		} // ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6) flops
//		// scale
//		targetValue *= c1; // 1 flops
//
//		// set potential
//		iter.data().setPotential(targetValue);
//
//		// increment target iterator
//		iter.gotoNext();
//	} // N * ORDER*ORDER*ORDER * (10 + (ORDER-2) * 6) flops
//}
1051 1052 1053 1054 1055 1056






Matthias Messner's avatar
Matthias Messner committed
1057 1058 1059
/**
 * Local to particle operation: application of \f$\nabla_x S_\ell(x,\bar x_m)\f$ (interpolation)
 */
1060
template <int ORDER, class MatrixKernelClass>
Matthias Messner's avatar
Matthias Messner committed
1061
template <class ContainerClass>
1062 1063 1064 1065
inline void FChebInterpolator<ORDER,MatrixKernelClass>::applyL2PGradient(const FPoint& center,
                                                                         const FReal width,
                                                                         const FReal *const localExpansion,
                                                                         ContainerClass *const inParticles) const
Matthias Messner's avatar
Matthias Messner committed
1066
{
1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079
    ////////////////////////////////////////////////////////////////////
    // TENSOR-PRODUCT INTERPOLUTION NOT IMPLEMENTED YET HERE!!! ////////
    ////////////////////////////////////////////////////////////////////

    // setup local to global mapping
    const map_glob_loc map(center, width);
    FPoint Jacobian;
    map.computeJacobian(Jacobian);
    const FReal jacobian[3] = {Jacobian.getX(), Jacobian.getY(), Jacobian.getZ()};
    FPoint localPosition;
    FReal T_of_x[ORDER][3];
    FReal U_of_x[ORDER][3];
    FReal P[3];
Matthias Messner's avatar
Matthias Messner committed
1080

1081
//    const FReal*const physicalValues = inParticles->getPhysicalValues();
1082 1083 1084
    const FReal*const positionsX = inParticles->getPositions()[0];
    const FReal*const positionsY = inParticles->getPositions()[1];
    const FReal*const positionsZ = inParticles->getPositions()[2];
1085 1086 1087
//    FReal*const forcesX = inParticles->getForcesX();
//    FReal*const forcesY = inParticles->getForcesY();
//    FReal*const forcesZ = inParticles->getForcesZ();
1088 1089 1090
    //FReal*const potentials = inParticles->getPotentials();

    for(int idxPart = 0 ; idxPart < inParticles->getNbParticles() ; ++ idxPart){
1091 1092

        // map global position to [-1,1]
1093
        map(FPoint(positionsX[idxPart],positionsY[idxPart],positionsZ[idxPart]), localPosition);
1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122

        // evaluate chebyshev polynomials of source particle
        // T_0(x_i) and T_1(x_i)
        T_of_x[0][0] = FReal(1.);	T_of_x[1][0] = localPosition.getX();
        T_of_x[0][1] = FReal(1.);	T_of_x[1][1] = localPosition.getY();
        T_of_x[0][2] = FReal(1.);	T_of_x[1][2] = localPosition.getZ();
        // U_0(x_i) and U_1(x_i)
        U_of_x[0][0] = FReal(1.);	U_of_x[1][0] = localPosition.getX() * FReal(2.);
        U_of_x[0][1] = FReal(1.);	U_of_x[1][1] = localPosition.getY() * FReal(2.);
        U_of_x[0][2] = FReal(1.);	U_of_x[1][2] = localPosition.getZ() * FReal(2.);
        for (unsigned int o=2; o<ORDER; ++o) {
            // T_o(x_i)
            T_of_x[o][0] = FReal(2.)*localPosition.getX()*T_of_x[o-1][0] - T_of_x[o-2][0];
            T_of_x[o][1] = FReal(2.)*localPosition.getY()*T_of_x[o-1][1] - T_of_x[o-2][1];
            T_of_x[o][2] = FReal(2.)*localPosition.getZ()*T_of_x[o-1][2] - T_of_x[o-2][2];
            // U_o(x_i)
            U_of_x[o][0] = FReal(2.)*localPosition.getX()*U_of_x[o-1][0] - U_of_x[o-2][0];
            U_of_x[o][1] = FReal(2.)*localPosition.getY()*U_of_x[o-1][1] - U_of_x[o-2][1];
            U_of_x[o][2] = FReal(2.)*localPosition.getZ()*U_of_x[o-1][2] - U_of_x[o-2][2];
        }

        // scale, because dT_o/dx = oU_{o-1}
        for (unsigned int o=2; o<ORDER; ++o) {
            U_of_x[o-1][0] *= FReal(o);
            U_of_x[