Attention une mise à jour du service Gitlab va être effectuée le mardi 18 janvier (et non lundi 17 comme annoncé précédemment) entre 18h00 et 18h30. Cette mise à jour va générer une interruption du service dont nous ne maîtrisons pas complètement la durée mais qui ne devrait pas excéder quelques minutes.

testUnifTensorialInterpolator.cpp 24.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// ===================================================================================
// Ce LOGICIEL "ScalFmm" est couvert par le copyright Inria 20xx-2012.
// Inria détient tous les droits de propriété sur le LOGICIEL, et souhaite que
// la communauté scientifique l'utilise afin de le tester et de l'évaluer.
// Inria donne gracieusement le droit d'utiliser ce LOGICIEL. Toute utilisation
// dans un but lucratif ou à des fins commerciales est interdite sauf autorisation
// expresse et préalable d'Inria.
// Toute utilisation hors des limites précisées ci-dessus et réalisée sans l'accord
// expresse préalable d'Inria constituerait donc le délit de contrefaçon.
// Le LOGICIEL étant un produit en cours de développement, Inria ne saurait assurer
// aucune responsabilité et notamment en aucune manière et en aucun cas, être tenu
// de répondre d'éventuels dommages directs ou indirects subits par l'utilisateur.
// Tout utilisateur du LOGICIEL s'engage à communiquer à Inria ses remarques
// relatives à l'usage du LOGICIEL
// ===================================================================================

// ==== CMAKE =====
18
// @FUSE_FFT
19
// ================
20
21
// Keep in private GIT
// @SCALFMM_PRIVATE
22
23
24
25
26
27
28

#include <iostream>

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

29
30
31
#include "Utils/FTic.hpp"
#include "Utils/FMath.hpp"
#include "Utils/FBlas.hpp"
32

33
#include "Containers/FVector.hpp"
34

35
36
#include "Utils/FAssert.hpp"
#include "Utils/FPoint.hpp"
37

38
#include "Utils/FParameterNames.hpp"
39

40
41
42
#include "Kernels/Uniform/FUnifInterpolator.hpp"
#include "Kernels/Interpolation/FInterpMatrixKernel.hpp"
#include "Kernels/Uniform/FUnifTensor.hpp"
43
44

// Check DFT
45
#include "Utils/FComplex.hpp"
46
#include "Utils/FDft.hpp"
47
48


49
50
#include "Kernels/P2P/FP2PParticleContainer.hpp"
#include "Components/FSimpleLeaf.hpp"
51
52
53
54
55




/**
56
 * In this file we compute the interactions (direct and Unif FM-approximate) for
57
 * a tensorial interaction kernel (R_ij) as well as the forces (comparison
58
 * with direct computation using R_ijk kernel).
59
60
 */

61
62
63
int main(int argc, char ** argv){
    FHelpDescribeAndExit(argc, argv, "Test the uniform tensorial interpolator (only the code is interesting)");

64
65
    typedef double FReal;

66
    typedef FInterpMatrixKernel_R_IJ<FReal> MatrixKernelClass;
67
68
69
70
71
72
73
    const double a = 0.0; // core width (Beware! if diff from 0. then Kernel should be NON HOMOGENEOUS !!!)

    const unsigned int ncmp = MatrixKernelClass::NCMP;
    const unsigned int nrhs = MatrixKernelClass::NRHS;
    const unsigned int nlhs = MatrixKernelClass::NLHS;
    const unsigned int npot = MatrixKernelClass::NPOT;

74
    typedef FP2PParticleContainer<FReal,nrhs,nlhs> ContainerClass;
75
    typedef FSimpleLeaf<FReal, ContainerClass> LeafClass;
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

    ///////////////////////What we do/////////////////////////////
    std::cout << "\nTask: Compute interactions between source particles in leaf Y and target\n";
    std::cout << " particles in leaf X. Compare the fast summation K ~ Lx K Ly' with the\n";
    std::cout << " direct computation.\n" << std::endl;
    //////////////////////////////////////////////////////////////

    const FReal FRandMax = FReal(RAND_MAX);
    FTic time;


    // Leaf size
    FReal width = FReal(3.723);

    ////////////////////////////////////////////////////////////////////
    LeafClass X;
92
    FPoint<FReal> cx(0., 0., 0.);
93
94
95
96
97
98
99
100
101
102
    const unsigned long M = 5000;
    std::cout << "Fill the leaf X of width " << width
              << " centered at cx=" << cx << " with M=" << M << " target particles" << std::endl;
    {

        for(unsigned long i=0; i<M; ++i){
            FReal x = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getX();
            FReal y = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getY();
            FReal z = (FReal(rand())/FRandMax - FReal(.5)) * width + cx.getZ();
            // PB: need to know the actual value of NRHS (=3 here)
103
            X.push(FPoint<FReal>(x, y, z), FReal(rand())/FRandMax, FReal(rand())/FRandMax, FReal(rand())/FRandMax);
104
        }
105
106
107
    }


108
109
    ////////////////////////////////////////////////////////////////////
    LeafClass Y;
110
111
    FPoint<FReal> cy(FReal(2.)*width, 0., 0.);
    //FPoint<FReal> cy(FReal(2.)*width, FReal(2.)*width, 0.);
112
113
114
115
116
117
118
119
120
121

    const unsigned long N = 5000;
    std::cout << "Fill the leaf Y of width " << width
              << " centered at cy=" << cy	<< " with N=" << N << " source particles" << std::endl;
    {
        for(unsigned long i=0; i<N; ++i){
            FReal x = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getX();
            FReal y = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getY();
            FReal z = (FReal(rand())/FRandMax - FReal(.5)) * width + cy.getZ();
            // PB: need to know the actual value of NRHS (=3 here)
122
            Y.push(FPoint<FReal>(x, y, z), FReal(rand())/FRandMax, FReal(rand())/FRandMax, FReal(rand())/FRandMax);
123
        }
124
125
126
    }


127
128
129
130
131

    ////////////////////////////////////////////////////////////////////
    // approximative computation
    const unsigned int ORDER = 6;
    const unsigned int nnodes = TensorTraits<ORDER>::nnodes;
132
    typedef FUnifInterpolator<FReal,ORDER,MatrixKernelClass> InterpolatorClass;
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
    InterpolatorClass S;
    MatrixKernelClass MatrixKernel;

    std::cout << "\nCompute interactions approximatively, interpolation order = " << ORDER << " ..." << std::endl;

    std::cout << "\nP2M ... " << std::flush;
    time.tic();
    // Anterpolate: W_n = \sum_j^N S(y_j,\bar y_n) * w_j
    FReal W[nrhs*nnodes]; // multipole expansion
    // tensorial case interpolate same Y for each component
    S.applyP2M(cy, width, W, Y.getSrc()); // the multipole expansions are set to 0 in S.applyP2M
    std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;

    std::cout << "M2L ... " << std::flush;
    time.tic();
    // Multipole to local: F_m = \sum_n^L K(\bar x_m, \bar y_n) * W_n
149
    FPoint<FReal> rootsX[nnodes], rootsY[nnodes];
150
151
    FUnifTensor<FReal,ORDER>::setRoots(cx, width, rootsX);
    FUnifTensor<FReal,ORDER>::setRoots(cy, width, rootsY);
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166

    FReal F[nlhs*nnodes]; // local expansion
    for (unsigned int i=0; i<nnodes*nlhs; ++i) F[i] = FReal(0.);

    for (unsigned int i=0; i<nnodes; ++i) {
        for (unsigned int j=0; j<nnodes; ++j){

            for (unsigned int idxLhs=0; idxLhs<nlhs; ++idxLhs){
                unsigned int idxRhs = idxLhs % npot;
                unsigned int d = MatrixKernel.getPosition(idxLhs);

                F[i+idxLhs*nnodes] += MatrixKernelClass(a,d).evaluate(rootsX[i], rootsY[j]) * W[j+idxRhs*nnodes];

            }
        }
167
    }
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
    std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;

    //  std::cout<< "F via direct applyM2L: "<<std::endl;
    //  for (unsigned int d=0; d<nlhs; ++d){
    //    for (unsigned int i=0; i<nnodes; ++i)
    //      std::cout<< F[i+d*nnodes] << ", ";
    //    std::cout<<std::endl;
    //  }
    //  std::cout<<std::endl;

    ////////////////////////////////////////////////////////////////////////////
    // Store M2L in K and apply K
    FReal K[ncmp*nnodes*nnodes]; // local expansion
    for (unsigned int i=0; i<nnodes; ++i) {
        for (unsigned int j=0; j<nnodes; ++j){

            for (unsigned int d=0; d<ncmp; ++d){
                K[d*nnodes*nnodes+i*nnodes+j] = MatrixKernelClass(a,d).evaluate(rootsX[i], rootsY[j]);
            }
187

188
        }
189
    }
190
191
192
    std::cout<< "Apply M2L in usual sense: ";
    time.tic();
    for (unsigned int i=0; i<nnodes*nlhs; ++i) F[i] = FReal(0.);
193

194
195
    for (unsigned int i=0; i<nnodes; ++i)
        for (unsigned int j=0; j<nnodes; ++j){
196

197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
            for (unsigned int idxLhs=0; idxLhs<nlhs; ++idxLhs){
                unsigned int idxRhs = idxLhs % npot;
                unsigned int d = MatrixKernel.getPosition(idxLhs);

                F[i+idxLhs*nnodes] += K[d*nnodes*nnodes+i*nnodes+j] * W[j+idxRhs*nnodes];

            }
        }

    time.tac();
    std::cout << "took " << time.elapsed() << "sec." << std::endl;

    //  std::cout<< "F via store and applyM2L: "<<std::endl;
    //  for (unsigned int d=0; d<nlhs; ++d){
    //    for (unsigned int i=0; i<nnodes; ++i)
    //      std::cout<< F[i+d*nnodes] << ", ";
    //    std::cout<<std::endl;
    //  }
    //  std::cout<<std::endl;

    /////////////////////////////////////////////////////////////////////////////////////
    // PB: Verify storage improvement works (indexing etc...)
    // 1) store circulant matrix
    const unsigned int rc = (2*ORDER-1)*(2*ORDER-1)*(2*ORDER-1);
    FReal C[ncmp*rc];

223
    typedef FUnifTensor<FReal,ORDER> TensorType;
224
225
226
227
228
229
230
231
232
233
    unsigned int node_diff[nnodes*nnodes];
    TensorType::setNodeIdsDiff(node_diff);
    unsigned int node_ids_pairs[rc][2];
    TensorType::setNodeIdsPairs(node_ids_pairs);

    unsigned int ido=0;

    for(unsigned int l=0; l<2*ORDER-1; ++l)
        for(unsigned int m=0; m<2*ORDER-1; ++m)
            for(unsigned int n=0; n<2*ORDER-1; ++n){
234

235
                for (unsigned int d=0; d<ncmp; ++d){
236

237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
                    C[d*rc + ido]
                            = MatrixKernelClass(a,d).evaluate(rootsX[node_ids_pairs[ido][0]],
                            rootsY[node_ids_pairs[ido][1]]);
                }

                ido++;
            }

    //  // Display C (gathers every values of K that need to be stored,
    //  // corresponds to the first line of the padded matrix (reverse order?))
    //  std::cout<<"C="<<std::endl;
    //    for (unsigned int d=0; d<ncmp; ++d){
    //      for (unsigned int n=0; n<rc; ++n)
    //        std::cout<< C[n + d*rc] << ", ";
    //      std::cout<<std::endl;
    //    }
    //  std::cout<<std::endl;

    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // K is a block Toeplitz matrix
    // i.e. a blockwise Toeplitz matrix where the block also have the Toeplitz structure.
    // e.g. for ORDER=3: K=[K_{1,1} K_{1,2} K_{1,3},  where K_{i,j}=[k11 k12 k13,
    //                      K_{2,1} K_{1,1} K_{1,2},                 k21 k11 k12,
    //                      K_{3,1} K_{2,1} K_{1,1}];                k31 k21 k11];
    // K is of size order^3 x order^3
    // (i.e. order^2 x order^2 Toeplitz blocks of size order x order),
    // K is very close to be Toeplitz itself and even circulant.
    // In order to actually embed K into a circulant matrix C one just
    // needs to insert (ORDER-1) extra lines/columns (to each block).

    //  std::cout<< "K=" <<std::endl;
    //  for (unsigned int i=0; i<nnodes; ++i){
    //    for (unsigned int j=0; j<nnodes; ++j){
    //      std::cout<< K[i*nnodes+j]<<", ";
    //    }
    //    std::cout<<std::endl;
    //  }
    //  std::cout<<std::endl;

    //  // Check matrix node_diff
    //  std::cout<< "node_diff=" <<std::endl;
    //  for (unsigned int i=0; i<nnodes; ++i){
    //    for (unsigned int j=0; j<nnodes; ++j){
    //      std::cout<< node_diff[i*nnodes+j] <<", ";
    //    }
    //    std::cout<<std::endl;
    //  }
    //  std::cout<<std::endl;

    //  // Expected ido for the (2*ORDER-1)^3x(2*ORDER-1)^3 circulant matrix
    //  for (unsigned int i=0; i<rc; ++i){
    //    for (unsigned int j=0; j<rc; ++j){
    //      if(i>j) std::cout<< i-j-1 << ", ";
    //      else std::cout<< rc+i-j-1 << ", ";
    //    } std::cout<<std::endl;
    //  } std::cout<<std::endl;

    //  // kernel evaluated at previous ido returns a circulant matrix
    //  for (unsigned int i=0; i<rc/2; ++i){
    //    for (unsigned int j=0; j<rc/2; ++j){
    //      if(i>j) std::cout<< C[i-j-1] << ", ";
    //      else std::cout<< C[rc+i-j-1] << ", ";
    //    } std::cout<<std::endl;
    //  } std::cout<<std::endl;

    // In 1D the Zero Padding consists in
    // inserting ORDER-1 zeros in the multipole exp
    // in order to apply the (ORDER+ORDER-1)x(ORDER+ORDER-1)
    // circulant matrix to it.
    // Let us extend it to the 3D case:
    FReal MultExp[nrhs*nnodes]; FReal PaddedMultExp[nrhs*rc];
    for (unsigned int i=0; i<nrhs*nnodes; ++i) MultExp[i]=W[i];
    FReal LocalExp[nlhs*nnodes]; FReal PaddedLocalExp[nlhs*rc];
    FBlas::setzero(nlhs*nnodes,LocalExp);
    FBlas::setzero(nlhs*rc,PaddedLocalExp);

    //  std::cout<< "Expected LocalExp: "<<std::endl;
    //  for (unsigned int d=0; d<nlhs; ++d){
    //    for (unsigned int i=0; i<nnodes; ++i)
    //      std::cout<< F[i+d*nnodes] << ", ";
    //    std::cout<<std::endl;
    //  }

    /////////////////////////////////////////////////////////////////////////////////////
    // Application of circulant Toeplitz system in PHYSICAL SPACE
    std::cout<< "Apply circulant M2L in Physical space: ";
    time.tic();
    for (unsigned int i=0; i<nnodes; ++i){

        // Pad Multipole Expansion with respect to current row
        FBlas::setzero(nrhs*rc,PaddedMultExp);
        for (unsigned int j=0; j<nnodes; ++j)
            for (unsigned int d=0; d<nrhs; ++d)
                PaddedMultExp[node_diff[i*nnodes+j] + d*rc]=MultExp[j + d*nnodes];

        //    std::cout<< "Padded MultExp for row i=" << i << ": "<<std::endl;
        //    for (unsigned int p=0; p<rc; ++p)
        //      std::cout<< PaddedMultExp[p] << ", ";
        //    std::cout<<std::endl;

        // Application of M2L in PHYSICAL SPACE
        for (unsigned int pj=0; pj<rc; ++pj)
            for (unsigned int idxLhs=0; idxLhs<nlhs; ++idxLhs){
                unsigned int idxRhs = idxLhs % npot;
                unsigned int d = MatrixKernel.getPosition(idxLhs);

                LocalExp[i + idxLhs*nnodes]+=C[pj + d*rc]*PaddedMultExp[pj + idxRhs*rc];

            }

    }// end i
    time.tac();
    std::cout << "took " << time.elapsed() << "sec." << std::endl;

    //  std::cout<< "LocalExp via product in PHYSICAL SPACE: "<<std::endl;
    //  for (unsigned int d=0; d<nlhs; ++d){
    //    for (unsigned int i=0; i<nnodes; ++i)
    //      std::cout<< LocalExp[i+d*nnodes] << ", ";
    //    std::cout<<std::endl;
    //  }
    //  std::cout<<std::endl;

    /////////////////////////////////////////////////////////////////////////////////////
    // Efficient application of the Toeplitz system in FOURIER SPACE

    // Init DFT
    const int dimfft = 1;
    //FDft Dft(rc); // direct version
365
    FFftw<FReal,FComplex<FReal>,dimfft> Dft(rc); // fast version
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384

    // Get first COLUMN of K and Store in T
    FReal T[ncmp*rc];
    // use permutations
    unsigned int perm[rc];
    for(unsigned int p=0; p<rc; ++p){
        if(p>0) perm[p]=p-1;
        else perm[p]=rc+p-1;
        //    std::cout << "perm["<< p << "]="<< perm[p] << std::endl;
    }

    for (unsigned int i=0; i<rc; ++i){
        // keep this lines commented to see what permutation accounts for:
        //  for (unsigned int j=0; j<rc; ++j){
        //      if(i>0) T[i]=C[i-0-1];
        //      else T[i]=C[rc+i-0-1];
        for (unsigned int d=0; d<ncmp; ++d)
            T[i + d*rc]=C[perm[i] + d*rc];
        //  }
385
    }
386
387
388
389
390
391
392

    //  std::cout<< "First column of C[rc x rc]: "<<std::endl;
    //  for (unsigned int p=0; p<rc; ++p)
    //    std::cout<< T[p] << ", ";
    //  std::cout<<std::endl;

    // Apply DFT to T
393
394
    FComplex<FReal> FT[ncmp*rc];
    //  for (unsigned int n=0; n<rc; ++n) FT[n]=FComplex<FReal>(0.0,0.0);
395
396
397
398
399
400
401
402
    FBlas::c_setzero(ncmp*rc,reinterpret_cast<FReal*>(FT));

    // if first COLUMN (T) of C is used
    for (unsigned int d=0; d<ncmp; ++d)
        Dft.applyDFT(T+d*rc,FT+d*rc);
    //  // if first ROW of C is used
    //  Dft.applyDFT(C,FT);

403
404
    FComplex<FReal> FPMultExp[nrhs*rc];
    FComplex<FReal> FPLocalExp[nlhs*rc];
405
406
    FReal PLocalExp[nlhs*rc];

407
    //for (unsigned int n=0; n<rc; ++n) FPLocalExp[n]=FComplex<FReal>(0.0,0.0);
408
409
410
411
412
413
414
415
416
417
418
419
    FBlas::c_setzero(nlhs*rc,reinterpret_cast<FReal*>(FPLocalExp));

    FBlas::setzero(nlhs*rc,PLocalExp);

    // Pad physical MultExp
    FBlas::setzero(nrhs*rc,PaddedMultExp); //part of padding
    for (unsigned int idRhs=0; idRhs<nrhs; ++idRhs)
        for (unsigned int j=0; j<nnodes; ++j){
            // if first COLUMN (T) of C is used
            PaddedMultExp[node_diff[j*nnodes]+idRhs*rc]=MultExp[j+idRhs*nnodes];
            //    // if first ROW of C is used
            //    PaddedMultExp[node_diff[j]]=MultExp[j];
420
        }
421
422
423
424
425
426
427
428

    //    std::cout<< "Padded MultExp: "<<std::endl;
    //    for (unsigned int p=0; p<rc; ++p)
    //      std::cout<< PaddedMultExp[p] << ", ";
    //    std::cout<<std::endl;


    // Set transformed MultExp to 0
429
    //  for (unsigned int n=0; n<rc; ++n) FPMultExp[n]=FComplex<FReal>(0.0,0.0);
430
431
432
433
434
435
436
437
438
439
    FBlas::c_setzero(nrhs*rc,reinterpret_cast<FReal*>(FPMultExp));

    // Transform PaddedMultExp
    for (unsigned int idxRhs=0; idxRhs<nrhs; ++idxRhs) // apply nrhs 1 dimensionnal FFT
        Dft.applyDFT(PaddedMultExp+idxRhs*rc,FPMultExp+idxRhs*rc);

    std::cout<< "Apply M2L in Fourier space: ";
    time.tic();

    // Application of M2L in FOURIER SPACE
440
    FComplex<FReal> tmpFX;
441
    for (unsigned int idxLhs=0; idxLhs<nlhs; ++idxLhs){
442
443
        unsigned int idxRhs = idxLhs % npot;
        unsigned int d = MatrixKernel.getPosition(idxLhs);
444

445
446
447
448
449
450
        for (unsigned int pj=0; pj<rc; ++pj){
            tmpFX=FT[pj + d*rc];
            tmpFX*=FPMultExp[pj+idxRhs*rc];
            FPLocalExp[pj+idxLhs*rc]+=tmpFX; // add new contribution +RijYj
        }

451
    }
452
453
454
455
456
457
458
459
460
    time.tac();
    std::cout << "took " << time.elapsed() << "sec." << std::endl;

    //    std::cout<< "Transfo Padded LocalExp: "<<std::endl;
    //    for (unsigned int p=0; p<rc; ++p)
    //      std::cout<< FPLocalExp[p] << ", ";
    //    std::cout<<std::endl;

    for (unsigned int idxLhs=0; idxLhs<nlhs; ++idxLhs) // apply nrhs 1 dimensionnal FFT
461
        Dft.applyIDFTNorm(FPLocalExp+idxLhs*rc,PLocalExp+idxLhs*rc);
462
463
464
465
466
467
468
469
470
471
472
473
474
475

    //  std::cout<< "Padded LocalExp: "<<std::endl;
    //  for (unsigned int p=0; p<rc; ++p)
    //    std::cout<< PLocalExp[p] << ", ";
    //  std::cout<<std::endl;

    // Unpad
    for (unsigned int idLhs=0; idLhs<nlhs; ++idLhs)
        for (unsigned int j=0; j<nnodes; ++j){
            // if first COLUMN (T) of C is used
            LocalExp[j+idLhs*nnodes]=PLocalExp[node_diff[nnodes-j-1]+idLhs*rc];
            //    // if first ROW of C is used
            //    LocalExp[j]=PLocalExp[node_diff[j*nnodes]];
        }
476

477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
    //  std::cout<< "Mask to be applied to Padded LocalExp: "<<std::endl;
    //  for (unsigned int j=0; j<nnodes; ++j)
    //    std::cout<< node_diff[nnodes-j-1] << ", ";
    //  std::cout<<std::endl;

    //  std::cout<< "LocalExp via product in FOURIER SPACE: "<<std::endl;
    //  for (unsigned int d=0; d<nlhs; ++d){
    //    for (unsigned int p=0; p<nnodes; ++p)
    //      std::cout<< LocalExp[p + d*nnodes] << ", ";
    //    std::cout<<std::endl;
    //  }
    //  std::cout<<std::endl;


    /////////////////////////////////////////////////////////////////////////////////////

    std::cout << "L2P (potential) ... " << std::flush;
    time.tic();
    // Interpolate p_i = \sum_m^L S(x_i,\bar x_m) * F_m
    S.applyL2P(cx, width, F, X.getTargets());
    std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;

    std::cout << "L2P (forces) ... " << std::flush;
    time.tic();
    // Interpolate f_i = \sum_m^L P(x_i,\bar x_m) * F_m
    S.applyL2PGradient(cx, width, F, X.getTargets());
    std::cout << "took " << time.tacAndElapsed() << "s" << std::endl;

    ////////////////////////////////////////////////////////////////////
    // direct computation
    std::cout << "Compute interactions directly ..." << std::endl;
    time.tic();

    FReal** approx_f = new FReal* [npot];
    FReal**        f = new FReal* [npot];
    for (unsigned int i=0; i<npot; ++i){
        approx_f[i] = new FReal [M * 3];
        f[i] = new FReal [M * 3];
        FBlas::setzero(M*3, f[i]);
516
    }
517

518
519
520
521
522
523
    FReal** approx_p = new FReal* [npot];
    FReal**        p = new FReal* [npot];
    for (unsigned int i=0; i<npot; ++i){
        approx_p[i] = new FReal [M];
        p[i] = new FReal [M];
        FBlas::setzero(M, p[i]);
524
525
    }

526
527
528
    { // start direct computation
        unsigned int counter = 0;

529
        for(FSize idxPartX = 0 ; idxPartX < X.getSrc()->getNbParticles() ; ++idxPartX){
530
            const FPoint<FReal> x = FPoint<FReal>(X.getSrc()->getPositions()[0][idxPartX],
531
532
533
534
535
536
                    X.getSrc()->getPositions()[1][idxPartX],
                    X.getSrc()->getPositions()[2][idxPartX]);
            const FReal wx[nrhs] = {X.getSrc()->getPhysicalValues(0)[idxPartX],
                                    X.getSrc()->getPhysicalValues(1)[idxPartX],
                                    X.getSrc()->getPhysicalValues(2)[idxPartX]};

537
            for(FSize idxPartY = 0 ; idxPartY < Y.getSrc()->getNbParticles() ; ++idxPartY){
538
                const FPoint<FReal> y = FPoint<FReal>(Y.getSrc()->getPositions()[0][idxPartY],
539
540
541
542
543
544
545
546
547
548
549
550
                        Y.getSrc()->getPositions()[1][idxPartY],
                        Y.getSrc()->getPositions()[2][idxPartY]);
                const FReal wy[nrhs] = {Y.getSrc()->getPhysicalValues(0)[idxPartY],
                                        Y.getSrc()->getPhysicalValues(1)[idxPartY],
                                        Y.getSrc()->getPhysicalValues(2)[idxPartY]};

                //        // 1/R
                //        const FReal one_over_r = MatrixKernel.evaluate(x, y);
                //
                //        // potential
                //        p[counter] += one_over_r * wy;
                //        // force
551
                //        FPoint<FReal> force(y - x);
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
                //        force *= one_over_r*one_over_r*one_over_r;
                //        f[counter*3 + 0] += force.getX() * wx * wy;
                //        f[counter*3 + 1] += force.getY() * wx * wy;
                //        f[counter*3 + 2] += force.getZ() * wx * wy;

                FReal rij[ncmp];
                FReal rijk[ncmp][3];
                MatrixKernel.evaluateBlockAndDerivative(x,y,rij,rijk);

                // R,ij and (R,ij),k
                for (unsigned int i=0; i<npot; ++i) // sum all compo
                    for (unsigned int j=0; j<nrhs; ++j){
                        unsigned int d = MatrixKernel.getPosition(i*nrhs+j);
                        // potential
                        p[i][counter] += rij[d] * wy[j];
                        // force
                        FReal force_k;
                        for (unsigned int k=0; k<3; ++k){
                            // Convention in matrix kernel: R_ij(x-y), while R_ijk(y-x)
                            force_k = FReal(1.) * rijk[d][k];
                            f[i][counter*3 + k] += force_k * wx[j] * wy[j];
                        }
                    }
575
576

            }
577
578
579
            counter++;
        }
    } // end direct computation
580
581


582
583
    time.tac();
    std::cout << "Done in " << time.elapsed() << "sec." << std::endl;
584
585


586
587
    ////////////////////////////////////////////////////////////////////
    unsigned int counter = 0;
588
    for(FSize idxPartX = 0 ; idxPartX < X.getSrc()->getNbParticles() ; ++idxPartX){
589
590
        for (unsigned int i=0; i<npot; ++i){
            approx_p[i][counter] = X.getSrc()->getPotentials(i)[idxPartX];
591
            const FPoint<FReal> force = FPoint<FReal>(X.getSrc()->getForcesX(i)[idxPartX],
592
593
594
595
596
597
598
599
                                        X.getSrc()->getForcesY(i)[idxPartX],
                                        X.getSrc()->getForcesZ(i)[idxPartX]);
            approx_f[i][counter*3 + 0] = force.getX();
            approx_f[i][counter*3 + 1] = force.getY();
            approx_f[i][counter*3 + 2] = force.getZ();
        }
        counter++;
    }
600

601
602
603
    //  std::cout << "Check Potential, forceX, forceY, forceZ " << std::endl;
    //  for (unsigned int i=0; i<npot; ++i){
    //    std::cout<< "idxLhs="<< i << std::endl;
604
    //    for(FSize idxPart = 0 ; idxPart < 20 ; ++idxPart){
605
606
607
608
609
610
611
612
613
614
615
616
617
618
    //      std::cout << approx_p[i][idxPart]     << ", "<< p[i][idxPart] << "|| ";
    //      std::cout << approx_f[i][idxPart]     << ", "<< f[i][idxPart] << "|| ";
    //      std::cout << approx_f[i][idxPart+M]   << ", "<< f[i][idxPart+M] << "|| ";
    //      std::cout << approx_f[i][idxPart+2*M] << ", "<< f[i][idxPart+2*M] << "|| ";
    //      std::cout << std::endl;
    //    }
    //    std::cout << std::endl;
    //  }
    //  std::cout << std::endl;

    std::cout << "\nRelative Inf/L2 errors: " << std::endl;
    std::cout << "  Potential:" << std::endl;
    for(unsigned i = 0; i<npot;++i) {
        std::cout << "    " << i << ": "
619
620
                  << FMath::FAccurater<FReal>(p[i],approx_p[i],M).getRelativeInfNorm()<<", "
                  << FMath::FAccurater<FReal>(p[i],approx_p[i],M).getRelativeL2Norm()
621
                  << std::endl;
622
    }
623
624
625
626
627
    std::cout << std::endl;

    std::cout << "  Force:" << std::endl;
    for(unsigned i = 0; i<npot;++i) {
        std::cout << "    " << i << ": "
628
629
                  << FMath::FAccurater<FReal>(f[i],approx_f[i],3*M).getRelativeInfNorm()<<", "
                  << FMath::FAccurater<FReal>(f[i],approx_f[i],3*M).getRelativeL2Norm()
630
631
632
633
634
635
636
637
638
639
640
641
                  << std::endl;
    }
    std::cout << std::endl;

    // free memory
    delete [] approx_p;
    delete [] p;
    delete [] approx_f;
    delete [] f;


    return 0;
642
}