Commit 3eca64c2 authored by Matthias Messner's avatar Matthias Messner

added partially pivoted ACA

parent f63e802b
......@@ -5,6 +5,8 @@
#include "../../Utils/FPoint.hpp"
#include "../../Utils/FNoCopyable.hpp"
#include "../../Utils/FMath.hpp"
#include "../../Utils/FBlas.hpp"
// extendable
enum KERNEL_FUNCCTION_IDENTIFIER {ONE_OVER_R,
......@@ -126,6 +128,73 @@ struct FChebMatrixKernelLJ : FChebAbstractMatrixKernel
/*! Functor which provides the interface to assemble a matrix based on the
number of rows and cols and on the coordinates x and y and the type of the
generating matrix-kernel function.
*/
template <typename MatrixKernel>
class EntryComputer
{
const MatrixKernel Kernel;
const unsigned int nx, ny;
const FPoint *const px, *const py;
const FReal *const weights;
public:
explicit EntryComputer(const unsigned int _nx, const FPoint *const _px,
const unsigned int _ny, const FPoint *const _py,
const FReal *const _weights = NULL)
: Kernel(), nx(_nx), ny(_ny), px(_px), py(_py), weights(_weights) {}
// template <typename Point>
// void operator()(const unsigned int nx, const Point *const px,
// const unsigned int ny, const Point *const py,
// FReal *const data) const
// {
// for (unsigned int j=0; j<ny; ++j)
// for (unsigned int i=0; i<nx; ++i)
// data[j*nx + i] = Kernel.evaluate(px[i], py[j]);
// }
void operator()(const unsigned int xbeg, const unsigned int xend,
const unsigned int ybeg, const unsigned int yend,
FReal *const data) const
{
unsigned int idx = 0;
if (weights) {
for (unsigned int j=ybeg; j<yend; ++j)
for (unsigned int i=xbeg; i<xend; ++i)
data[idx++] = weights[i] * weights[j] * Kernel.evaluate(px[i], py[j]);
} else {
for (unsigned int j=ybeg; j<yend; ++j)
for (unsigned int i=xbeg; i<xend; ++i)
data[idx++] = Kernel.evaluate(px[i], py[j]);
}
/*
// apply weighting matrices
if (weights) {
if ((xend-xbeg) == (yend-ybeg) && (xend-xbeg) == nx)
for (unsigned int n=0; n<nx; ++n) {
FBlas::scal(nx, weights[n], data + n, nx); // scale rows
FBlas::scal(nx, weights[n], data + n * nx); // scale cols
}
else if ((xend-xbeg) == 1 && (yend-ybeg) == ny)
for (unsigned int j=0; j<ny; ++j) data[j] *= weights[j];
else if ((yend-ybeg) == 1 && (xend-xbeg) == nx)
for (unsigned int i=0; i<nx; ++i) data[i] *= weights[i];
}
*/
}
};
#endif // FCHEBMATRIXKERNEL_HPP
// [--END--]
......@@ -208,82 +208,6 @@ public:
}
// template <KERNEL_FUNCTION_TYPE TYPE>
// struct BlockedOuterProduct;
//
// template<>
// struct BlockedOuterProduct<HOMOGENEOUS>
// {
// static void apply() const
// {
// FReal Compressed [nnodes * 24];
// const FReal CellWidth(AbstractBaseClass::BoxWidth / FReal(FMath::pow(2, TreeLevel)));
// const FReal scale(AbstractBaseClass::MatrixKernel->getScaleFactor(CellWidth));
// for (unsigned int pidx=0; pidx<343; ++pidx) {
// const unsigned int count = countExp[pidx];
// if (count) {
// const unsigned int rank = SymHandler->LowRank[pidx];
// // rank * count * (2*nnodes-1) flops
// FBlas::gemtm(nnodes, rank, count, FReal(1.),
// SymHandler->K[pidx]+rank*nnodes, nnodes, Mul[pidx], nnodes, Compressed, rank);
// // nnodes *count * (2*rank-1) flops
// FBlas::gemm( nnodes, rank, count, scale,
// SymHandler->K[pidx], nnodes, Compressed, rank, Loc[pidx], nnodes);
// }
// }
// }
// };
//template <class ParticleClass, class CellClass, class ContainerClass, class MatrixKernelClass, int ORDER>
//template<>
//void FChebSymKernel::BlockedOuterProduct<NON_HOMOGENEOUS>()
//{
// FReal Compressed [nnodes * 24];
// for (unsigned int pidx=0; pidx<343; ++pidx) {
// const unsigned int count = countExp[pidx];
// if (count) {
// const unsigned int rank = SymHandler->LowRank[pidx];
// // rank * count * (2*nnodes-1) flops
// FBlas::gemtm(nnodes, rank, count, FReal(1.),
// SymHandler->K[pidx]+rank*nnodes, nnodes, Mul[pidx], nnodes, Compressed, rank);
// // nnodes *count * (2*rank-1) flops
// FBlas::gemm( nnodes, rank, count, FReal(1.),
// SymHandler->K[pidx], nnodes, Compressed, rank, Loc[pidx], nnodes);
// }
// }
//}
//template <KERNEL_FUNCTION_TYPE TYPE>
//struct blocked_outer_product { static void apply(const int TreeLevel); };
//
//template <>
//struct blocked_outer_product<HOMOGENEOUS>
//{
// static void apply(const int TreeLevel)
// {
// FReal Compressed [nnodes * 24];
// const FReal CellWidth(AbstractBaseClass::BoxWidth / FReal(FMath::pow(2, TreeLevel)));
// const FReal scale(AbstractBaseClass::MatrixKernel->getScaleFactor(CellWidth));
// for (unsigned int pidx=0; pidx<343; ++pidx) {
// const unsigned int count = countExp[pidx];
// if (count) {
// const unsigned int rank = SymHandler->LowRank[pidx];
// // rank * count * (2*nnodes-1) flops
// FBlas::gemtm(nnodes, rank, count, FReal(1.),
// SymHandler->K[pidx]+rank*nnodes, nnodes, Mul[pidx], nnodes, Compressed, rank);
// // nnodes *count * (2*rank-1) flops
// FBlas::gemm( nnodes, rank, count, scale,
// SymHandler->K[pidx], nnodes, Compressed, rank, Loc[pidx], nnodes);
// }
// }
// }
//};
//
//template <>
//struct blocked_outer_product<NON_HOMOGENEOUS>
//{
// static void apply(const int TreeLevel) {}
//};
/*
void M2L(CellClass* const FRestrict TargetCell,
......
......@@ -15,17 +15,21 @@
*/
/*!
If \a FACASVD is defined ACA+SVD will be used instead of only SVD.
/*! Choose either \a FULLY_PIVOTED_ACASVD or \a PARTIALLY_PIVOTED_ACASVD or
\a ONLY_SVD.
*/
#define FACASVD
//#define ONLY_SVD
//#define FULLY_PIVOTED_ACASVD
#define PARTIALLY_PIVOTED_ACASVD
/*! The fully pivoted adaptive cross approximation (fACA) compresses a
far-field interaction as \f$K\sim UV^\top\f$. The fACA requires all entries
to be computed beforehand, then the compression follows in
\f$\mathcal{O}(2\ell^3k)\f$ operations based on the required accuracy
\f$\varepsilon\f$.
\f$\varepsilon\f$. The matrix K will be destroyed as a result.
@param[in] K far-field to be approximated
@param[in] eps prescribed accuracy
......@@ -113,6 +117,116 @@ static void fACA(FReal *const K, const double eps, FReal* &U, FReal* &V, unsigne
}
/*! The partially pivoted adaptive cross approximation (pACA) compresses a
far-field interaction as \f$K\sim UV^\top\f$. The pACA computes the matrix
entries on the fly, as they are needed. The compression follows in
\f$\mathcal{O}(2\ell^3k)\f$ operations based on the required accuracy
\f$\varepsilon\f$. The matrix K will be destroyed as a result.
@tparam ComputerType the functor type which allows to compute matrix entries
@param[in] Computer the entry-computer functor
@param[in] eps prescribed accuracy
@param[in] nx number of rows
@param[in] ny number of cols
@param[out] U matrix containing \a k column vectors
@param[out] V matrix containing \a k row vectors
@param[out] k final low-rank depends on prescribed accuracy \a eps
*/
template <typename ComputerType>
void pACA(const ComputerType& Computer, const FReal eps,
const unsigned int nx, const unsigned int ny,
FReal* &U, FReal* &V, unsigned int &k)
{
// control vectors (true if not used, false if used)
bool *const r = new bool[nx];
bool *const c = new bool[ny];
for (unsigned int i=0; i<nx; ++i) r[i] = true;
for (unsigned int j=0; j<ny; ++j) c[j] = true;
// initialize rank k and UV'
k = 0;
const int maxk = (nx + ny) / 2;
U = new FReal[nx * maxk];
V = new FReal[ny * maxk];
// initialize norm
FReal norm2S(0.);
FReal norm2uv(0.);
////////////////////////////////////////////////
// start partially pivoted ACA
unsigned int J = 0, I = 0;
do {
FReal *const colU = U + nx*k;
FReal *const colV = V + ny*k;
////////////////////////////////////////////
// compute row I and its residual
Computer(I, I+1, 0, ny, colV);
r[I] = false;
for (unsigned int l=0; l<k; ++l) {
FReal *const u = U + nx*l;
FReal *const v = V + ny*l;
for (unsigned int j=0; j<ny; ++j)
colV[j] -= u[I] * v[j];
}
// find max of residual and argmax
FReal maxval = 0.;
for (unsigned int j=0; j<ny; ++j)
if (c[j] && maxval < FMath::Abs(colV[j])) {
maxval = FMath::Abs(colV[j]);
J = j;
}
// scale pivot v
const FReal pivot = colV[J];
for (unsigned int j=0; j<ny; ++j) colV[j] /= pivot;
////////////////////////////////////////////
// compute col J and its residual
Computer(0, nx, J, J+1, colU);
c[J] = false;
for (unsigned int l=0; l<k; ++l) {
FReal *const u = U + nx*l;
FReal *const v = V + ny*l;
for (unsigned int i=0; i<nx; ++i)
colU[i] -= u[i] * v[J];
}
// find max of residual and argmax
maxval = 0.;
for (unsigned int i=0; i<nx; ++i)
if (r[i] && maxval < FMath::Abs(colU[i])) {
maxval = FMath::Abs(colU[i]);
I = i;
}
////////////////////////////////////////////
// increment Frobenius norm: |Sk|^2 += |uk|^2 |vk|^2 + 2 sumj ukuj vjvk
FReal normuuvv(0.);
for (unsigned int l=0; l<k; ++l)
normuuvv += FBlas::scpr(nx, colU, U + nx*l) * FBlas::scpr(ny, V + ny*l, colV);
norm2uv = FBlas::scpr(nx, colU, colU) * FBlas::scpr(ny, colV, colV);
norm2S += norm2uv + 2*normuuvv;
////////////////////////////////////////////
// increment low-rank
k++;
} while (norm2uv > eps*eps * norm2S);
delete [] r;
delete [] c;
}
/*! Precomputes the 16 far-field interactions (due to symmetries in their
arrangement all 316 far-field interactions can be represented by
permutations of the 16 we compute in this function). Depending on whether
......@@ -139,19 +253,34 @@ static void precompute(const MatrixKernelClass *const MatrixKernel, const FReal
FReal *const WORK = new FReal [LWORK];
FReal *const VT = new FReal [nnodes*nnodes];
FReal *const S = new FReal [nnodes];
// initialize timer
FTic time;
FReal overall_time(0.);
FReal elapsed_time(0.);
unsigned int counter = 0;
for (int i=2; i<=3; ++i) {
for (int j=0; j<=i; ++j) {
for (int k=0; k<=j; ++k) {
// assemble matrix
// assemble matrix and apply weighting matrices
const FPoint cy(CellWidth*FReal(i), CellWidth*FReal(j), CellWidth*FReal(k));
FChebTensor<ORDER>::setRoots(cy, CellWidth, Y);
for (unsigned int n=0; n<nnodes; ++n)
for (unsigned int m=0; m<nnodes; ++m)
U[n*nnodes + m] = MatrixKernel->evaluate(X[m], Y[n]);
FReal weights[nnodes];
FChebTensor<ORDER>::setRootOfWeights(weights);
// now the entry-computer is responsible for weighting the matrix entries
EntryComputer<MatrixKernelClass> Computer(nnodes, X, nnodes, Y, weights);
// start timer
time.tic();
#if (defined ONLY_SVD || defined FULLY_PIVOTED_ACASVD)
Computer(0, nnodes, 0, nnodes, U);
#endif
/*
// applying weights ////////////////////////////////////////
FReal weights[nnodes];
FChebTensor<ORDER>::setRootOfWeights(weights);
......@@ -159,16 +288,28 @@ static void precompute(const MatrixKernelClass *const MatrixKernel, const FReal
FBlas::scal(nnodes, weights[n], U + n, nnodes); // scale rows
FBlas::scal(nnodes, weights[n], U + n * nnodes); // scale cols
}
//////////////////////////////////////////////////////////
*/
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
// ALL PREPROC FLAGS ARE SET ON TOP OF THIS FILE !!! /////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
#ifdef FACASVD ///////////////////////////////////////////////////////
// fully pivoted ACA
#if (defined FULLY_PIVOTED_ACASVD || defined PARTIALLY_PIVOTED_ACASVD) ////////////
FReal *UU, *VV;
unsigned int rank;
#ifdef FULLY_PIVOTED_ACASVD
fACA<ORDER>(U, Epsilon, UU, VV, rank);
#else
pACA(Computer, Epsilon, nnodes, nnodes, UU, VV, rank);
#endif
// QR decomposition
FReal* phi = new FReal [rank*rank];
......@@ -246,10 +387,20 @@ static void precompute(const MatrixKernelClass *const MatrixKernel, const FReal
delete [] UU;
delete [] VV;
elapsed_time = time.tacAndElapsed();
overall_time += elapsed_time;
std::cout << "(" << i << "," << j << "," << k << ") " << idx <<
", low rank = " << rank << " (" << aca_rank << ")" << std::endl;
", low rank = " << rank << " (" << aca_rank << ") in " << elapsed_time << "s" << std::endl;
#else
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
// ALL PREPROC FLAGS ARE SET ON TOP OF THIS FILE !!! /////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
#elif defined ONLY_SVD
// truncated singular value decomposition of matrix
INFO = FBlas::gesvd(nnodes, nnodes, U, S, VT, nnodes, LWORK, WORK);
if (INFO!=0) throw std::runtime_error("SVD did not converge with " + INFO);
......@@ -266,12 +417,23 @@ static void precompute(const MatrixKernelClass *const MatrixKernel, const FReal
for (unsigned int r=0; r<rank; ++r)
FBlas::copy(nnodes, VT + r, nnodes, K[idx] + rank*nnodes + r*nnodes, 1);
elapsed_time = time.tacAndElapsed();
overall_time += elapsed_time;
std::cout << "(" << i << "," << j << "," << k << ") " << idx <<
", low rank = " << rank << std::endl;
", low rank = " << rank << " in " << elapsed_time << "s" << std::endl;
#else
#error Either fully-, partially pivoted ACA or only SVD must be defined!
#endif ///////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
// ALL PREPROC FLAGS ARE SET ON TOP OF THIS FILE !!! /////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
// un-weighting ////////////////////////////////////////////
......@@ -285,7 +447,7 @@ static void precompute(const MatrixKernelClass *const MatrixKernel, const FReal
}
}
}
std::cout << "num interactions = " << counter << std::endl;
std::cout << "num interactions = " << counter << " in " << overall_time << "s" << std::endl;
delete [] U;
delete [] WORK;
delete [] VT;
......
......@@ -80,8 +80,8 @@ int main(int argc, char* argv[])
const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, "-sh", 2);
const unsigned int NbThreads = FParameters::getValue(argc, argv, "-t", 1);
const unsigned int ORDER = 5;
const FReal epsilon = FReal(1e-5);
const unsigned int ORDER = 7;
const FReal epsilon = FReal(1e-7);
// init timer
FTic time;
......@@ -168,7 +168,7 @@ int main(int argc, char* argv[])
{
omp_set_num_threads(NbThreads);
std::cout << "\nChebyshev FMM using" << omp_get_max_threads() << " threads ..." << std::endl;
std::cout << "\nChebyshev FMM using " << omp_get_max_threads() << " threads ..." << std::endl;
KernelClass kernels(TreeHeight, loader.getCenterOfBox(), loader.getBoxWidth(), epsilon);
FmmClass algorithm(&tree, &kernels);
time.tic();
......
// ===================================================================================
// Ce LOGICIEL "ScalFmm" est couvert par le copyright Inria 20xx-2012.
// Inria détient tous les droits de propriété sur le LOGICIEL, et souhaite que
// la communauté scientifique l'utilise afin de le tester et de l'évaluer.
// Inria donne gracieusement le droit d'utiliser ce LOGICIEL. Toute utilisation
// dans un but lucratif ou à des fins commerciales est interdite sauf autorisation
// expresse et préalable d'Inria.
// Toute utilisation hors des limites précisées ci-dessus et réalisée sans l'accord
// expresse préalable d'Inria constituerait donc le délit de contrefaçon.
// Le LOGICIEL étant un produit en cours de développement, Inria ne saurait assurer
// aucune responsabilité et notamment en aucune manière et en aucun cas, être tenu
// de répondre d'éventuels dommages directs ou indirects subits par l'utilisateur.
// Tout utilisateur du LOGICIEL s'engage à communiquer à Inria ses remarques
// relatives à l'usage du LOGICIEL
// ===================================================================================
// ==== CMAKE =====
// @FUSE_BLAS
// ================
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "../../Src/Utils/FTic.hpp"
#include "../../Src/Utils/FMath.hpp"
//#include "../../Src/Utils/FParameters.hpp"
//#include "../../Src/Containers/FVector.hpp"
//
////#include "../../Src/Utils/FAssertable.hpp"
#include "../../Src/Utils/FPoint.hpp"
//#include "../../Src/Kernels/Chebyshev/FChebParticle.hpp"
//#include "../../Src/Kernels/Chebyshev/FChebLeaf.hpp"
//#include "../../Src/Kernels/Chebyshev/FChebInterpolator.hpp"
#include "../../Src/Kernels/Chebyshev/FChebMatrixKernel.hpp"
#include "../../Src/Kernels/Chebyshev/FChebRoots.hpp"
#include "../../Src/Kernels/Chebyshev/FChebTensor.hpp"
#include "../../Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp"
FReal computeL2norm(unsigned int N, FReal *const u, FReal *const v)
{
FReal dot = FReal(0.);
FReal diff_dot = FReal(0.);
for (unsigned int i=0; i<N; ++i) {
FReal w = v[i] - u[i];
diff_dot += w * w;
dot += u[i] * u[i];
}
return FMath::Sqrt(diff_dot / dot);
}
FReal computeINFnorm(unsigned int N, FReal *const u, FReal *const v)
{
FReal max = FReal(0.);
FReal diff_max = FReal(0.);
for (unsigned int n=0; n<N; ++n) {
if ( max<std::abs(u[n])) max = std::abs(u[n]);
if (diff_max<std::abs(u[n]-v[n])) diff_max = std::abs(u[n]-v[n]);
}
return diff_max / max;
}
/**
* In this file we show how to use octree
*/
int main(int argc, char* argv[])
{
typedef FChebMatrixKernelR MatrixKernelClass;
MatrixKernelClass MatrixKernel;
const unsigned int ORDER = 10;
const FReal epsilon = 1e-10;
const unsigned int nnodes = TensorTraits<ORDER>::nnodes;
// cell size
FReal width = FReal(2.);
// centers of cells X and Y
FPoint cx(0., 0., 0.);
FPoint cy(FReal(2.)*width, 0., 0.);
// compute Cheb points in cells X and Y
FPoint rootsX[nnodes], rootsY[nnodes];
FChebTensor<ORDER>::setRoots(cx, width, rootsX);
FChebTensor<ORDER>::setRoots(cy, width, rootsY);
// initialize timer
FTic time;
// fully pivoted ACA ///////////////////////////
std::cout << "Fully pivoted ACA of Acc(" << ORDER << ", " << epsilon << ")" << std::endl;
std::cout << "|- Assembling K" << std::flush;
FReal *const K = new FReal [nnodes * nnodes];
time.tic();
EntryComputer<MatrixKernelClass> Computer(nnodes, rootsX, nnodes, rootsY);
Computer(0, nnodes, 0, nnodes, K);
std::cout << ", finished in " << time.tacAndElapsed() << "s." << std::endl;
// generate right hand side vector
FReal *const w = new FReal [nnodes];
const FReal FRandMax = FReal(RAND_MAX);
for (unsigned int j=0; j<nnodes; ++j)
w[j] = FReal(rand())/FRandMax;
// compute f0 = Kw
FReal *const f0 = new FReal [nnodes];
FBlas::gemv(nnodes, nnodes, FReal(1.), K, w, f0);
// call fACA /////////////////////////////////
std::cout << "|- Computing fACA" << std::flush;
time.tic();
FReal *U, *V;
unsigned int k;
fACA<ORDER>(K, epsilon, U, V, k);
std::cout << " (k = " << k << "), finished in " << time.tacAndElapsed() << "s." << std::endl;
delete [] K;
// compute f1 = UV'w
FReal *const f1 = new FReal [nnodes];
FReal *const c1 = new FReal [k];
FBlas::gemtv(nnodes, k, FReal(1.), V, w, c1);
FBlas::gemv( nnodes, k, FReal(1.), U, c1, f1);
delete [] c1;
std::cout << " |- L2 error = " << computeL2norm(nnodes, f0, f1) << std::endl;
delete [] U;
delete [] V;
// call pACA ///////////////////////////////////
std::cout << "|- Computing pACA" << std::flush;
time.tic();
// pACA(Computer, epsilon, nnodes, rootsX, nnodes, rootsY, U, V, k);
pACA(Computer, epsilon, nnodes, nnodes, U, V, k);
std::cout << " (k = " << k << "), finished in " << time.tacAndElapsed() << "s." << std::endl;
// compute f1 = UV'w
FReal *const f2 = new FReal [nnodes];
FReal *const c2 = new FReal [k];
FBlas::gemtv(nnodes, k, FReal(1.), V, w, c2);
FBlas::gemv( nnodes, k, FReal(1.), U, c2, f2);
delete [] c2;
std::cout << " |- L2 error = " << computeL2norm(nnodes, f0, f2) << std::endl;
delete [] U;
delete [] V;
delete [] w;
delete [] f0;
delete [] f1;
delete [] f2;
return 0;
}
// [--END--]
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment