Commit e3262e2e authored by BLANCHARD Pierre's avatar BLANCHARD Pierre

HMat: Factorize ACA routines; Provided routine for recompression (ChebSym...

HMat: Factorize ACA routines; Provided routine for recompression (ChebSym keeps its own recompression scheme, namely with storage in K); getRank can be found in namespace FSvd.
parent eb439e34
// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Berenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
//
// @SCALFMM_PRIVATE
//
#ifndef FACABLOCK_HPP
#define FACABLOCK_HPP
#include "Utils/FBlas.hpp"
#include "Utils/FAca.hpp"
/*! Choose either \a FULLY_PIVOTED_ACA or \a PARTIALLY_PIVOTED_ACA */
#define FULLY_PIVOTED_ACA
//#define PARTIALLY_PIVOTED_ACA
/*! Choose \a RECOMPRESSED_ACA */
//#define RECOMPRESSED_ACA
template <class FReal, int ORDER = 14>
class FACABlock{
protected:
// members
FReal* block;
FReal* U;
FReal* VT;
int nbRows;
int nbCols;
int level;
unsigned int rank;
FReal accuracy;
FACABlock(const FACABlock&) = delete;
FACABlock& operator=(const FACABlock&) = delete;
public:
FACABlock()
: block(nullptr), U(nullptr), VT(nullptr), nbRows(0), nbCols(0), level(0), rank(0), accuracy(FMath::pow(10.0,static_cast<FReal>(-ORDER))) {
}
// ctor
template <class ViewerClass>
void fill(const ViewerClass& viewer, const int inLevel){
clear();
// Allocate memory
level = inLevel;
nbRows = viewer.getNbRows();
nbCols = viewer.getNbCols();
#if (defined FULLY_PIVOTED_ACA)
block = new FReal[nbRows*nbCols];
for(int idxRow = 0 ; idxRow < nbRows ; ++idxRow){
for(int idxCol = 0 ; idxCol < nbCols ; ++idxCol){
block[idxCol*nbRows+idxRow] = viewer.getValue(idxRow,idxCol);
}
}
#endif
// SVD specific (col major)
rank = std::min(nbRows,nbCols);
//S = new FReal[rank];
FReal* _U ;// = new FReal[nbRows*nbCols]; // Call to computeSVD() copies block into U
FReal* _V;// = new FReal[rank*nbCols];
#if (defined FULLY_PIVOTED_ACA)
// Perform fully pivoted ACA
FAca::fACA<FReal>(block, nbRows, nbCols, accuracy, _U, _V, rank);
#elif (defined PARTIAL_PIVOTED_ACA)
// TODO
// Perform partially pivoted ACA
//FAca::fACA<FReal>(viewer, nbRows, nbCols, accuracy, _U, _V, rank);
#endif
// display rank
std::cout << "rank after ACA=" << rank << std::endl;
#if (defined RECOMPRESSED_ACA)
// Recompression by QR+SVD
FAca::recompress(_U,_V,nbRows,nbCols,accuracy,U,VT,rank);
// display rank after recompression
std::cout << " rank after QR+SVD=" << rank << std::endl;
#else
// Resize U and VT
U = new FReal[nbRows*rank];
for(int idxRow = 0 ; idxRow < nbRows ; ++idxRow)
for(int idxCol = 0 ; idxCol < rank ; ++idxCol)
U[idxCol*nbRows+idxRow] = _U[idxCol*nbRows+idxRow];
VT = new FReal[rank*nbCols];
for(int idxRow = 0 ; idxRow < rank ; ++idxRow)
for(int idxCol = 0 ; idxCol < nbCols ; ++idxCol)
VT[idxCol*rank+idxRow] = _V[idxRow*nbCols+idxCol]; // transposition
#endif
// Free memory
delete [] _U;
delete [] _V;
};
// dtor
~FACABlock(){
// Free memory
clear();
};
void clear(){
nbRows = 0;
nbCols = 0;
level = 0;
rank = 0;
delete[] block;
block = 0;
delete[] U;
U = 0;
delete[] VT;
VT = 0;
}
void gemv(FReal res[], const FReal vec[], const FReal scale = FReal(1.)) const {
//// Apply (dense) block
//FReal* res_dense = new FReal[nbRows];
//FBlas::copy(nbRows,res,res_dense);
//FBlas::gemva(nbRows, nbCols, scale, const_cast<FReal*>(block), const_cast<FReal*>(vec), res_dense);
// Apply low-rank block
FReal* VTvec = new FReal[rank];
FBlas::setzero(rank,VTvec);
// Apply VT
FBlas::gemv(rank, nbCols, scale, const_cast<FReal*>(VT), const_cast<FReal*>(vec), VTvec);
// Apply U
FBlas::gemva(nbRows, rank, scale, const_cast<FReal*>(U), const_cast<FReal*>(VTvec), res);
}
void gemm(FReal res[], const FReal mat[], const int nbRhs, const FReal scale = FReal(1.)) const {
//// Apply (dense) block
//FBlas::gemma(nbRows, nbCols, nbRhs, scale, const_cast<FReal*>(block), nbRows, const_cast<FReal*>(mat), nbCols, res, nbRows);
// Apply low-rank block
FReal* VTmat = new FReal[nbCols*nbRhs];
// Apply VT
FBlas::gemm(rank, nbCols, nbRhs, scale, const_cast<FReal*>(VT), rank, const_cast<FReal*>(mat), nbCols, VTmat, rank);
// Apply U
FBlas::gemma(nbRows, rank, nbRhs, scale, const_cast<FReal*>(U), nbRows, const_cast<FReal*>(VTmat), rank, res, nbRows);
}
};
#endif // FACABLOCK_HPP
// [--END--]
......@@ -20,6 +20,8 @@
#include "../Src/Viewers/FMatDense.hpp"
#include "../Src/Blocks/FDenseBlock.hpp"
#include "../Src/Blocks/FSVDBlock.hpp"
#include "../Src/Blocks/FACABlock.hpp"
#include "Utils/FParameters.hpp"
#include "Utils/FParameterNames.hpp"
......@@ -66,11 +68,11 @@ int main(int argc, char** argv){
{
std::cout << "Test Dense:\n";
//typedef FDenseBlock<FReal> LeafClass;
typedef FDenseBlock<FReal> LeafClass;
//typedef FDenseBlock<FReal> CellClass;
typedef FSVDBlock<FReal,7> LeafClass;
typedef FSVDBlock<FReal,7> CellClass;
//typedef FSVDBlock<FReal,7> CellClass;
typedef FACABlock<FReal,7> CellClass;
typedef FStaticDiagonalBisection<FReal, LeafClass, CellClass> GridClass;
GridClass grid(dim, height);
......@@ -98,10 +100,11 @@ int main(int argc, char** argv){
std::cout << "Test Dense with partitions:\n";
//typedef FDenseBlock<FReal> LeafClass;
typedef FDenseBlock<FReal> LeafClass;
//typedef FDenseBlock<FReal> CellClass;
typedef FSVDBlock<FReal,7> LeafClass;
typedef FSVDBlock<FReal,7> CellClass;
//typedef FSVDBlock<FReal,7> CellClass;
typedef FACABlock<FReal,7> CellClass;
typedef FStaticDiagonalBisection<FReal, LeafClass, CellClass> GridClass;
const int nbPartitions = FMath::pow2(height-1);
......
......@@ -21,6 +21,8 @@
#include "Utils/FBlas.hpp"
#include "Utils/FAca.hpp"
#include "FChebTensor.hpp"
#include "../Interpolation/FInterpSymmetries.hpp"
#include "FChebM2LHandler.hpp"
......@@ -40,221 +42,6 @@
/*! The fully pivoted adaptive cross approximation (fACA) compresses a
far-field interaction as \f$K\sim UV^\top\f$. The fACA requires all entries
to be computed beforehand, then the compression follows in
\f$\mathcal{O}(2\ell^3k)\f$ operations based on the required accuracy
\f$\varepsilon\f$. The matrix K will be destroyed as a result.
@param[in] K far-field to be approximated
@param[in] nx number of rows
@param[in] ny number of cols
@param[in] eps prescribed accuracy
@param[out] U matrix containing \a k column vectors
@param[out] V matrix containing \a k row vectors
@param[out] k final low-rank depends on prescribed accuracy \a eps
*/
template <class FReal>
void fACA(FReal *const K,
const unsigned int nx, const unsigned int ny,
const double eps, FReal* &U, FReal* &V, unsigned int &k)
{
// control vectors (true if not used, false if used)
bool *const r = new bool[nx];
bool *const c = new bool[ny];
for (unsigned int i=0; i<nx; ++i) r[i] = true;
for (unsigned int j=0; j<ny; ++j) c[j] = true;
// compute Frobenius norm of original Matrix K
FReal norm2K = 0;
for (unsigned int j=0; j<ny; ++j) {
const FReal *const colK = K + j*nx;
norm2K += FBlas::scpr(nx, colK, colK);
}
// initialize rank k and UV'
k = 0;
const unsigned int maxk = (nx + ny) / 2;
U = new FReal[nx * maxk];
V = new FReal[ny * maxk];
FBlas::setzero(nx*maxk, U);
FBlas::setzero(ny*maxk, V);
FReal norm2R;
////////////////////////////////////////////////
// start fully pivoted ACA
do {
// find max(K) and argmax(K)
FReal maxK = 0.;
unsigned int pi=0, pj=0;
for (unsigned int j=0; j<ny; ++j)
if (c[j]) {
const FReal *const colK = K + j*nx;
for (unsigned int i=0; i<nx; ++i)
if (r[i] && maxK < FMath::Abs(colK[i])) {
maxK = FMath::Abs(colK[i]);
pi = i;
pj = j;
}
}
// copy pivot cross into U and V
FReal *const colU = U + k*nx;
FReal *const colV = V + k*ny;
const FReal pivot = K[pj*nx + pi];
for (unsigned int i=0; i<nx; ++i) if (r[i]) colU[i] = K[pj*nx + i];
for (unsigned int j=0; j<ny; ++j) if (c[j]) colV[j] = K[j *nx + pi] / pivot;
// don't use these cols and rows anymore
c[pj] = false;
r[pi] = false;
// subtract k-th outer product from K
for (unsigned int j=0; j<ny; ++j)
if (c[j]) {
FReal *const colK = K + j*nx;
FBlas::axpy(nx, FReal(-1. * colV[j]), colU, colK);
}
// compute Frobenius norm of updated K
norm2R = 0.0;
for (unsigned int j=0; j<ny; ++j)
if (c[j]) {
const FReal *const colK = K + j*nx;
norm2R += FBlas::scpr(nx, colK, colK);
}
// increment rank k
++k ;
} while (norm2R > eps*eps * norm2K);
////////////////////////////////////////////////
delete [] r;
delete [] c;
}
/*! The partially pivoted adaptive cross approximation (pACA) compresses a
far-field interaction as \f$K\sim UV^\top\f$. The pACA computes the matrix
entries on the fly, as they are needed. The compression follows in
\f$\mathcal{O}(2\ell^3k)\f$ operations based on the required accuracy
\f$\varepsilon\f$. The matrix K will be destroyed as a result.
@tparam ComputerType the functor type which allows to compute matrix entries
@param[in] Computer the entry-computer functor
@param[in] eps prescribed accuracy
@param[in] nx number of rows
@param[in] ny number of cols
@param[out] U matrix containing \a k column vectors
@param[out] V matrix containing \a k row vectors
@param[out] k final low-rank depends on prescribed accuracy \a eps
*/
template <class FReal, typename ComputerType>
void pACA(const ComputerType& Computer,
const unsigned int nx, const unsigned int ny,
const FReal eps, FReal* &U, FReal* &V, unsigned int &k)
{
// control vectors (true if not used, false if used)
bool *const r = new bool[nx];
bool *const c = new bool[ny];
for (unsigned int i=0; i<nx; ++i) r[i] = true;
for (unsigned int j=0; j<ny; ++j) c[j] = true;
// initialize rank k and UV'
k = 0;
const FReal eps2 = eps * eps;
const unsigned int maxk = (nx + ny) / 2;
U = new FReal[nx * maxk];
V = new FReal[ny * maxk];
// initialize norm
FReal norm2S(0.);
FReal norm2uv(0.);
////////////////////////////////////////////////
// start partially pivoted ACA
unsigned int J = 0, I = 0;
do {
FReal *const colU = U + nx*k;
FReal *const colV = V + ny*k;
////////////////////////////////////////////
// compute row I and its residual
Computer(I, I+1, 0, ny, colV);
r[I] = false;
for (unsigned int l=0; l<k; ++l) {
FReal *const u = U + nx*l;
FReal *const v = V + ny*l;
FBlas::axpy(ny, FReal(-1. * u[I]), v, colV);
}
// find max of residual and argmax
FReal maxval = 0.;
for (unsigned int j=0; j<ny; ++j) {
const FReal abs_val = FMath::Abs(colV[j]);
if (c[j] && maxval < abs_val) {
maxval = abs_val;
J = j;
}
}
// find pivot and scale column of V
const FReal pivot = FReal(1.) / colV[J];
FBlas::scal(ny, pivot, colV);
////////////////////////////////////////////
// compute col J and its residual
Computer(0, nx, J, J+1, colU);
c[J] = false;
for (unsigned int l=0; l<k; ++l) {
FReal *const u = U + nx*l;
FReal *const v = V + ny*l;
FBlas::axpy(nx, FReal(-1. * v[J]), u, colU);
}
// find max of residual and argmax
maxval = 0.0;
for (unsigned int i=0; i<nx; ++i) {
const FReal abs_val = FMath::Abs(colU[i]);
if (r[i] && maxval < abs_val) {
maxval = abs_val;
I = i;
}
}
////////////////////////////////////////////
// increment Frobenius norm: |Sk|^2 += |uk|^2 |vk|^2 + 2 sumj ukuj vjvk
FReal normuuvv(0.);
for (unsigned int l=0; l<k; ++l)
normuuvv += FBlas::scpr(nx, colU, U + nx*l) * FBlas::scpr(ny, V + ny*l, colV);
norm2uv = FBlas::scpr(nx, colU, colU) * FBlas::scpr(ny, colV, colV);
norm2S += norm2uv + 2*normuuvv;
////////////////////////////////////////////
// increment low-rank
++k;
} while (norm2uv > eps2 * norm2S);
delete [] r;
delete [] c;
}
/*! Precomputes the 16 far-field interactions (due to symmetries in their
arrangement all 316 far-field interactions can be represented by
permutations of the 16 we compute in this function). Depending on whether
......@@ -338,9 +125,9 @@ static void precompute(const MatrixKernelClass *const MatrixKernel, const FReal
unsigned int rank;
#ifdef FULLY_PIVOTED_ACASVD
fACA(U, nnodes, nnodes, Epsilon, UU, VV, rank);
FAca::pACA(U, nnodes, nnodes, Epsilon, UU, VV, rank);
#else
pACA(Computer, nnodes, nnodes, Epsilon, UU, VV, rank);
FAca::fACA(Computer, nnodes, nnodes, Epsilon, UU, VV, rank);
#endif
// QR decomposition
......
// ===================================================================================
// Copyright ScalFmm 2011 INRIA
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FACA_HPP
#define FACA_HPP
#include "FGlobal.hpp"
#include "FSvd.hpp"
/**
* @class FAca
*
* @brief This class provides the function to perform fully or partially pivoted ACA.
*
*/
namespace FAca {
/*! The fully pivoted adaptive cross approximation (fACA) compresses a
far-field interaction as \f$K\sim UV^\top\f$. The fACA requires all entries
to be computed beforehand, then the compression follows in
\f$\mathcal{O}(2\ell^3k)\f$ operations based on the required accuracy
\f$\varepsilon\f$. The matrix K will be destroyed as a result.
@param[in] K far-field to be approximated
@param[in] nx number of rows
@param[in] ny number of cols
@param[in] eps prescribed accuracy
@param[out] U matrix containing \a k column vectors
@param[out] V matrix containing \a k row vectors
@param[out] k final low-rank depends on prescribed accuracy \a eps
*/
template <class FReal>
void fACA(FReal *const K,
const unsigned int nx, const unsigned int ny,
const double eps, FReal* &U, FReal* &V, unsigned int &k)
{
// control vectors (true if not used, false if used)
bool *const r = new bool[nx];
bool *const c = new bool[ny];
for (unsigned int i=0; i<nx; ++i) r[i] = true;
for (unsigned int j=0; j<ny; ++j) c[j] = true;
// compute Frobenius norm of original Matrix K
FReal norm2K = 0;
for (unsigned int j=0; j<ny; ++j) {
const FReal *const colK = K + j*nx;
norm2K += FBlas::scpr(nx, colK, colK);
}
// initialize rank k and UV'
k = 0;
const unsigned int maxk = (nx + ny) / 2;
U = new FReal[nx * maxk];
V = new FReal[ny * maxk];
FBlas::setzero(nx*maxk, U);
FBlas::setzero(ny*maxk, V);
FReal norm2R;
////////////////////////////////////////////////
// start fully pivoted ACA
do {
// find max(K) and argmax(K)
FReal maxK = 0.;
unsigned int pi=0, pj=0;
for (unsigned int j=0; j<ny; ++j)
if (c[j]) {
const FReal *const colK = K + j*nx;
for (unsigned int i=0; i<nx; ++i)
if (r[i] && maxK < FMath::Abs(colK[i])) {
maxK = FMath::Abs(colK[i]);
pi = i;
pj = j;
}
}
// copy pivot cross into U and V
FReal *const colU = U + k*nx;
FReal *const colV = V + k*ny;
const FReal pivot = K[pj*nx + pi];
for (unsigned int i=0; i<nx; ++i) if (r[i]) colU[i] = K[pj*nx + i];
for (unsigned int j=0; j<ny; ++j) if (c[j]) colV[j] = K[j *nx + pi] / pivot;
// don't use these cols and rows anymore
c[pj] = false;
r[pi] = false;
// subtract k-th outer product from K
for (unsigned int j=0; j<ny; ++j)
if (c[j]) {
FReal *const colK = K + j*nx;
FBlas::axpy(nx, FReal(-1. * colV[j]), colU, colK);
}
// compute Frobenius norm of updated K
norm2R = 0.0;
for (unsigned int j=0; j<ny; ++j)
if (c[j]) {
const FReal *const colK = K + j*nx;
norm2R += FBlas::scpr(nx, colK, colK);
}
// increment rank k
++k ;
} while (norm2R > eps*eps * norm2K);
////////////////////////////////////////////////
delete [] r;
delete [] c;
}
/*! The partially pivoted adaptive cross approximation (pACA) compresses a
far-field interaction as \f$K\sim UV^\top\f$. The pACA computes the matrix
entries on the fly, as they are needed. The compression follows in
\f$\mathcal{O}(2\ell^3k)\f$ operations based on the required accuracy
\f$\varepsilon\f$.