Mentions légales du service

Skip to content
Snippets Groups Projects
Commit cd26d18e authored by hhakim's avatar hhakim
Browse files

Optimize the algorithms Faust::HierarchicalFactFGFT and Faust::Palm4MSAFGFT.

- D is now a MatSparse (before it was a MatDense).
- Avoiding copies at several locations.

NOTE about misc/test/src/Python/benchmark_Lap_diag.py: to measure the execution time relative to the original Matlab scripts (FAuST 1.03), a constant step must be used, as those scripts do (otherwise 2-norms are computed, which is slower).
parent 0b23bbd5
Branches
Tags
No related merge requests found
......@@ -221,7 +221,11 @@ void doit(HierarchicalFact<FPP,Cpu,FPP2>* hierFact, int argc, FPP expectedLambda
//relativeError 2
Faust::MatDense<FPP,Cpu> lapProduct, lapErr;
const Faust::MatDense<FPP,Cpu>& D = hierFactFFT->get_D();
const Faust::MatDense<FPP,Cpu>& D(hierFactFFT->get_D());
FPP testD[Lap.getNbRow()];
hierFactFFT->get_D(testD);
for(int i=0;i<Lap.getNbRow();i++)
assert(testD[i] == D(i,i));
lapErr = hierFactCore.get_product();
lapProduct = hierFactCore.get_product();
lapProduct.transpose();
......
......@@ -24,10 +24,11 @@ namespace Faust
handleError(m_className,"constructor : params and Laplacian matrix Lap haven't compatible size");
delete this->palm_global;
this->palm_global = new Palm4MSAFGFT<FPP,DEVICE,FPP2>(Lap, params, cublasHandle, true);
params.Display();
}
const MatDense<FPP, DEVICE>& get_D();
const MatSparse<FPP, DEVICE>& get_D();
void get_D(FPP* out_diag) const;
......
......@@ -5,11 +5,11 @@ const char * Faust::HierarchicalFactFGFT<FPP,DEVICE,FPP2>::m_className="Faust::H
template<typename FPP,Device DEVICE,typename FPP2>
void Faust::HierarchicalFactFGFT<FPP,DEVICE,FPP2>::get_D(FPP* out_diag) const
{
	// Copies the diagonal of D into the caller-provided buffer out_diag.
	// Delegates to the global PALM4MSA-FGFT instance, which owns D.
	// NOTE(review): the dynamic_cast result is used unchecked; this relies on
	// palm_global always being constructed as a Palm4MSAFGFT in this class's
	// ctor — confirm, or assert non-null before dereferencing.
	dynamic_cast<Palm4MSAFGFT<FPP,Cpu,FPP2>*>(this->palm_global)->get_D(out_diag);
}
template<typename FPP,Device DEVICE,typename FPP2>
const Faust::MatSparse<FPP, DEVICE>& Faust::HierarchicalFactFGFT<FPP,DEVICE,FPP2>::get_D()
{
	// Returns a const reference to D, now stored as a MatSparse (it was a
	// MatDense before the sparse-diagonal optimization). The reference is
	// owned by the global PALM4MSA-FGFT instance.
	// NOTE(review): dynamic_cast is unchecked here too — same invariant as
	// get_D(FPP*) above; confirm palm_global's dynamic type.
	return dynamic_cast<Palm4MSAFGFT<FPP,Cpu,FPP2>*>(this->palm_global)->get_D();
}
......@@ -143,6 +143,7 @@ namespace Faust
void update_L();
void update_R();
virtual void compute_lambda();
void compute_lambda(Faust::MatDense<FPP,DEVICE>& LorR);
static const char * m_className;
static const FPP lipschitz_multiplicator;
......
......@@ -374,6 +374,12 @@ t_local_compute_grad_over_c.stop();
template<typename FPP,Device DEVICE,typename FPP2>
void Faust::Palm4MSA<FPP,DEVICE,FPP2>::compute_lambda()
{
// Default lambda computation: use the member LorR (the current factor
// product). The overload taking a matrix argument exists so subclasses
// (e.g. Palm4MSAFGFT) can reuse the same computation on a precomputed
// product and avoid rebuilding/copying LorR.
compute_lambda(LorR);
}
template<typename FPP,Device DEVICE,typename FPP2>
void Faust::Palm4MSA<FPP,DEVICE,FPP2>::compute_lambda(Faust::MatDense<FPP,DEVICE>& LorR)
{
#ifdef __COMPILE_TIMERS__
t_global_compute_lambda.start();
......
......@@ -12,14 +12,13 @@ namespace Faust {
template<typename FPP, Device DEVICE, typename FPP2 = double>
class Palm4MSAFGFT : public Palm4MSA<FPP, DEVICE, FPP2>
{
MatDense<FPP, DEVICE> D; //TODO: later it will need to be Sparse (which needs to add a prototype overload for multiplication in faust_linear_algebra.h)
MatDense<FPP,DEVICE> D_grad_over_c; //TODO: move to sparse mat later
MatSparse<FPP, DEVICE> D;
MatDense<FPP,DEVICE> D_grad_over_c;
public:
//TODO: another ctor (like in Palm4MSA) for hierarchical algo. use
Palm4MSAFGFT(const ParamsPalmFGFT<FPP, DEVICE, FPP2>& params, const BlasHandle<DEVICE> blasHandle, const bool isGlobal=false);
Palm4MSAFGFT(const MatDense<FPP,DEVICE>& Lap, const ParamsFGFT<FPP,DEVICE,FPP2> & params, const BlasHandle<DEVICE> blasHandle, const bool isGlobal);
void next_step();
const MatDense<FPP, DEVICE>& get_D();
const MatSparse<FPP, DEVICE>& get_D();
void get_D(FPP* diag_data);
private:
void compute_grad_over_c();
......
#include <cstring>
// Ctor for stand-alone (non-hierarchical) use.
// @param params    PALM4MSA-FGFT parameters; params.init_D (dense) seeds D.
// @param blasHandle BLAS handle forwarded to the parent solver.
// @param isGlobal  whether this instance is the global (outer-loop) solver.
// D is now a MatSparse: the dense init_D is converted once here, at
// construction, so the per-iteration algorithm works on sparse D only.
template <typename FPP, Device DEVICE, typename FPP2>
Palm4MSAFGFT<FPP,DEVICE,FPP2>::Palm4MSAFGFT(const ParamsPalmFGFT<FPP, DEVICE, FPP2>& params, const BlasHandle<DEVICE> blasHandle, const bool isGlobal) : Palm4MSA<FPP,DEVICE,FPP2>(params, blasHandle, isGlobal), D(MatSparse<FPP,Cpu>(params.init_D))
{
	//TODO: is there something to check additionally to what parent's ctor checks ?
}
// Ctor for hierarchical-factorization use (called by HierarchicalFactFGFT).
// @param Lap       the (dense) Laplacian matrix to factorize.
// @param params    FGFT parameters; params.init_D (dense) seeds D.
// @param blasHandle BLAS handle forwarded to the parent solver.
// @param isGlobal  whether this instance is the global (outer-loop) solver.
// As in the other ctor, the dense init_D is converted to a MatSparse once
// at construction time.
template<typename FPP,Device DEVICE,typename FPP2>
Palm4MSAFGFT<FPP,DEVICE,FPP2>::Palm4MSAFGFT(const MatDense<FPP,DEVICE>& Lap, const ParamsFGFT<FPP,DEVICE,FPP2> & params, const BlasHandle<DEVICE> blasHandle, const bool isGlobal) : Palm4MSA<FPP,DEVICE,FPP2>(Lap, params, blasHandle, isGlobal), D(MatSparse<FPP,Cpu>(params.init_D))
{
}
template <typename FPP, Device DEVICE, typename FPP2>
void Palm4MSAFGFT<FPP,DEVICE,FPP2>::compute_grad_over_c()
{
......@@ -50,7 +51,6 @@ void Palm4MSAFGFT<FPP,DEVICE,FPP2>::compute_grad_over_c()
complexity[3] = L2*S2*R2 + L1*L2*R2 + L1*R2*S2 + L2*L1*S2;
int idx = distance(complexity.begin(), min_element(complexity.begin(), complexity.end()));
this->error = this->data;
Faust::MatDense<FPP,DEVICE> tmp1,tmp2,tmp3;
......@@ -88,9 +88,9 @@ void Palm4MSAFGFT<FPP,DEVICE,FPP2>::compute_grad_over_c()
multiply(this->RorL[this->m_indFact], tmp1, tmp2, this->blas_handle);
}
}
// tmp1 = L*this->S*R*D //TODO: review the mul with D being MatSparse
//TODO: optimize by determining the best product order regarding computation time
multiply(tmp2, D, tmp1, this->blas_handle);
// tmp1 = L*this->S*R*D
tmp1 = tmp2;
tmp1 *= D;
// this->error = lambda*tmp1*lambda*tmp2'-data // this->error is data before this call
gemm(tmp1, tmp2, this->error, this->m_lambda*this->m_lambda, (FPP)-1.0, 'N', this->TorH, this->blas_handle);
......@@ -137,6 +137,7 @@ void Palm4MSAFGFT<FPP,DEVICE,FPP2>::compute_grad_over_c()
}
this->isGradComputed = true;
#ifdef __COMPILE_TIMERS__
......@@ -154,18 +155,14 @@ void Palm4MSAFGFT<FPP,DEVICE,FPP2>::compute_lambda()
// Xhat = LorR*D*LorR' // LorR equals the prod of all factors after their update iterations (in loop of next_step())
MatDense<FPP,Cpu> tmp;
// tmp = D*LorR'
gemm(this->D, this->LorR, tmp, (FPP) 1.0, (FPP) 0.0, 'N', this->TorH, this->blas_handle);
Faust::spgemm(this->D, this->LorR, tmp, (FPP) 1.0, (FPP) 0.0, 'N', this->TorH/*, this->blas_handle*/);
// LorR = LorR*tmp
gemm(this->LorR, tmp, D_grad_over_c, (FPP) 1.0, (FPP) 0.0, 'N', 'N', this->blas_handle);
tmp = this->LorR;
this->LorR = D_grad_over_c;
//NOTE: D_grad_over_c has nothing to do here but is equal to LorR*D*LorR*D'
//NOTE: D_grad_over_c has nothing to do here but is equal to LorR*D*LorR'
// this product is thus not re-computed in compute_D_grad_over_c()
//TODO: avoid all these copies
// at this stage we can rely on parent function to compute lambda
Palm4MSA<FPP,DEVICE,FPP2>::compute_lambda();
Palm4MSA<FPP,DEVICE,FPP2>::compute_lambda(D_grad_over_c);
// reset LorR at the factor product to continue next iterations
this->LorR = tmp;
//then we finish the lambda computation with a sqrt() (Fro. norm)
this->m_lambda = std::sqrt(/*Faust::abs(*/this->m_lambda);//);
// (that's an additional operation in Palm4MSAFGFT)
......@@ -181,22 +178,15 @@ void Palm4MSAFGFT<FPP,DEVICE,FPP2>::next_step()
}
template <typename FPP, Device DEVICE, typename FPP2>
void Palm4MSAFGFT<FPP,DEVICE,FPP2>::compute_D()
{
	// Besides what the parent has done, update D by a gradient step:
	// D <- D - D_grad_over_c (diagonal part only).
	compute_D_grad_over_c();
	// D is a sparse diagonal matrix whose value buffer holds exactly the
	// diagonal entries (assumed in row order — confirm against MatSparse's
	// storage convention), so subtract the diagonal of the dense gradient
	// directly into D's buffer: no dense temporary, no full-matrix copy.
	//#pragma omp parallel for schedule(static)
	for(faust_unsigned_int i = 0; i < D.getNbCol();i++)
		D.getValuePtr()[i] = D.getValuePtr()[i]-D_grad_over_c.getData()[i*D.getNbCol()+i];
}
template <typename FPP, Device DEVICE, typename FPP2>
......@@ -205,10 +195,9 @@ void Palm4MSAFGFT<FPP,DEVICE,FPP2>::compute_D_grad_over_c()
// Uhat = lambda*LorR
// grad = 0.5*Uhat'*(Uhat*D*Uhat' - X)*Uhat
MatDense<FPP, Cpu> tmp;
//compute_lambda has already compute D_grad_over_c = LorR*D*LorR'
//compute_lambda has already computed D_grad_over_c = LorR*D*LorR'
D_grad_over_c.scalarMultiply(this->m_lambda*this->m_lambda);
D_grad_over_c -= this->data;
//TODO: opt. by determining best order of product
// tmp = Uhat'*(Uhat*D*Uhat' - X)
gemm(this->LorR, D_grad_over_c, tmp, (FPP) this->m_lambda, (FPP) 0., this->TorH, 'N', this->blas_handle);
// D_grad_over_c = 0.5*Uhat'*(Uhat*D*Uhat' - X)*Uhat
......@@ -216,7 +205,7 @@ void Palm4MSAFGFT<FPP,DEVICE,FPP2>::compute_D_grad_over_c()
}
template <typename FPP, Device DEVICE, typename FPP2>
const MatDense<FPP, DEVICE>& Palm4MSAFGFT<FPP,DEVICE,FPP2>::get_D()
const MatSparse<FPP, DEVICE>& Palm4MSAFGFT<FPP,DEVICE,FPP2>::get_D()
{
return this->D;
}
......@@ -224,9 +213,7 @@ const MatDense<FPP, DEVICE>& Palm4MSAFGFT<FPP,DEVICE,FPP2>::get_D()
template <typename FPP, Device DEVICE, typename FPP2>
void Palm4MSAFGFT<FPP,DEVICE,FPP2>::get_D(FPP* diag_data)
{
	// Copy D's diagonal into diag_data. Since D is a sparse diagonal matrix,
	// its nonzero value buffer IS the diagonal, so one contiguous memcpy
	// replaces the old per-element extraction from a dense buffer.
	// NOTE(review): assumes diag_data has room for getNbCol() entries and
	// that FPP is safely byte-copyable (true for float/double; works in
	// practice for std::complex) — confirm if FPP ever changes.
	memcpy(diag_data, this->D.getValuePtr(), sizeof(FPP)*this->D.getNbCol());
}
template <typename FPP, Device DEVICE, typename FPP2>
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment