Commit cfffefd7 authored by hhakim

Attempt to optimize GivensFGFT(Parallel) for the case where the L residual is a MatSparse.

It didn't show any performance improvement, so the optimization is disabled but kept in the code (the compilation constant OPT_UPDATE_SPARSE_L must be defined for the optimization to be enabled).

The optimization concerns the computation of the product L = S'*L*S. The attempt is similar to what has been done for the case where L is dense (where it works well).
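
For reference, here is the update the optimized path implements (it mirrors the comments in the code below). S is the Givens rotation acting on the (p, q) plane, with c = cos(theta) and s = sin(theta), so L = S'*L*S only touches rows p and q (for S'*L) and columns p and q (for L*S):

    L(p,:) =  c*L(p,:) + s*L(q,:)        row update (S'*L)
    L(q,:) = -s*L(p,:) + c*L(q,:)
    L(:,p) =  c*L(:,p) + s*L(:,q)        column update (L*S)
    L(:,q) = -s*L(:,p) + c*L(:,q)

In principle each of the two products then reduces to recombining two sparse rows/columns; in the committed code only the S'*L part uses this path, the L*S part still goes through multiplyRight.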

Here are some results:
J=256, t=128
with "optimization"
>>python$ for i in {1..10}; do python3 test_eigtj_mira_lap_sparse.py;done | grep time | awk '{print $2}'
0.4260462010000001
0.4810052160000007
0.4244757539999995
0.4957183700000005
0.4338328650000003
0.4226386240000002
0.4573114149999995
0.4332176910000003
0.4304637250000001
0.4894571430000001

without "optimization"
>>python$ for i in {1..10}; do python3 test_eigtj_mira_lap_sparse.py;done | grep time | awk '{print $2}'
0.4188697339999994
0.415433846
0.41125124199999963
0.41828689799999985
0.4699177429999999
0.4142650020000005
0.46411832699999955
0.42669971800000006
0.4497221189999996
0.4138595669999994

other runs with J=32, t=2
with "optimization"
>>python$ for i in {1..10}; do python3 test_eigtj_mira_lap_sparse.py;done | grep time | awk '{print $2}'
2.7853622209999997
2.8483473470000007
3.12851224
2.8791390860000003
2.84674213
2.8365956349999992
2.7762758190000003
2.85138975
2.8108322590000006
2.9169309619999995

without "optimization"
>>python$ for i in {1..10}; do python3 test_eigtj_mira_lap_sparse.py;done | grep time | awk '{print $2}'
2.787849833
2.8399939510000003
2.809797146
2.8418070140000005
2.7759090029999998
2.7810152440000007
2.833329901
2.772917798
2.846831988000001
2.765091783
parent 8bf4da62
@@ -172,6 +172,8 @@ namespace Faust {
*/
void update_L_second(Faust::Vect<FPP,DEVICE>& L_vec_p, Faust::Vect<FPP,DEVICE>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, MatDense<FPP,DEVICE> & L);
void update_L_first(Eigen::SparseMatrix<FPP, RowMajor> & L_vec_p, Eigen::SparseMatrix<FPP, RowMajor>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, Faust::MatSparse<FPP,DEVICE> & L);
void update_L_second(Eigen::SparseMatrix<FPP, RowMajor> & L_vec_p, Eigen::SparseMatrix<FPP, RowMajor>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, Faust::MatSparse<FPP,DEVICE> & L);
/**
* \brief Algo. step 2.5.
*
......
@@ -222,9 +222,21 @@ void GivensFGFT<FPP,DEVICE,FPP2>::update_L(Faust::MatDense<FPP,Cpu> & L)
template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFT<FPP,DEVICE,FPP2>::update_L(Faust::MatSparse<FPP,Cpu> & L)
{
//#define OPT_UPDATE_SPARSE_L
#undef OPT_UPDATE_SPARSE_L
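// NOTE: the #undef above disables the optimized path unconditionally; uncomment the #define (and drop the #undef) to enable it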
#ifndef OPT_UPDATE_SPARSE_L
// L = S'*L*S
facts[ite].multiply(L, 'T');
L.multiplyRight(facts[ite]);
#else
Eigen::SparseMatrix<FPP,RowMajor> L_vec_p, L_vec_q;
FPP2 c = *(fact_mod_values.end()-1); // cos(theta)
FPP2 s = *(fact_mod_values.end()-2); // sin(theta)
update_L_first(L_vec_p, L_vec_q, c, s, this->p, this->q, L);
L.update_dim();
// update_L_second(L_vec_p, L_vec_q, c, s, this->p, this->q, L);
L.multiplyRight(facts[ite]);
#endif
}
template<typename FPP, Device DEVICE, typename FPP2>
@@ -284,6 +296,57 @@ void GivensFGFT<FPP,DEVICE,FPP2>::update_L_second(Faust::Vect<FPP,DEVICE>& L_vec
memcpy(L.getData()+L.getNbRow()*q, tmp.getData(), sizeof(FPP)*L.getNbRow());
}
template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFT<FPP,DEVICE,FPP2>::update_L_second(Eigen::SparseMatrix<FPP,RowMajor > & L_vec_p, Eigen::SparseMatrix<FPP, RowMajor>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, Faust::MatSparse<FPP,DEVICE> & L)
{
Eigen::SparseMatrix<FPP, RowMajor> tmp, tmp2;
/*========== L *= S */
//L_vec_p = L.get_col(p), L_vec_q = L.get_col(q);
L_vec_p = L.mat.block(0, p, L.getNbRow(), 1);
L_vec_q = L.mat.block(0, q, L.getNbRow(), 1);
// L(:,p) = c*L(:,p) + s*L(:,q)
tmp = L_vec_p;
tmp *= c;
tmp2 = L_vec_q;
tmp2 *= s;
tmp += tmp2;
L.mat.col(p) = tmp;
// L(:,q) = -s*L(:,p) + c*L(:,q)
tmp = L_vec_p;
tmp *= -s;
tmp2 = L_vec_q;
tmp2 *= c;
tmp += tmp2;
L.mat.col(q) = tmp;
}
template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFT<FPP,DEVICE,FPP2>::update_L_first(Eigen::SparseMatrix<FPP, RowMajor> & L_vec_p, Eigen::SparseMatrix<FPP, RowMajor>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, Faust::MatSparse<FPP,DEVICE> & L)
{
Eigen::SparseMatrix<FPP, RowMajor> tmp, tmp2, tmp3;
/*========== L = S'*L */
L_vec_p = L.mat.innerVector(p), L_vec_q = L.mat.innerVector(q);
// L_vec_p = L.mat.block(p, 0, 1, L.getNbCol());
// L_vec_q = L.mat.block(q, 0, 1, L.getNbCol());
// L(p,:) = c*L(p,:) + s*L(q,:)
tmp = L_vec_p;
tmp *= c;
tmp2 = L_vec_q;
tmp2 *= s;
tmp += tmp2;
L.mat.innerVector(p) = tmp;
//
//
// L(q,:) = -s*L(p,:) + c*L(q,:)
tmp = L_vec_p;
tmp *= -s;
tmp2 = L_vec_q;
tmp2 *= c;
tmp += tmp2;
L.mat.innerVector(q) = tmp;
}
template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFT<FPP,DEVICE,FPP2>::update_D()
{
......
@@ -239,8 +239,72 @@ template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFTParallel<FPP,DEVICE,FPP2>::update_L(Faust::MatSparse<FPP,Cpu> & L)
{
// L = S'*L*S
#ifdef OPT_UPDATE_SPARSE_L
int choice_id;
FPP2 c,s;
int p, q, i;
// L = S'*L
// HINT to enable OpenMP: add the -fopenmp flag (to the compilation and linker flags in CMakeCache)
// likewise in setup.py for the python wrapper (in the extra_compile_args and extra_link_args lists -- setuptools)
// NOTE: the OpenMP attempt was not really useful: no speedup was observed with two or four threads (running on a 4-core CPU)
// The code is nevertheless kept here, just in case; to use it, define the compilation/preprocessor constant OPT_UPDATE_L_OMP
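// Illustrative sketch of those flags (hypothetical values, adapt to the actual build files):
//   CMakeCache.txt: CMAKE_CXX_FLAGS:STRING=-fopenmp and CMAKE_EXE_LINKER_FLAGS:STRING=-fopenmp
//   setup.py:       Extension(..., extra_compile_args=['-fopenmp'], extra_link_args=['-fopenmp'])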
#ifdef OPT_UPDATE_L_OMP
#pragma omp parallel private(c, s, choice_id, p, q, i)
#endif
{
Eigen::SparseMatrix<FPP, Eigen::RowMajor> L_vec_p, L_vec_q;
#ifdef OPT_UPDATE_L_OMP
int num_per_th = fact_nrots/omp_get_num_threads();
int th_id = omp_get_thread_num();
int th_offset = fact_nrots/omp_get_num_threads()*th_id;
if(th_id == omp_get_num_threads() - 1) num_per_th += fact_nrots % omp_get_num_threads();
#else
int num_per_th = fact_nrots; // a single thread is used (no openmp)
int th_id = 0;
int th_offset = 0;
#endif
//first attempt to parallelize the loops with OpenMP (abandoned in favor of the single parallel section started above)
//#ifdef OPT_UPDATE_L_OMP
//#pragma omp parallel for schedule(static) private(c, s, choice_id, p, q, i) default(shared) // num_threads(4)
//#endif OPT_UPDATE_L_OMP
for(i=th_offset; i < th_offset+num_per_th; i++)
{
// the last rotation submatrix is applied first
choice_id = this->coord_choices.size()-1-i;
c = *(this->fact_mod_values.end()-1-4*i); // cos(theta)
s = *(this->fact_mod_values.end()-2*(2*i+1)); // sin(theta)
p = this->coord_choices[choice_id].first;
q = this->coord_choices[choice_id].second;
//rely on the parent class to do the job
this->update_L_first(L_vec_p, L_vec_q, c, s, p, q, L);
}
#ifdef OPT_UPDATE_L_OMP
#pragma omp barrier //S'*L must be computed before starting L*S computation
#endif
// L = L*S
//#ifdef OPT_UPDATE_L_OMP
//#pragma omp parallel for schedule(static) private(c, s, choice_id, p, q, i) default(shared) // num_threads(4)
//#endif
// for(i=th_offset; i < th_offset+num_per_th; i++)
// {
// // applying first the last rotation submatrix
// choice_id = this->coord_choices.size()-1-i;
// c = *(this->fact_mod_values.end()-1-4*i); // cos(theta)
// s = *(this->fact_mod_values.end()-2*(2*i+1)); // sin(theta)
// p = this->coord_choices[choice_id].first;
// q = this->coord_choices[choice_id].second;
// //rely on parent for doing the job
// this->update_L_second(L_vec_p, L_vec_q, c, s, p, q, L);
// L.update_dim();
// }
L.multiplyRight(this->facts[this->ite]);
// L.update_dim();
}
#else
// L = S'*L*S
this->facts[this->ite].multiply(L, 'T');
L.multiplyRight(this->facts[this->ite]);
#endif
}
......
@@ -100,11 +100,17 @@ namespace Faust
//! Faust::MatDense class template of dense matrix
template<typename FPP,Device DEVICE> class MatDense;
template<typename FPP, Device DEVICE, typename FPP2> class GivensFGFT;
template<typename FPP, Device DEVICE, typename FPP2> class GivensFGFTParallel;
template<typename FPP>
class MatSparse<FPP,Cpu> : public Faust::MatGeneric<FPP,Cpu>
{
friend Faust::GivensFGFT<FPP,Cpu, double>;
friend Faust::GivensFGFTParallel<FPP,Cpu, double>;
friend Faust::GivensFGFT<FPP,Cpu, float>;
friend Faust::GivensFGFTParallel<FPP,Cpu, float>;
friend Faust::TransformHelper<FPP,Cpu>; // TODO: limit to needed member functions only
friend void Faust::wht_factors<>(unsigned int n, vector<MatGeneric<FPP,Cpu>*>& factors, const bool, const bool);
friend class MatDense<FPP,Cpu>;
......