Commit cfffefd7 authored by hhakim

Attempt to optimize GivensFGFT(Parallel) for the case where the L residual is a MatSparse.

It didn't show any performance improvement, so the optimization is disabled but kept in the code (the compilation constant OPT_UPDATE_SPARSE_L must be defined for the optimization to be enabled).

The optimization concerns the computation of the product L = S'*L*S. The attempt is similar to what has been done for the case where L is dense (where it works well).
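
For reference, here is the update the optimized path implements (it mirrors the comments in the code below). S is the Givens rotation acting on the (p, q) plane, with c = cos(theta) and s = sin(theta), so L = S'*L*S only touches rows p and q (for S'*L) and columns p and q (for L*S):

    L(p,:) =  c*L(p,:) + s*L(q,:)        row update (S'*L)
    L(q,:) = -s*L(p,:) + c*L(q,:)
    L(:,p) =  c*L(:,p) + s*L(:,q)        column update (L*S)
    L(:,q) = -s*L(:,p) + c*L(:,q)

In principle each of the two products then reduces to recombining two sparse rows/columns; in the committed code only the S'*L part uses this path, the L*S part still goes through multiplyRight.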

Here are some results:
J=256, t=128
with "optimization"
>>python$ for i in {1..10}; do python3 test_eigtj_mira_lap_sparse.py;done | grep time | awk '{print $2}'
0.4260462010000001
0.4810052160000007
0.4244757539999995
0.4957183700000005
0.4338328650000003
0.4226386240000002
0.4573114149999995
0.4332176910000003
0.4304637250000001
0.4894571430000001

without "optimization"
>>python$ for i in {1..10}; do python3 test_eigtj_mira_lap_sparse.py;done | grep time | awk '{print $2}'
0.4188697339999994
0.415433846
0.41125124199999963
0.41828689799999985
0.4699177429999999
0.4142650020000005
0.46411832699999955
0.42669971800000006
0.4497221189999996
0.4138595669999994

other runs with J=32, t=2
with "optimization"
>>python$ for i in {1..10}; do python3 test_eigtj_mira_lap_sparse.py;done | grep time | awk '{print $2}'
2.7853622209999997
2.8483473470000007
3.12851224
2.8791390860000003
2.84674213
2.8365956349999992
2.7762758190000003
2.85138975
2.8108322590000006
2.9169309619999995

without "optimization"
>>python$ for i in {1..10}; do python3 test_eigtj_mira_lap_sparse.py;done | grep time | awk '{print $2}'
2.787849833
2.8399939510000003
2.809797146
2.8418070140000005
2.7759090029999998
2.7810152440000007
2.833329901
2.772917798
2.846831988000001
2.765091783
parent 8bf4da62
@@ -172,6 +172,8 @@ namespace Faust {
*/
void update_L_second(Faust::Vect<FPP,DEVICE>& L_vec_p, Faust::Vect<FPP,DEVICE>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, MatDense<FPP,DEVICE> & L);
void update_L_first(Eigen::SparseMatrix<FPP, RowMajor> & L_vec_p, Eigen::SparseMatrix<FPP, RowMajor>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, Faust::MatSparse<FPP,DEVICE> & L);
void update_L_second(Eigen::SparseMatrix<FPP, RowMajor> & L_vec_p, Eigen::SparseMatrix<FPP, RowMajor>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, Faust::MatSparse<FPP,DEVICE> & L);
/**
* \brief Algo. step 2.5.
*
......
@@ -222,9 +222,21 @@ void GivensFGFT<FPP,DEVICE,FPP2>::update_L(Faust::MatDense<FPP,Cpu> & L)
template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFT<FPP,DEVICE,FPP2>::update_L(Faust::MatSparse<FPP,Cpu> & L)
{
//#define OPT_UPDATE_SPARSE_L
#undef OPT_UPDATE_SPARSE_L
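// NOTE: the #undef above disables the optimized path unconditionally; uncomment the #define (and drop the #undef) to enable it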
#ifndef OPT_UPDATE_SPARSE_L
// L = S'*L*S
facts[ite].multiply(L, 'T');
L.multiplyRight(facts[ite]);
#else
Eigen::SparseMatrix<FPP,RowMajor> L_vec_p, L_vec_q;
FPP2 c = *(fact_mod_values.end()-1); // cos(theta)
FPP2 s = *(fact_mod_values.end()-2); // sin(theta)
update_L_first(L_vec_p, L_vec_q, c, s, this->p, this->q, L);
L.update_dim();
// update_L_second(L_vec_p, L_vec_q, c, s, this->p, this->q, L);
L.multiplyRight(facts[ite]);
#endif
}
template<typename FPP, Device DEVICE, typename FPP2>
@@ -284,6 +296,57 @@ void GivensFGFT<FPP,DEVICE,FPP2>::update_L_second(Faust::Vect<FPP,DEVICE>& L_vec
memcpy(L.getData()+L.getNbRow()*q, tmp.getData(), sizeof(FPP)*L.getNbRow());
}
template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFT<FPP,DEVICE,FPP2>::update_L_second(Eigen::SparseMatrix<FPP,RowMajor > & L_vec_p, Eigen::SparseMatrix<FPP, RowMajor>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, Faust::MatSparse<FPP,DEVICE> & L)
{
Eigen::SparseMatrix<FPP, RowMajor> tmp, tmp2;
/*========== L *= S */
//L_vec_p = L.get_col(p), L_vec_q = L.get_col(q);
L_vec_p = L.mat.block(0, p, L.getNbRow(), 1);
L_vec_q = L.mat.block(0, q, L.getNbRow(), 1);
// L(:,p) = c*L(:,p) + s*L(:,q)
tmp = L_vec_p;
tmp *= c;
tmp2 = L_vec_q;
tmp2 *= s;
tmp += tmp2;
L.mat.col(p) = tmp;
// L(:,q) = -s*L(:,p) + c*L(:,q)
tmp = L_vec_p;
tmp *= -s;
tmp2 = L_vec_q;
tmp2 *= c;
tmp += tmp2;
L.mat.col(q) = tmp;
}
template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFT<FPP,DEVICE,FPP2>::update_L_first(Eigen::SparseMatrix<FPP, RowMajor> & L_vec_p, Eigen::SparseMatrix<FPP, RowMajor>& L_vec_q, const FPP2& c, const FPP2& s, int p, int q, Faust::MatSparse<FPP,DEVICE> & L)
{
Eigen::SparseMatrix<FPP, RowMajor> tmp, tmp2, tmp3;
/*========== L = S'*L */
L_vec_p = L.mat.innerVector(p), L_vec_q = L.mat.innerVector(q);
// L_vec_p = L.mat.block(p, 0, 1, L.getNbCol());
// L_vec_q = L.mat.block(q, 0, 1, L.getNbCol());
// L(p,:) = c*L(p,:) + s*L(q,:)
tmp = L_vec_p;
tmp *= c;
tmp2 = L_vec_q;
tmp2 *= s;
tmp += tmp2;
L.mat.innerVector(p) = tmp;
//
//
// L(q,:) = -s*L(p,:) + c*L(q,:)
tmp = L_vec_p;
tmp *= -s;
tmp2 = L_vec_q;
tmp2 *= c;
tmp += tmp2;
L.mat.innerVector(q) = tmp;
}
template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFT<FPP,DEVICE,FPP2>::update_D()
{
......
@@ -239,8 +239,72 @@ template<typename FPP, Device DEVICE, typename FPP2>
void GivensFGFTParallel<FPP,DEVICE,FPP2>::update_L(Faust::MatSparse<FPP,Cpu> & L)
{
// L = S'*L*S
#ifdef OPT_UPDATE_SPARSE_L
int choice_id;
FPP2 c,s;
int p, q, i;
// L = S'*L
// HINT to enable OpenMP: add the -fopenmp flag (to the compilation and linker flags in CMakeCache)
// likewise in setup.py for the python wrapper (in the extra_compile_args and extra_link_args lists -- setuptools)
// NOTE: the OpenMP attempt was not really useful: no speedup was observed with two or four threads (running on a 4-core CPU)
// The code is nevertheless kept here, just in case; to use it, define the compilation/preprocessor constant OPT_UPDATE_L_OMP
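// Illustrative sketch of those flags (hypothetical values, adapt to the actual build files):
//   CMakeCache.txt: CMAKE_CXX_FLAGS:STRING=-fopenmp and CMAKE_EXE_LINKER_FLAGS:STRING=-fopenmp
//   setup.py:       Extension(..., extra_compile_args=['-fopenmp'], extra_link_args=['-fopenmp'])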
#ifdef OPT_UPDATE_L_OMP
#pragma omp parallel private(c, s, choice_id, p, q, i)
#endif
{
Eigen::SparseMatrix<FPP, Eigen::RowMajor> L_vec_p, L_vec_q;
#ifdef OPT_UPDATE_L_OMP
int num_per_th = fact_nrots/omp_get_num_threads();
int th_id = omp_get_thread_num();
int th_offset = fact_nrots/omp_get_num_threads()*th_id;
if(th_id == omp_get_num_threads() - 1) num_per_th += fact_nrots % omp_get_num_threads();
#else
int num_per_th = fact_nrots; // a single thread is used (no openmp)
int th_id = 0;
int th_offset = 0;
#endif
//first attempt to parallelize the loops with OpenMP (abandoned in favor of the single parallel section started above)
//#ifdef OPT_UPDATE_L_OMP
//#pragma omp parallel for schedule(static) private(c, s, choice_id, p, q, i) default(shared) // num_threads(4)
//#endif OPT_UPDATE_L_OMP
for(i=th_offset; i < th_offset+num_per_th; i++)
{
// the last rotation submatrix is applied first
choice_id = this->coord_choices.size()-1-i;
c = *(this->fact_mod_values.end()-1-4*i); // cos(theta)
s = *(this->fact_mod_values.end()-2*(2*i+1)); // sin(theta)
p = this->coord_choices[choice_id].first;
q = this->coord_choices[choice_id].second;
//rely on the parent class to do the job
this->update_L_first(L_vec_p, L_vec_q, c, s, p, q, L);
}
#ifdef OPT_UPDATE_L_OMP
#pragma omp barrier //S'*L must be computed before starting L*S computation
#endif
// L = L*S
//#ifdef OPT_UPDATE_L_OMP
//#pragma omp parallel for schedule(static) private(c, s, choice_id, p, q, i) default(shared) // num_threads(4)
//#endif
// for(i=th_offset; i < th_offset+num_per_th; i++)
// {
// // applying first the last rotation submatrix
// choice_id = this->coord_choices.size()-1-i;
// c = *(this->fact_mod_values.end()-1-4*i); // cos(theta)
// s = *(this->fact_mod_values.end()-2*(2*i+1)); // sin(theta)
// p = this->coord_choices[choice_id].first;
// q = this->coord_choices[choice_id].second;
// //rely on parent for doing the job
// this->update_L_second(L_vec_p, L_vec_q, c, s, p, q, L);
// L.update_dim();
// }
L.multiplyRight(this->facts[this->ite]);
// L.update_dim();
}
#else
// L = S'*L*S
this->facts[this->ite].multiply(L, 'T');
L.multiplyRight(this->facts[this->ite]);
#endif
}
......
@@ -100,11 +100,17 @@ namespace Faust
//! Faust::MatDense class template of dense matrix
template<typename FPP,Device DEVICE> class MatDense;
template<typename FPP, Device DEVICE, typename FPP2> class GivensFGFT;
template<typename FPP, Device DEVICE, typename FPP2> class GivensFGFTParallel;
template<typename FPP>
class MatSparse<FPP,Cpu> : public Faust::MatGeneric<FPP,Cpu>
{
friend Faust::GivensFGFT<FPP,Cpu, double>;
friend Faust::GivensFGFTParallel<FPP,Cpu, double>;
friend Faust::GivensFGFT<FPP,Cpu, float>;
friend Faust::GivensFGFTParallel<FPP,Cpu, float>;
friend Faust::TransformHelper<FPP,Cpu>; // TODO: limit to needed member functions only
friend void Faust::wht_factors<>(unsigned int n, vector<MatGeneric<FPP,Cpu>*>& factors, const bool, const bool);
friend class MatDense<FPP,Cpu>;
......