Commit 72880940 authored by hhakim

Optimize Faust::TransformHelperPoly::multiply(Vect) (F@x in Python).

Decrease the number of copies.

======== Mini benchmark
hinria …/build/wrapper/python $ for i in {1..10}; do python3 test_poly_cpp.py | grep speedup; done
speedup 90.75602805478192
speedup 89.09036938690689
speedup 82.56148989913501
speedup 90.64531438391626
speedup 91.66248597807245
speedup 90.10036697594744
speedup 75.61254492055296
speedup 89.96955925554978
speedup 95.22901711420815
speedup 90.92116442476765
hinria …/build/wrapper/python $ for i in {1..10}; do python3 test_poly_cpp.py | grep speedup; done
speedup 123.0277183801693
speedup 103.32878272246883
speedup 127.7827667275423
speedup 124.11150187961708
speedup 113.07702426409236
speedup 87.57467702373462
speedup 127.40517991813421
speedup 99.10773177189199
speedup 127.26056623369436
speedup 129.75727501516909

========

https://gitlab.inria.fr/faustgrp/faust/-/snippets/704
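
For orientation, a minimal sketch of how the optimized product could be timed from C++. This is only an illustration, not the content of test_poly_cpp.py; the function name and the reps parameter are placeholders, and it assumes a TransformHelperPoly<FPP> instance (e.g. obtained through basisChebyshev, see the header below).

// Illustrative timing harness, not part of this commit.
#include "faust_TransformHelperPoly.h"
#include <chrono>
#include <vector>

template<typename FPP>
double avg_seconds_per_product(Faust::TransformHelperPoly<FPP> &F, const FPP *x, int reps = 100)
{
	int d = F.L.getNbRow();          // L is a public member of TransformHelperPoly
	int K = F.size();                // number of factors, i.e. the basis degree
	std::vector<FPP> y(d * (K + 1)); // preallocated output: [T_0 x; ...; T_K x]
	auto t0 = std::chrono::steady_clock::now();
	for(int r = 0; r < reps; r++)
		F.multiply(x, y.data());     // pointer overload introduced by this commit
	auto t1 = std::chrono::steady_clock::now();
	return std::chrono::duration<double>(t1 - t0).count() / reps;
}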
parent 32d965ae
faust_TransformHelperPoly.h:

...
@@ -19,8 +19,10 @@ namespace Faust
 			const FPP lambda_, const bool optimizedCopy, const bool cloning_fact,
 			const bool internal_call) : TransformHelper<FPP,Cpu>(facts, lambda_, optimizedCopy, cloning_fact, internal_call) {}
-		Vect<FPP,Cpu> multiply(const Vect<FPP,Cpu> x, const bool transpose=false, const bool conjugate=false);
-		MatDense<FPP, Cpu> multiply(MatDense<FPP,Cpu> X, const bool transpose=false, const bool conjugate=false);
+		Vect<FPP,Cpu> multiply(const Vect<FPP,Cpu> &x, const bool transpose=false, const bool conjugate=false);
+		Vect<FPP,Cpu> multiply(const FPP* x, const bool transpose=false, const bool conjugate=false);
+		void multiply(const FPP* x, FPP* y, const bool transpose=false, const bool conjugate=false);
+		MatDense<FPP, Cpu> multiply(const MatDense<FPP,Cpu> &X, const bool transpose=false, const bool conjugate=false);
 		Vect<FPP, Cpu> poly(MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs);
 		MatDense<FPP, Cpu> poly(int n, MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs);
 		void poly(int d, int n, const FPP* basisX, const FPP* coeffs, FPP* out);
...
faust_TransformHelperPoly.h (full file, before this commit):

#ifndef __FAUST_TRANSFORM_HELPER_POLY__
#define __FAUST_TRANSFORM_HELPER_POLY__

#include "faust_TransformHelper.h"

namespace Faust
{
	/**
	 * \brief This class aims to represent a Chebyshev polynomial basis as a "Faust".
	 *
	 * It overrides some operations to optimize the performance by taking the factors' specific structure into account.
	 */
	template<typename FPP>
	class TransformHelperPoly : public TransformHelper<FPP,Cpu>
	{
		public:
			MatSparse<FPP, Cpu> L; // TODO: must be private (and basisChebyshev a friend of this class)
			MatSparse<FPP, Cpu> twoL; // TODO: must be private
			TransformHelperPoly(const std::vector<MatGeneric<FPP,Cpu> *>& facts,
					const FPP lambda_, const bool optimizedCopy, const bool cloning_fact,
					const bool internal_call) : TransformHelper<FPP,Cpu>(facts, lambda_, optimizedCopy, cloning_fact, internal_call) {}
			Vect<FPP,Cpu> multiply(const Vect<FPP,Cpu> x, const bool transpose=false, const bool conjugate=false);
			MatDense<FPP, Cpu> multiply(MatDense<FPP,Cpu> X, const bool transpose=false, const bool conjugate=false);
			Vect<FPP, Cpu> poly(MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs);
			MatDense<FPP, Cpu> poly(int n, MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs);
			void poly(int d, int n, const FPP* basisX, const FPP* coeffs, FPP* out);
			TransformHelper<FPP, Cpu>* polyFaust(const FPP* coeffs);
	};

	template<typename FPP>
	TransformHelper<FPP, Cpu>* basisChebyshev(MatSparse<FPP,Cpu>* L, int32_t K);

	template<typename FPP>
	void poly(int d, int K, int n, const FPP* basisX, const FPP* coeffs, FPP* out);
}

#include "faust_TransformHelperPoly.hpp"

#endif
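
Putting the declarations above together with the overloads added by this commit, here is a hedged sketch of evaluating p(L)x = sum_k coeffs[k] * T_k(L) x for a single vector. eval_poly_on_vector is a made-up helper name, and buffer sizes follow the d*(K+1) layout used by the class.

// Sketch only: compute basisX = [T_0 x; ...; T_K x] with the pointer-based
// multiply added by this commit, then let poly() form the linear combination.
#include "faust_TransformHelperPoly.h"
#include <vector>

template<typename FPP>
std::vector<FPP> eval_poly_on_vector(Faust::TransformHelperPoly<FPP> &F,
		const FPP *coeffs /* K+1 entries */, const FPP *x /* d entries */)
{
	int d = F.L.getNbRow();
	int K = F.size();
	std::vector<FPP> basisX(d * (K + 1)), out(d);
	F.multiply(x, basisX.data());                        // basisX = [T_0 x; ...; T_K x]
	F.poly(d, /* n (columns) */ 1, basisX.data(), coeffs, out.data());
	return out;                                          // out = p(L) x
}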
faust_TransformHelperPoly.hpp:

...
@@ -2,11 +2,25 @@
 #include "faust_linear_algebra.h"
 namespace Faust
 {
+	template<typename FPP>
+		Vect<FPP,Cpu> TransformHelperPoly<FPP>::multiply(const Vect<FPP,Cpu> &x, const bool transpose/*=false*/, const bool conjugate/*=false*/)
+		{
+			return std::move(this->multiply(x.getData(), transpose, conjugate));
+		}
 	template<typename FPP>
-		Vect<FPP,Cpu> TransformHelperPoly<FPP>::multiply(const Vect<FPP,Cpu> x, const bool transpose/*=false*/, const bool conjugate/*=false*/)
+		Vect<FPP,Cpu> TransformHelperPoly<FPP>::multiply(const FPP* x, const bool transpose/*=false*/, const bool conjugate/*=false*/)
 		{
+			int d = L.getNbRow();
+			int K = this->size();
+			Vect<FPP, Cpu> v0(d*(K+1));
+			multiply(x, v0.getData(), transpose, conjugate);
+			return std::move(v0);
+		}
+	template<typename FPP>
+		void TransformHelperPoly<FPP>::multiply(const FPP* x, FPP* y, const bool transpose/*=false*/, const bool conjugate/*=false*/)
+		{
 //			std::cout << "TransformHelperPoly<FPP>::multiply(Vect)" << std::endl;
 			/**
 			 * Recurrence relation (k=1 to K):
@@ -22,25 +36,22 @@ namespace Faust
 			 * Fx = [v_{0, K} ; v_{1, K} ; v_{2, K}]
 			 */
 			int d = L.getNbRow();
-			Vect<FPP,Cpu> v1(d), v2, tmp;
+			Vect<FPP,Cpu> v2, tmp;
 			int K = this->size();
-			Vect<FPP, Cpu> v0(d*(K+1));
 			// seeing how basisChebyshev method is instantiating this class
 			// K can't be strictly lower than two
-			memcpy(v1.getData(), x.getData(), sizeof(FPP)*d);
-			v2 = L*x;
-			for(int i=2;i<=K;i++)
+			v2 = L*Vect<FPP,Cpu>(d, x); // L*x
+			memcpy(y, x, sizeof(FPP)*d);
+			memcpy(y+d, v2.getData(), sizeof(FPP)*d);
+			for(int i=3;i<=K+1;i++)
 			{
-				memcpy(v0.getData()+d*(i-2), v1.getData(), sizeof(FPP)*d);
-				tmp = v1;
-				v1 = v2;
+				int offset = d*(i-3);
 //				gemv(L, v2, tmp, FPP(1), FPP(-1));
-				v2 = twoL*v2;
-				v2 -= tmp;
+//				v2 = twoL*v2;
+				v2.multiplyLeft(twoL);
+				v2 -= y+offset;
+				memcpy(y+d*(i-1), v2.getData(), sizeof(FPP)*d);
 			}
-			memcpy(v0.getData()+d*(K-1), v1.getData(), sizeof(FPP)*d);
-			memcpy(v0.getData()+d*K, v2.getData(), sizeof(FPP)*d);
-			return v0;
 		}
 	// Slower method but kept commented here until further tests
@@ -80,7 +91,7 @@ namespace Faust
 //	}
 	template<typename FPP>
-	MatDense<FPP, Cpu> TransformHelperPoly<FPP>::multiply(MatDense<FPP,Cpu> X, const bool transpose/*=false*/, const bool conjugate/*=false*/)
+	MatDense<FPP, Cpu> TransformHelperPoly<FPP>::multiply(const MatDense<FPP,Cpu> &X, const bool transpose/*=false*/, const bool conjugate/*=false*/)
 	{
 //		std::cout << "TransformHelperPoly<FPP>::multiply(MatDense)" << std::endl;
 		int d = L.getNbRow();
@@ -227,7 +238,9 @@ namespace Faust
 		{
 			Vect<FPP,Cpu> _coeffs(K_plus_1, coeffs);
 			MatDense<FPP,Cpu> basisXi(d, K_plus_1);
+			// reshape a d_K_plus_1 vector into a d*K_plus_1 matrix
 			memcpy(basisXi.getData(), basisX+d_K_plus_1*i, sizeof(FPP)*d_K_plus_1);
+			// compute the linear combination
 			_coeffs.multiplyLeft(basisXi);
 			memcpy(out+i*d, _coeffs.getData(), sizeof(FPP)*d);
 		}
...
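
To make the new in-place loop concrete, here is a dependency-free sketch of the same block recurrence with a dense stand-in for L (the committed code uses MatSparse, Vect and the cached twoL = 2L). The point is the output layout: block k, i.e. y + k*d, receives T_k(L)x, so no intermediate vector has to be copied back at the end.

// Illustration only: same recurrence as the in-place multiply above,
// written against plain std::vector to show the memory layout of y.
#include <cstring>
#include <vector>

void chebyshev_blocks(const std::vector<std::vector<double>> &L, // d x d, symmetric
		const double *x, int K, double *y /* d*(K+1) entries */)
{
	int d = (int) L.size();
	auto matvec = [&](const double *v, double *w) {
		for(int r = 0; r < d; r++)
		{
			double s = 0;
			for(int c = 0; c < d; c++)
				s += L[r][c] * v[c];
			w[r] = s;
		}
	};
	std::vector<double> tmp(d);
	std::memcpy(y, x, sizeof(double) * d);  // block 0: T_0(L)x = x
	matvec(x, y + d);                       // block 1: T_1(L)x = L x
	for(int k = 2; k <= K; k++)             // block k: T_k(L)x = 2L T_{k-1}(L)x - T_{k-2}(L)x
	{
		matvec(y + d * (k - 1), tmp.data());
		for(int r = 0; r < d; r++)
			y[d * k + r] = 2.0 * tmp[r] - y[d * (k - 2) + r];
	}
}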
faust_TransformHelperPoly.hpp (full file, before this commit):

#include <cstdlib>
#include "faust_linear_algebra.h"
namespace Faust
{
template<typename FPP>
Vect<FPP,Cpu> TransformHelperPoly<FPP>::multiply(const Vect<FPP,Cpu> x, const bool transpose/*=false*/, const bool conjugate/*=false*/)
{
// std::cout << "TransformHelperPoly<FPP>::multiply(Vect)" << std::endl;
/**
* Recurrence relation (k=1 to K):
* v_{0,k} := [v_{0,k-1} ; v_{1, k-1} ] // concatenation
* v_{1,k} := v_{2,k-1}
* v_{2,k} := 2Lv_{2,k-1} - v_{1,k-1}
*
* First terms:
* v_{0,1} = []
* v_{1,1} = x
* v_{2,1} = Lx
*
* Fx = [v_{0, K} ; v_{1, K} ; v_{2, K}]
*/
int d = L.getNbRow();
Vect<FPP,Cpu> v1(d), v2, tmp;
int K = this->size();
Vect<FPP, Cpu> v0(d*(K+1));
// seeing how basisChebyshev method is instantiating this class
// K can't be strictly lower than two
memcpy(v1.getData(), x.getData(), sizeof(FPP)*d);
v2 = L*x;
for(int i=2;i<=K;i++)
{
memcpy(v0.getData()+d*(i-2), v1.getData(), sizeof(FPP)*d);
tmp = v1;
v1 = v2;
// gemv(L, v2, tmp, FPP(1), FPP(-1));
v2 = twoL*v2;
v2 -= tmp;
}
memcpy(v0.getData()+d*(K-1), v1.getData(), sizeof(FPP)*d);
memcpy(v0.getData()+d*K, v2.getData(), sizeof(FPP)*d);
return v0;
}
// Slower method but kept commented here until further tests
// template<typename FPP>
// MatDense<FPP, Cpu> TransformHelperPoly<FPP>::multiply(MatDense<FPP,Cpu> X, const bool transpose/*=false*/, const bool conjugate/*=false*/)
// {
//// std::cout << "TransformHelperPoly<FPP>::multiply(MatDense)" << std::endl;
// int d = L.getNbRow();
// int n = X.getNbCol();
// int K = this->size();
// MatDense<FPP, Cpu> V0(d*(K+1), n);
// MatDense<FPP,Cpu> V1(d, n), V2 = X, tmp;
// memcpy(V1.getData(), X.getData(), sizeof(FPP)*d*n);
// L.multiply(V2, 'N');
// for(int i=2;i<=K;i++)
// {
// memcpy(V0.getData()+d*n*(i-2), V1.getData(), sizeof(FPP)*d*n);
// tmp = V1;
// V1 = V2;
// twoL.multiply(V2, 'N');
// V2 -= tmp;
//// gemm(twoL, V2, tmp, (FPP)1.0, (FPP)-1.0, 'N', 'N');
//// V2 = tmp;
// }
// memcpy(V0.getData()+d*n*(K-1), V1.getData(), sizeof(FPP)*d*n);
// memcpy(V0.getData()+d*n*K, V2.getData(), sizeof(FPP)*d*n);
// // put blocks in proper order
// MatDense<FPP,Cpu> V0_ord(d*(K+1), n);
// #pragma omp parallel for
// for(int i=0;i<(K+1);i++)
// {
// auto i_offset = i*n*d;
// for(int j=0;j<n;j++)
// memcpy(V0_ord.getData()+(K+1)*d*j+d*i, V0.getData()+j*d+i_offset, sizeof(FPP)*d);
// }
// return V0_ord;
// }
template<typename FPP>
MatDense<FPP, Cpu> TransformHelperPoly<FPP>::multiply(MatDense<FPP,Cpu> X, const bool transpose/*=false*/, const bool conjugate/*=false*/)
{
// std::cout << "TransformHelperPoly<FPP>::multiply(MatDense)" << std::endl;
int d = L.getNbRow();
int K = this->size();
int n = X.getNbCol();
MatDense<FPP,Cpu> V0_ord(d*(K+1), n);
auto scale = (K+1)*d;
#pragma omp parallel for
for(int i=0;i<n;i++)
{
Vect<FPP,Cpu> x(d, X.getData()+i*d);
auto y = multiply(x);
memcpy(V0_ord.getData()+scale*i, y.getData(), sizeof(FPP)*scale);
}
return V0_ord;
}
template<typename FPP>
Vect<FPP, Cpu> TransformHelperPoly<FPP>::poly(MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs)
{
coeffs.multiplyLeft(basisX);
return coeffs;
}
template<typename FPP>
MatDense<FPP, Cpu> TransformHelperPoly<FPP>::poly(int n, MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs)
{
int d = L.getNbRow();
MatDense<FPP,Cpu> Y(d, n);
this->poly(d, n, basisX.getData(), coeffs.getData(), Y.getData());
return Y;
}
template<typename FPP>
void TransformHelperPoly<FPP>::poly(int d, int n, const FPP* basisX, const FPP* coeffs, FPP* out)
{
int K = this->size();
Faust::poly(d, K, n, basisX, coeffs, out);
}
template<typename FPP>
TransformHelper<FPP, Cpu>* TransformHelperPoly<FPP>::polyFaust(const FPP* coeffs)
{
// Id = sp.eye(L.shape[1], format="csr")
// scoeffs = sp.hstack(tuple(Id*coeffs[i] for i in range(0, K+1)),
// format="csr")
MatSparse<FPP,Cpu> Id, Id1;
MatSparse<FPP,Cpu> coeffDiags;
auto d = L.getNbRow();
int K = this->size();
Id.resize(d, d, d);
Id.setEyes();
auto Id0 = Id;
Id0 *= coeffs[0];
// K >= 2
for(int i=1;i < K+1; i++)
{
Id1 = Id;
Id1 *= coeffs[i];
MatSparse<FPP,Cpu> coeffDiags;
coeffDiags.hstack(Id0, Id1);
Id0 = coeffDiags;
}
std::vector<MatGeneric<FPP,Cpu>*> facts(K+1);
facts.resize(K+1);
facts[0] = new MatSparse<FPP,Cpu>(Id0);
for(int i=1;i<=K;i++)
{
facts[i] = this->get_gen_fact_nonconst(i-1);
}
return new TransformHelper<FPP,Cpu>(facts, (FPP) 1.0,
/* optimizedCopy */ false,
/* cloning_fact */ false,
/* internal_call */ true);
}
template<typename FPP>
TransformHelper<FPP,Cpu>* basisChebyshev(MatSparse<FPP,Cpu>* L, int32_t K)
{
// assuming L is symmetric
MatSparse<FPP,Cpu> *T1, *T2;
auto d = L->getNbRow();
MatSparse<FPP,Cpu> Id, twoL, minus_Id, rR, R, zero;
std::vector<MatGeneric<FPP,Cpu>*> facts(K);
if(K == 0)
return TransformHelper<FPP,Cpu>::eyeFaust(d,d);
facts.resize(K); //normally it'd be K+1 but T0 is ignored
// build the chebyshev polynomials by factor
// K > 1 ignore the first one (for T0 0-degree), which is the identity
// Identity
Id.resize(d, d, d);
Id.setEyes();
// T1
T1 = new MatSparse<FPP,Cpu>();
T1->vstack(Id, *L);
facts[K-1] = T1;
if(K == 1)
return new TransformHelper<FPP,Cpu>(facts, (FPP)1.0,
/* optimizedCopy */ false,
/* cloning_fact */ false,
/* internal_call */ true);
// rR block
twoL = *L;
twoL *= FPP(2);
minus_Id.resize(d,d,d);
minus_Id.setEyes();
minus_Id *= FPP(-1);
rR.hstack(minus_Id, twoL);
// T2
Id.resize(2*d, 2*d, 2*d);
Id.setEyes();
T2 = new MatSparse<FPP,Cpu>();
T2->vstack(Id, rR);
facts[K-2] = T2;
// T3 to TK
#pragma omp parallel for private(Id, zero, R)
for(int i=3;i<K+1;i++)
{
Id.resize(i*d, i*d, i*d);
Id.setEyes();
zero.resize(0, d, (i-2)*d);
R.hstack(zero, rR);
auto Ti = new MatSparse<FPP,Cpu>();
Ti->vstack(Id, R);
facts[K-i] = Ti;
}
auto basisP = new TransformHelperPoly<FPP>(facts, (FPP)1.0,
/* optimizedCopy */ false,
/* cloning_fact */ false,
/* internal_call */ true);
basisP->L = *L;
basisP->twoL = *L;
basisP->twoL *= 2;
return basisP;
}
template<typename FPP>
void poly(int d, int K, int n, const FPP* basisX, const FPP* coeffs, FPP* out)
{
auto K_plus_1 = K+1;
auto d_K_plus_1 = d*K_plus_1;
#pragma omp parallel for
for(int i=0;i<n;i++)
{
Vect<FPP,Cpu> _coeffs(K_plus_1, coeffs);
MatDense<FPP,Cpu> basisXi(d, K_plus_1);
memcpy(basisXi.getData(), basisX+d_K_plus_1*i, sizeof(FPP)*d_K_plus_1);
_coeffs.multiplyLeft(basisXi);
memcpy(out+i*d, _coeffs.getData(), sizeof(FPP)*d);
}
}
}
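
Reading off the vstack/hstack calls in basisChebyshev above, the sparse factor built at step i (the code's Ti, with an empty zero block when i = 2) has the structure below. This is a reconstruction for readability, written in LaTeX, with F_i denoting the factor and T_k the Chebyshev polynomials.

% Structure of the factors assembled by basisChebyshev (reconstruction):
\[
F_1 = \begin{pmatrix} I_d \\ L \end{pmatrix},
\qquad
F_i = \begin{pmatrix} I_{id} \\ \begin{matrix} 0_{d\times(i-2)d} & -I_d & 2L \end{matrix} \end{pmatrix}
\in \mathbb{R}^{(i+1)d \times id}, \quad i \ge 2.
\]
% Since facts[K-i] stores F_i, the resulting Faust applies F_K \cdots F_1, so
\[
F_K F_{K-1} \cdots F_1\, x
= \begin{pmatrix} T_0(L)\,x \\ T_1(L)\,x \\ \vdots \\ T_K(L)\,x \end{pmatrix},
\qquad T_k(L) = 2L\,T_{k-1}(L) - T_{k-2}(L),\; T_1(L) = L,\; T_0(L) = I_d.
\]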