Commit 1a646960 authored by hhakim


Add AVX, OpenMP SIMD, and Eigen (template expressions) to the tested ways of computing the Faust DFT-vector product using the diagonal optimization.

Issue #275.
parent 47f44bfe
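For context, every variant added by this commit implements the same diagonal optimization per butterfly factor; here is a minimal standalone sketch (illustrative only: butterfly_diag_prod and CVec are made-up names, while d1, d2 and subdiag_ids mirror the members of ButterflyMat in the diff below):

#include <Eigen/Core>
#include <complex>
#include <vector>

using CVec = Eigen::Matrix<std::complex<double>, Eigen::Dynamic, 1>;

// z = d1 .* x + d2 .* x[subdiag_ids]: each butterfly factor reduces to two
// element-wise products, one of them against a gathered copy of x, whatever
// the backend (plain loop, AVX intrinsics, OpenMP SIMD or Eigen expressions).
CVec butterfly_diag_prod(const CVec &d1, const CVec &d2, const CVec &x,
                         const std::vector<int> &subdiag_ids)
{
    CVec z(x.rows());
    for (Eigen::Index i = 0; i < x.rows(); i++)
        z[i] = d1[i] * x[i] + d2[i] * x[subdiag_ids[i]];
    return z;
}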
@@ -4,6 +4,14 @@
#include <vector>
#include <ctime>
#include <chrono>
#include <fstream>
#include <cstdio>
#include <iostream>
#include "Eigen/Core"
#include <complex>
#include <string>
typedef @TEST_FPP@ FPP;
using namespace Faust;
@@ -19,9 +27,170 @@ using Vec = Eigen::Matrix<T, Eigen::Dynamic, 1>;
template<typename T>
using DenseMatMap = Eigen::Map<DenseMat<T>>;
using CplxVec = Matrix<complex<double>, Eigen::Dynamic, 1>;
/************************************* avx*/
#ifdef USE_AVX // requires compiling with -mavx -mavx2 -mfma
#include <immintrin.h>
using namespace Eigen;
using namespace std;
using DblVec = Matrix<double, Eigen::Dynamic, 1>;
using IntVec = Matrix<int, Eigen::Dynamic, 1>;
using CplxVecMap = Map<CplxVec>;
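// avx_cplx_add: adds two complex<double> vectors two complex values (4 doubles)
// at a time with 256-bit AVX registers; _mm256_load_pd assumes 32-byte-aligned
// Eigen buffers and the loop assumes an even vector size.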
CplxVec avx_cplx_add(CplxVec &vec1, CplxVec &vec2)
{
__m256d v1, v2, v3;
size_t size = vec1.rows();
CplxVec vec3(size);
for(int i=0; i<size; i +=2)
{
v1 = _mm256_load_pd(reinterpret_cast<double*>(vec1.data()+i));
v2 = _mm256_load_pd(reinterpret_cast<double*>(vec2.data()+i));
v3 = _mm256_add_pd(v1, v2);
memcpy(vec3.data()+i, reinterpret_cast<complex<double>*>(&v3), 2*sizeof(complex<double>));
}
return vec3;
}
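// avx_cplx_mul (register version): packed product of two complex<double> pairs.
// For (a+bi)*(c+di) it forms [a*c, b*d] and [a*d, -b*c], then _mm256_hsub_pd
// yields [a*c - b*d, a*d + b*c]; note that v2 is overwritten with the result.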
__m256d avx_cplx_mul(__m256d &v1, __m256d& v2)
{
__m256d v3, v4;
static __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
v3 = _mm256_mul_pd(v1, v2);
v2 = _mm256_permute_pd(v2, 0x5);
v2 = _mm256_mul_pd(v2, neg);
v4 = _mm256_mul_pd(v1, v2);
v2 = _mm256_hsub_pd(v3, v4);
return v2;
}
CplxVec avx_cplx_mul(CplxVec &vec1, CplxVec &vec2)
{
__m256d v1, v2;
size_t size = vec1.rows();
CplxVec vec3(size);
for(int i=0; i<size; i +=2)
{
v1 = _mm256_load_pd(reinterpret_cast<double*>(vec1.data()+i));
v2 = _mm256_load_pd(reinterpret_cast<double*>(vec2.data()+i));
v2 = avx_cplx_mul(v1, v2);
memcpy(vec3.data()+i, reinterpret_cast<complex<double>*>(&v2), 2*sizeof(complex<double>));
}
return vec3;
}
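// avx_cplx_mul2: alternative element-wise complex product that first splits both
// inputs into real and imaginary planes, then uses FMA (_mm256_fmsub_pd /
// _mm256_fmadd_pd) to compute four products per iteration before re-interleaving
// them into vec3; the loop assumes a size that is a multiple of 4.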
CplxVec avx_cplx_mul2(CplxVec &vec1, CplxVec &vec2)
{
// de-interleave input vectors
DblVec vec1_r = vec1.real();
DblVec vec1_i = vec1.imag();
DblVec vec2_r = vec2.real();
DblVec vec2_i = vec2.imag();
__m256d v1r, v2r, v1i, v2i, tmp, res;
size_t size = vec2.rows();
CplxVec vec3(size);
for(int i=0; i<size; i += 4)
{
v1i = _mm256_load_pd(vec1_i.data()+i);
v2i = _mm256_load_pd(vec2_i.data()+i);
v1r = _mm256_load_pd(vec1_r.data()+i);
v2r = _mm256_load_pd(vec2_r.data()+i);
//compute real part
tmp = _mm256_mul_pd(v1i, v2i);
res = _mm256_fmsub_pd(v1r, v2r, tmp); // real part
double *resptr = (double*) &res;
double* vec3ptr = reinterpret_cast<double*>(vec3.data()+i);
// copy real part into vec3
vec3ptr[0] = resptr[0];
vec3ptr[2] = resptr[1];
vec3ptr[4] = resptr[2];
vec3ptr[6] = resptr[3];
//compute imag part
tmp = _mm256_mul_pd(v1r, v2i);
res = _mm256_fmadd_pd(v1i, v2r, tmp);
resptr = (double*) &res;
// copy imag part into vec3
vec3ptr[1] = resptr[0];
vec3ptr[3] = resptr[1];
vec3ptr[5] = resptr[2];
vec3ptr[7] = resptr[3];
}
return vec3;
}
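// avx_cplx_diag_prod: AVX version of the diagonal optimization, computing
// d1 .* x + d2 .* x[x_ids] two complex entries per iteration; the gathered
// x[x_ids] values are copied to a small stack buffer and read back with an
// unaligned load since they are not contiguous in memory.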
CplxVec avx_cplx_diag_prod(const CplxVec &d1, const CplxVec &d2, const CplxVec &x, const int* x_ids)
{
__m256d v1, v2, v3, v4;
size_t size = d1.rows();
CplxVec vec3(size);
double sub_x_ids[4];
double *ptr;
for(int i=0; i<size; i +=2)
{
// compute y = d2*x[x_ids]
v1 = _mm256_load_pd(reinterpret_cast<const double*>(d2.data()+i));
ptr = (double*) (x.data()+x_ids[i]);
sub_x_ids[0] = ptr[0];
sub_x_ids[1] = ptr[1];
ptr = (double*) (x.data()+x_ids[i+1]);
sub_x_ids[2] = ptr[0];
sub_x_ids[3] = ptr[1];
v2 = _mm256_loadu_pd(sub_x_ids); //unaligned
v2 = avx_cplx_mul(v1, v2);
// compute z = d1*x
v3 = _mm256_load_pd(reinterpret_cast<const double*>(d1.data()+i));
v4 = _mm256_load_pd(reinterpret_cast<const double*>(x.data()+i));
v4 = avx_cplx_mul(v3, v4);
// y + z
v2 = _mm256_add_pd(v2, v4);
memcpy(vec3.data()+i, reinterpret_cast<complex<double>*>(&v2), 2*sizeof(complex<double>));
}
return vec3;
}
#endif // USE_AVX
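// eigen_cplx_diag_prod: the same diagonal optimization expressed with Eigen array
// (template expression) arithmetic; only the gather x[x_ids] needs an explicit loop.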
CplxVec eigen_cplx_diag_prod(const CplxVec &d1, const CplxVec &d2, const CplxVec &x, const int* x_ids)
{
CplxVec z(d1.rows());
for(int i=0;i < d1.rows(); i++)
z[i] = x[x_ids[i]];
z = d2.array() * z.array() + d1.array() * x.array();
return z;
}
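// ButterflyMat: one butterfly factor stored as two diagonals (d1, d2) plus the
// index map subdiag_ids used to gather the second operand of the product.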
class ButterflyMat
{
protected:
Vec<FPP> d1;
Vec<FPP> d2;
vector<int> subdiag_ids;
@@ -64,18 +233,63 @@ class ButterflyMat
this->level = level;
}
Vec<FPP> multiply(Vec<FPP>& x) const
virtual Vec<FPP> multiply(Vec<FPP>& x) const
{
Vec<FPP> z(x.rows());
//auto y = x(subdiag_ids, Eigen::placeholders::all).array();
#pragma omp parallel for
// 1st meth
// #pragma omp parallel for
for(int i=0;i < x.rows(); i++)
z[i] = d1[i] * x[i] + d2[i] * x[subdiag_ids[i]];
// z[i] = d1[i] * x[i] + d2[i] * y[i];
// z = d1.array() * x.array() + d2.array() * x(subdiag_ids, Eigen::placeholders::all).array();
z[i] = d1[i] * x[i] + d2[i] * x[subdiag_ids[i]];
return z;
}
};
#ifdef USE_AVX
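// ButterflyMatAVX: same factor, but multiply() delegates to the hand-written
// AVX kernel avx_cplx_diag_prod defined above.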
class ButterflyMatAVX: public ButterflyMat
{
virtual Vec<FPP> multiply(Vec<FPP>& x) const
{
return avx_cplx_diag_prod(d1, d2, x, &subdiag_ids[0]);
}
public:
ButterflyMatAVX(const MatSparse<FPP, Cpu> &factor, int level) : ButterflyMat(factor, level) {}
};
#endif
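// ButterflyMatEig: multiply() delegates to the Eigen template-expression kernel.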
class ButterflyMatEig: public ButterflyMat
{
virtual Vec<FPP> multiply(Vec<FPP>& x) const
{
return eigen_cplx_diag_prod(d1, d2, x, &subdiag_ids[0]);
}
public:
ButterflyMatEig(const MatSparse<FPP, Cpu> &factor, int level) : ButterflyMat(factor, level) {}
};
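// ButterflyMatOMPSIMD: keeps the explicit element loop but asks OpenMP to
// vectorize it with the for simd construct.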
class ButterflyMatOMPSIMD: public ButterflyMat
{
virtual Vec<FPP> multiply(Vec<FPP>& x) const
{
Vec<FPP> z(x.rows());
#pragma omp for simd
for(int i=0;i < x.rows(); i++)
z[i] = d1[i] * x[i] + d2[i] * x[subdiag_ids[i]];
return z;
}
public:
ButterflyMatOMPSIMD(const MatSparse<FPP, Cpu> &factor, int level) : ButterflyMat(factor, level) {}
};
class ButterflyPermFaust
@@ -86,13 +300,24 @@ class ButterflyPermFaust
TransformHelper<FPP, Cpu>& csr_faust;
public:
ButterflyPermFaust(TransformHelper<FPP, Cpu>& csr_faust) : csr_faust(csr_faust)
ButterflyPermFaust(TransformHelper<FPP, Cpu>& csr_faust, const bool avx = false, const bool eigen = false, const bool omp = false) : csr_faust(csr_faust)
{
int i = 0;
assert(!avx || !eigen);
for(auto csr_fac: csr_faust)
{
if(i < csr_faust.size()-1)
opt_factors.insert(opt_factors.begin(), ButterflyMat(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
#ifdef USE_AVX
if(avx)
opt_factors.insert(opt_factors.begin(), ButterflyMatAVX(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
else
#endif
if(omp)
opt_factors.insert(opt_factors.begin(), ButterflyMatOMPSIMD(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
else if(eigen)
opt_factors.insert(opt_factors.begin(), ButterflyMatEig(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
else
opt_factors.insert(opt_factors.begin(), ButterflyMat(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
}
// set the permutation factor
auto csr_fac = *(csr_faust.end()-1);
@@ -109,22 +334,31 @@
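// multiply(): applies the bit-reversal permutation and its diagonal first, then
// the optimized butterfly factors; opt_factors is stored in reverse order, so
// the rightmost factor of the Faust is applied first.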
Vec<FPP> multiply(Vec<FPP>& x) const
{
// Vec<FPP> y = x(bitrev_perm, Eigen::placeholders::all).array();
// y = perm_d.array() * y.array();
// 1st method: rely on eigen
// Vec<FPP> y = x(bitrev_perm, Eigen::placeholders::all).array();
// y = perm_d.array() * y.array();
// 2nd method: manual eltwise prod
// 2nd method
// Vec<FPP> y(x.rows());
////#pragma omp parallel for
// for(int i=0;i < x.rows(); i++)
// y[i] = perm_d[i] * x[bitrev_perm[i]];
// 3rd method
Vec<FPP> y(x.rows());
#pragma omp parallel for
for(int i=0;i < x.rows(); i++)
y[i] = perm_d[i] * x[bitrev_perm[i]];
y[i] = x[bitrev_perm[i]];
y = perm_d.array() * y.array();
int i = csr_faust.size()-2;
for(auto fac: opt_factors)
{
y = fac.multiply(y);
y = std::move(fac.multiply(y));
i--;
}
return y;
}
};
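// The benchmark below times nsamples products for each variant (element loop,
// AVX, OpenMP SIMD, Eigen) and checks every result against the generic Faust
// DFT product within a 1e-6 tolerance.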
int main(int argc, char** argv)
{
int nsamples = 100;
@@ -142,30 +376,71 @@ int main(int argc, char** argv)
// DFT->display();
auto size = DFT->getNbRow();
ButterflyPermFaust opt_DFT(*DFT);
#ifdef USE_AVX
ButterflyPermFaust opt_DFT_avx(*DFT, true);
#endif
ButterflyPermFaust opt_DFT_eig(*DFT, false, true);
ButterflyPermFaust opt_DFT_omp(*DFT, false, false, true);
Vec<FPP> x1 = Vec<FPP>::Random(size);
// Vec<FPP> x1 = Vec<FPP>::Ones(size);
Vect<FPP, Cpu> x1_(size, x1.data());
Vec<FPP> y1;
#ifdef USE_AVX
Vec<FPP> y1, y1_avx, y1_eig, y1_omp;
#else
Vec<FPP> y1, y1_eig, y1_omp;
#endif
auto start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1 = opt_DFT.multiply(x1);
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::cout << "Optimized DFT product time:" << elapsed_seconds.count() << std::endl;
Vect<FPP, Cpu> y1_ref;
std::cout << "Optimized DFT product time (for-loop on elts):" << elapsed_seconds.count() << std::endl;
#ifdef USE_AVX
start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1_avx = opt_DFT_avx.multiply(x1);
end = std::chrono::steady_clock::now();
elapsed_seconds = end-start;
std::cout << "Optimized DFT product time (AVX):" << elapsed_seconds.count() << std::endl;
#endif
start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1_omp = opt_DFT_omp.multiply(x1);
end = std::chrono::steady_clock::now();
elapsed_seconds = end-start;
std::cout << "Optimized DFT product time (OMP SIMD):" << elapsed_seconds.count() << std::endl;
start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1_eig = opt_DFT_eig.multiply(x1);
end = std::chrono::steady_clock::now();
elapsed_seconds = end-start;
std::cout << "Optimized DFT product time (eigen):" << elapsed_seconds.count() << std::endl;
Vect<FPP, Cpu> y1_ref(size);
start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1_ref = DFT->multiply(x1_);
DFT->multiply(x1_.getData(), y1_ref.getData());
end = std::chrono::steady_clock::now();
elapsed_seconds = end-start;
std::cout << "Faust DFT product time:" << elapsed_seconds.count() << std::endl;
Vect<FPP, Cpu> y1_test(size, y1.data());
// std::cout << y1_ref.norm() << std::endl;
// std::cout << y1_test.norm() << std::endl;
// y1_ref.Display();
// std::cout << y1 << std::endl;
y1_ref -= y1_test;
assert(y1_ref.norm() < 1e-6);
std::cout << "prod tested OK" << std::endl;
#ifdef USE_AVX
auto checked_vecs = {y1, y1_eig, y1_avx, y1_omp};
#else
auto checked_vecs = {y1, y1_eig, y1_omp};
#endif
for(auto y1_: checked_vecs)
{
Vect<FPP, Cpu> y1_test(size, y1_.data());
auto err = y1_ref;
// std::cout << y1_ref.norm() << std::endl;
// std::cout << y1_test.norm() << std::endl;
// y1_ref.Display();
// std::cout << y1 << std::endl;
err -= y1_test;
assert(err.norm() < 1e-6);
}
std::cout << "all prods tested OK" << std::endl;
return EXIT_SUCCESS;
}