Commit 1a646960 authored by hhakim


Add AVX, OpenMP SIMD, and Eigen (template expressions) to the tested ways of computing the Faust DFT-vector product using the diagonal optimization.

Issue #275.
parent 47f44bfe
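For context, every variant added by this commit implements the same diagonal optimization per butterfly factor; here is a minimal standalone sketch (illustrative only: butterfly_diag_prod and CVec are made-up names, while d1, d2 and subdiag_ids mirror the members of ButterflyMat in the diff below):

#include <Eigen/Core>
#include <complex>
#include <vector>

using CVec = Eigen::Matrix<std::complex<double>, Eigen::Dynamic, 1>;

// z = d1 .* x + d2 .* x[subdiag_ids]: each butterfly factor reduces to two
// element-wise products, one of them against a gathered copy of x, whatever
// the backend (plain loop, AVX intrinsics, OpenMP SIMD or Eigen expressions).
CVec butterfly_diag_prod(const CVec &d1, const CVec &d2, const CVec &x,
                         const std::vector<int> &subdiag_ids)
{
    CVec z(x.rows());
    for (Eigen::Index i = 0; i < x.rows(); i++)
        z[i] = d1[i] * x[i] + d2[i] * x[subdiag_ids[i]];
    return z;
}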
@@ -4,6 +4,14 @@
#include <vector>
#include <ctime>
#include <chrono>
#include <fstream>
#include <cstdio>
#include <iostream>
#include "Eigen/Core"
#include <complex>
#include <string>
typedef @TEST_FPP@ FPP;
using namespace Faust;
@@ -19,9 +27,170 @@ using Vec = Eigen::Matrix<T, Eigen::Dynamic, 1>;
template<typename T>
using DenseMatMap = Eigen::Map<DenseMat<T>>;
using CplxVec = Matrix<complex<double>, Eigen::Dynamic, 1>;
/************************************* avx*/
#ifdef USE_AVX // requires compiling with -mavx -mavx2 -mfma
#include <immintrin.h>
using namespace Eigen;
using namespace std;
using DblVec = Matrix<double, Eigen::Dynamic, 1>;
using IntVec = Matrix<int, Eigen::Dynamic, 1>;
using CplxVecMap = Map<CplxVec>;
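// avx_cplx_add: adds two complex<double> vectors two complex values (4 doubles)
// at a time with 256-bit AVX registers; _mm256_load_pd assumes 32-byte-aligned
// Eigen buffers and the loop assumes an even vector size.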
CplxVec avx_cplx_add(CplxVec &vec1, CplxVec &vec2)
{
__m256d v1, v2, v3;
size_t size = vec1.rows();
CplxVec vec3(size);
for(int i=0; i<size; i +=2)
{
v1 = _mm256_load_pd(reinterpret_cast<double*>(vec1.data()+i));
v2 = _mm256_load_pd(reinterpret_cast<double*>(vec2.data()+i));
v3 = _mm256_add_pd(v1, v2);
memcpy(vec3.data()+i, reinterpret_cast<complex<double>*>(&v3), 2*sizeof(complex<double>));
}
return vec3;
}
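// avx_cplx_mul (register version): packed product of two complex<double> pairs.
// For (a+bi)*(c+di) it forms [a*c, b*d] and [a*d, -b*c], then _mm256_hsub_pd
// yields [a*c - b*d, a*d + b*c]; note that v2 is overwritten with the result.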
__m256d avx_cplx_mul(__m256d &v1, __m256d& v2)
{
__m256d v3, v4;
static __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
v3 = _mm256_mul_pd(v1, v2);
v2 = _mm256_permute_pd(v2, 0x5);
v2 = _mm256_mul_pd(v2, neg);
v4 = _mm256_mul_pd(v1, v2);
v2 = _mm256_hsub_pd(v3, v4);
return v2;
}
CplxVec avx_cplx_mul(CplxVec &vec1, CplxVec &vec2)
{
__m256d v1, v2;
size_t size = vec1.rows();
CplxVec vec3(size);
for(int i=0; i<size; i +=2)
{
v1 = _mm256_load_pd(reinterpret_cast<double*>(vec1.data()+i));
v2 = _mm256_load_pd(reinterpret_cast<double*>(vec2.data()+i));
v2 = avx_cplx_mul(v1, v2);
memcpy(vec3.data()+i, reinterpret_cast<complex<double>*>(&v2), 2*sizeof(complex<double>));
}
return vec3;
}
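// avx_cplx_mul2: alternative element-wise complex product that first splits both
// inputs into real and imaginary planes, then uses FMA (_mm256_fmsub_pd /
// _mm256_fmadd_pd) to compute four products per iteration before re-interleaving
// them into vec3; the loop assumes a size that is a multiple of 4.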
CplxVec avx_cplx_mul2(CplxVec &vec1, CplxVec &vec2)
{
// de-interleave input vectors
DblVec vec1_r = vec1.real();
DblVec vec1_i = vec1.imag();
DblVec vec2_r = vec2.real();
DblVec vec2_i = vec2.imag();
__m256d v1r, v2r, v1i, v2i, tmp, res;
size_t size = vec2.rows();
CplxVec vec3(size);
for(int i=0; i<size; i += 4)
{
v1i = _mm256_load_pd(vec1_i.data()+i);
v2i = _mm256_load_pd(vec2_i.data()+i);
v1r = _mm256_load_pd(vec1_r.data()+i);
v2r = _mm256_load_pd(vec2_r.data()+i);
//compute real part
tmp = _mm256_mul_pd(v1i, v2i);
res = _mm256_fmsub_pd(v1r, v2r, tmp); // real part
double *resptr = (double*) &res;
double* vec3ptr = reinterpret_cast<double*>(vec3.data()+i);
// copy real part into vec3
vec3ptr[0] = resptr[0];
vec3ptr[2] = resptr[1];
vec3ptr[4] = resptr[2];
vec3ptr[6] = resptr[3];
//compute imag part
tmp = _mm256_mul_pd(v1r, v2i);
res = _mm256_fmadd_pd(v1i, v2r, tmp);
resptr = (double*) &res;
// copy imag part into vec3
vec3ptr[1] = resptr[0];
vec3ptr[3] = resptr[1];
vec3ptr[5] = resptr[2];
vec3ptr[7] = resptr[3];
}
return vec3;
}
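// avx_cplx_diag_prod: AVX version of the diagonal optimization, computing
// d1 .* x + d2 .* x[x_ids] two complex entries per iteration; the gathered
// x[x_ids] values are copied to a small stack buffer and read back with an
// unaligned load since they are not contiguous in memory.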
CplxVec avx_cplx_diag_prod(const CplxVec &d1, const CplxVec &d2, const CplxVec &x, const int* x_ids)
{
__m256d v1, v2, v3, v4;
size_t size = d1.rows();
CplxVec vec3(size);
double sub_x_ids[4];
double *ptr;
for(int i=0; i<size; i +=2)
{
// compute y = d2*x[x_ids]
v1 = _mm256_load_pd(reinterpret_cast<const double*>(d2.data()+i));
ptr = (double*) (x.data()+x_ids[i]);
sub_x_ids[0] = ptr[0];
sub_x_ids[1] = ptr[1];
ptr = (double*) (x.data()+x_ids[i+1]);
sub_x_ids[2] = ptr[0];
sub_x_ids[3] = ptr[1];
v2 = _mm256_loadu_pd(sub_x_ids); //unaligned
v2 = avx_cplx_mul(v1, v2);
// compute z = d1*x
v3 = _mm256_load_pd(reinterpret_cast<const double*>(d1.data()+i));
v4 = _mm256_load_pd(reinterpret_cast<const double*>(x.data()+i));
v4 = avx_cplx_mul(v3, v4);
// y + z
v2 = _mm256_add_pd(v2, v4);
memcpy(vec3.data()+i, reinterpret_cast<complex<double>*>(&v2), 2*sizeof(complex<double>));
}
return vec3;
}
#endif // USE_AVX
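// eigen_cplx_diag_prod: the same diagonal optimization expressed with Eigen array
// (template expression) arithmetic; only the gather x[x_ids] needs an explicit loop.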
CplxVec eigen_cplx_diag_prod(const CplxVec &d1, const CplxVec &d2, const CplxVec &x, const int* x_ids)
{
CplxVec z(d1.rows());
for(int i=0;i < d1.rows(); i++)
z[i] = x[x_ids[i]];
z = d2.array() * z.array() + d1.array() * x.array();
return z;
}
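// ButterflyMat: one butterfly factor stored as two diagonals (d1, d2) plus the
// index map subdiag_ids used to gather the second operand of the product.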
class ButterflyMat
{
protected:
Vec<FPP> d1;
Vec<FPP> d2;
vector<int> subdiag_ids;
@@ -64,18 +233,63 @@ class ButterflyMat
this->level = level;
}
Vec<FPP> multiply(Vec<FPP>& x) const
virtual Vec<FPP> multiply(Vec<FPP>& x) const
{
Vec<FPP> z(x.rows());
//auto y = x(subdiag_ids, Eigen::placeholders::all).array();
#pragma omp parallel for
// 1st meth
// #pragma omp parallel for
for(int i=0;i < x.rows(); i++)
z[i] = d1[i] * x[i] + d2[i] * x[subdiag_ids[i]];
// z[i] = d1[i] * x[i] + d2[i] * y[i];
// z = d1.array() * x.array() + d2.array() * x(subdiag_ids, Eigen::placeholders::all).array();
z[i] = d1[i] * x[i] + d2[i] * x[subdiag_ids[i]];
return z;
}
};
#ifdef USE_AVX
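// ButterflyMatAVX: same factor, but multiply() delegates to the hand-written
// AVX kernel avx_cplx_diag_prod defined above.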
class ButterflyMatAVX: public ButterflyMat
{
virtual Vec<FPP> multiply(Vec<FPP>& x) const
{
return avx_cplx_diag_prod(d1, d2, x, &subdiag_ids[0]);
}
public:
ButterflyMatAVX(const MatSparse<FPP, Cpu> &factor, int level) : ButterflyMat(factor, level) {}
};
#endif
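// ButterflyMatEig: multiply() delegates to the Eigen template-expression kernel.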
class ButterflyMatEig: public ButterflyMat
{
virtual Vec<FPP> multiply(Vec<FPP>& x) const
{
return eigen_cplx_diag_prod(d1, d2, x, &subdiag_ids[0]);
}
public:
ButterflyMatEig(const MatSparse<FPP, Cpu> &factor, int level) : ButterflyMat(factor, level) {}
};
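// ButterflyMatOMPSIMD: keeps the explicit element loop but asks OpenMP to
// vectorize it with the for simd construct.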
class ButterflyMatOMPSIMD: public ButterflyMat
{
virtual Vec<FPP> multiply(Vec<FPP>& x) const
{
Vec<FPP> z(x.rows());
#pragma omp for simd
for(int i=0;i < x.rows(); i++)
z[i] = d1[i] * x[i] + d2[i] * x[subdiag_ids[i]];
return z;
}
public:
ButterflyMatOMPSIMD(const MatSparse<FPP, Cpu> &factor, int level) : ButterflyMat(factor, level) {}
};
class ButterflyPermFaust
@@ -86,13 +300,24 @@ class ButterflyPermFaust
TransformHelper<FPP, Cpu>& csr_faust;
public:
ButterflyPermFaust(TransformHelper<FPP, Cpu>& csr_faust) : csr_faust(csr_faust)
ButterflyPermFaust(TransformHelper<FPP, Cpu>& csr_faust, const bool avx = false, const bool eigen = false, const bool omp = false) : csr_faust(csr_faust)
{
int i = 0;
assert(!avx || !eigen);
for(auto csr_fac: csr_faust)
{
if(i < csr_faust.size()-1)
opt_factors.insert(opt_factors.begin(), ButterflyMat(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
#ifdef USE_AVX
if(avx)
opt_factors.insert(opt_factors.begin(), ButterflyMatAVX(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
else
#endif
if(omp)
opt_factors.insert(opt_factors.begin(), ButterflyMatOMPSIMD(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
else if(eigen)
opt_factors.insert(opt_factors.begin(), ButterflyMatEig(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
else
opt_factors.insert(opt_factors.begin(), ButterflyMat(*dynamic_cast<const MatSparse<FPP, Cpu>*>(csr_fac), i++));
}
// set the permutation factor
auto csr_fac = *(csr_faust.end()-1);
@@ -109,22 +334,31 @@
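// multiply(): applies the bit-reversal permutation and its diagonal first, then
// the optimized butterfly factors; opt_factors is stored in reverse order, so
// the rightmost factor of the Faust is applied first.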
Vec<FPP> multiply(Vec<FPP>& x) const
{
// Vec<FPP> y = x(bitrev_perm, Eigen::placeholders::all).array();
// y = perm_d.array() * y.array();
// 1st method: rely on eigen
// Vec<FPP> y = x(bitrev_perm, Eigen::placeholders::all).array();
// y = perm_d.array() * y.array();
// 2nd method: manual eltwise prod
// 2nd method
// Vec<FPP> y(x.rows());
////#pragma omp parallel for
// for(int i=0;i < x.rows(); i++)
// y[i] = perm_d[i] * x[bitrev_perm[i]];
// 3rd method
Vec<FPP> y(x.rows());
#pragma omp parallel for
for(int i=0;i < x.rows(); i++)
y[i] = perm_d[i] * x[bitrev_perm[i]];
y[i] = x[bitrev_perm[i]];
y = perm_d.array() * y.array();
int i = csr_faust.size()-2;
for(auto fac: opt_factors)
{
y = fac.multiply(y);
y = std::move(fac.multiply(y));
i--;
}
return y;
}
};
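// The benchmark below times nsamples products for each variant (element loop,
// AVX, OpenMP SIMD, Eigen) and checks every result against the generic Faust
// DFT product within a 1e-6 tolerance.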
int main(int argc, char** argv)
{
int nsamples = 100;
@@ -142,30 +376,71 @@ int main(int argc, char** argv)
// DFT->display();
auto size = DFT->getNbRow();
ButterflyPermFaust opt_DFT(*DFT);
#ifdef USE_AVX
ButterflyPermFaust opt_DFT_avx(*DFT, true);
#endif
ButterflyPermFaust opt_DFT_eig(*DFT, false, true);
ButterflyPermFaust opt_DFT_omp(*DFT, false, false, true);
Vec<FPP> x1 = Vec<FPP>::Random(size);
// Vec<FPP> x1 = Vec<FPP>::Ones(size);
Vect<FPP, Cpu> x1_(size, x1.data());
Vec<FPP> y1;
#ifdef USE_AVX
Vec<FPP> y1, y1_avx, y1_eig, y1_omp;
#else
Vec<FPP> y1, y1_eig, y1_omp;
#endif
auto start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1 = opt_DFT.multiply(x1);
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::cout << "Optimized DFT product time:" << elapsed_seconds.count() << std::endl;
Vect<FPP, Cpu> y1_ref;
std::cout << "Optimized DFT product time (for-loop on elts):" << elapsed_seconds.count() << std::endl;
#ifdef USE_AVX
start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1_avx = opt_DFT_avx.multiply(x1);
end = std::chrono::steady_clock::now();
elapsed_seconds = end-start;
std::cout << "Optimized DFT product time (AVX):" << elapsed_seconds.count() << std::endl;
#endif
start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1_omp = opt_DFT_omp.multiply(x1);
end = std::chrono::steady_clock::now();
elapsed_seconds = end-start;
std::cout << "Optimized DFT product time (OMP SIMD):" << elapsed_seconds.count() << std::endl;
start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1_eig = opt_DFT_eig.multiply(x1);
end = std::chrono::steady_clock::now();
elapsed_seconds = end-start;
std::cout << "Optimized DFT product time (eigen):" << elapsed_seconds.count() << std::endl;
Vect<FPP, Cpu> y1_ref(size);
start = std::chrono::steady_clock::now();
for(int i=0;i<nsamples; i++)
y1_ref = DFT->multiply(x1_);
DFT->multiply(x1_.getData(), y1_ref.getData());
end = std::chrono::steady_clock::now();
elapsed_seconds = end-start;
std::cout << "Faust DFT product time:" << elapsed_seconds.count() << std::endl;
Vect<FPP, Cpu> y1_test(size, y1.data());
// std::cout << y1_ref.norm() << std::endl;
// std::cout << y1_test.norm() << std::endl;
// y1_ref.Display();
// std::cout << y1 << std::endl;
y1_ref -= y1_test;
assert(y1_ref.norm() < 1e-6);
std::cout << "prod tested OK" << std::endl;
#ifdef USE_AVX
auto checked_vecs = {y1, y1_eig, y1_avx, y1_omp};
#else
auto checked_vecs = {y1, y1_eig, y1_omp};
#endif
for(auto y1_: checked_vecs)
{
Vect<FPP, Cpu> y1_test(size, y1_.data());
auto err = y1_ref;
// std::cout << y1_ref.norm() << std::endl;
// std::cout << y1_test.norm() << std::endl;
// y1_ref.Display();
// std::cout << y1 << std::endl;
err -= y1_test;
assert(err.norm() < 1e-6);
}
std::cout << "all prods tested OK" << std::endl;
return EXIT_SUCCESS;
}