Commit 72880940 authored by hhakim

Optimize Faust::TransformHelperPoly::multiply(Vect) (F@x in Python).

Decrease the number of copies.

======== Mini benchmark
hinria …/build/wrapper/python $ for i in {1..10}; do python3 test_poly_cpp.py | grep speedup; done
speedup 90.75602805478192
speedup 89.09036938690689
speedup 82.56148989913501
speedup 90.64531438391626
speedup 91.66248597807245
speedup 90.10036697594744
speedup 75.61254492055296
speedup 89.96955925554978
speedup 95.22901711420815
speedup 90.92116442476765
hinria …/build/wrapper/python $ for i in {1..10}; do python3 test_poly_cpp.py | grep speedup; done
speedup 123.0277183801693
speedup 103.32878272246883
speedup 127.7827667275423
speedup 124.11150187961708
speedup 113.07702426409236
speedup 87.57467702373462
speedup 127.40517991813421
speedup 99.10773177189199
speedup 127.26056623369436
speedup 129.75727501516909

========

https://gitlab.inria.fr/faustgrp/faust/-/snippets/704
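
For orientation, a minimal sketch of how the optimized product could be timed from C++. This is only an illustration, not the content of test_poly_cpp.py; the function name and the reps parameter are placeholders, and it assumes a TransformHelperPoly<FPP> instance (e.g. obtained through basisChebyshev, see the header below).

// Illustrative timing harness, not part of this commit.
#include "faust_TransformHelperPoly.h"
#include <chrono>
#include <vector>

template<typename FPP>
double avg_seconds_per_product(Faust::TransformHelperPoly<FPP> &F, const FPP *x, int reps = 100)
{
	int d = F.L.getNbRow();          // L is a public member of TransformHelperPoly
	int K = F.size();                // number of factors, i.e. the basis degree
	std::vector<FPP> y(d * (K + 1)); // preallocated output: [T_0 x; ...; T_K x]
	auto t0 = std::chrono::steady_clock::now();
	for(int r = 0; r < reps; r++)
		F.multiply(x, y.data());     // pointer overload introduced by this commit
	auto t1 = std::chrono::steady_clock::now();
	return std::chrono::duration<double>(t1 - t0).count() / reps;
}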
parent 32d965ae
faust_TransformHelperPoly.h:

...
@@ -19,8 +19,10 @@ namespace Faust
 			const FPP lambda_, const bool optimizedCopy, const bool cloning_fact,
 			const bool internal_call) : TransformHelper<FPP,Cpu>(facts, lambda_, optimizedCopy, cloning_fact, internal_call) {}
-		Vect<FPP,Cpu> multiply(const Vect<FPP,Cpu> x, const bool transpose=false, const bool conjugate=false);
-		MatDense<FPP, Cpu> multiply(MatDense<FPP,Cpu> X, const bool transpose=false, const bool conjugate=false);
+		Vect<FPP,Cpu> multiply(const Vect<FPP,Cpu> &x, const bool transpose=false, const bool conjugate=false);
+		Vect<FPP,Cpu> multiply(const FPP* x, const bool transpose=false, const bool conjugate=false);
+		void multiply(const FPP* x, FPP* y, const bool transpose=false, const bool conjugate=false);
+		MatDense<FPP, Cpu> multiply(const MatDense<FPP,Cpu> &X, const bool transpose=false, const bool conjugate=false);
 		Vect<FPP, Cpu> poly(MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs);
 		MatDense<FPP, Cpu> poly(int n, MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs);
 		void poly(int d, int n, const FPP* basisX, const FPP* coeffs, FPP* out);
...
faust_TransformHelperPoly.h (full file, before this commit):

#ifndef __FAUST_TRANSFORM_HELPER_POLY__
#define __FAUST_TRANSFORM_HELPER_POLY__

#include "faust_TransformHelper.h"

namespace Faust
{
	/**
	 * \brief This class aims to represent a Chebyshev polynomial basis as a "Faust".
	 *
	 * It overrides some operations to optimize the performance by taking the factors' specific structure into account.
	 */
	template<typename FPP>
	class TransformHelperPoly : public TransformHelper<FPP,Cpu>
	{
		public:
			MatSparse<FPP, Cpu> L; // TODO: must be private (and basisChebyshev a friend of this class)
			MatSparse<FPP, Cpu> twoL; // TODO: must be private
			TransformHelperPoly(const std::vector<MatGeneric<FPP,Cpu> *>& facts,
					const FPP lambda_, const bool optimizedCopy, const bool cloning_fact,
					const bool internal_call) : TransformHelper<FPP,Cpu>(facts, lambda_, optimizedCopy, cloning_fact, internal_call) {}
			Vect<FPP,Cpu> multiply(const Vect<FPP,Cpu> x, const bool transpose=false, const bool conjugate=false);
			MatDense<FPP, Cpu> multiply(MatDense<FPP,Cpu> X, const bool transpose=false, const bool conjugate=false);
			Vect<FPP, Cpu> poly(MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs);
			MatDense<FPP, Cpu> poly(int n, MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs);
			void poly(int d, int n, const FPP* basisX, const FPP* coeffs, FPP* out);
			TransformHelper<FPP, Cpu>* polyFaust(const FPP* coeffs);
	};

	template<typename FPP>
	TransformHelper<FPP, Cpu>* basisChebyshev(MatSparse<FPP,Cpu>* L, int32_t K);

	template<typename FPP>
	void poly(int d, int K, int n, const FPP* basisX, const FPP* coeffs, FPP* out);
}

#include "faust_TransformHelperPoly.hpp"

#endif
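
Putting the declarations above together with the overloads added by this commit, here is a hedged sketch of evaluating p(L)x = sum_k coeffs[k] * T_k(L) x for a single vector. eval_poly_on_vector is a made-up helper name, and buffer sizes follow the d*(K+1) layout used by the class.

// Sketch only: compute basisX = [T_0 x; ...; T_K x] with the pointer-based
// multiply added by this commit, then let poly() form the linear combination.
#include "faust_TransformHelperPoly.h"
#include <vector>

template<typename FPP>
std::vector<FPP> eval_poly_on_vector(Faust::TransformHelperPoly<FPP> &F,
		const FPP *coeffs /* K+1 entries */, const FPP *x /* d entries */)
{
	int d = F.L.getNbRow();
	int K = F.size();
	std::vector<FPP> basisX(d * (K + 1)), out(d);
	F.multiply(x, basisX.data());                        // basisX = [T_0 x; ...; T_K x]
	F.poly(d, /* n (columns) */ 1, basisX.data(), coeffs, out.data());
	return out;                                          // out = p(L) x
}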
faust_TransformHelperPoly.hpp:

...
@@ -2,11 +2,25 @@
 #include "faust_linear_algebra.h"
 namespace Faust
 {
+	template<typename FPP>
+		Vect<FPP,Cpu> TransformHelperPoly<FPP>::multiply(const Vect<FPP,Cpu> &x, const bool transpose/*=false*/, const bool conjugate/*=false*/)
+		{
+			return std::move(this->multiply(x.getData(), transpose, conjugate));
+		}
 	template<typename FPP>
-		Vect<FPP,Cpu> TransformHelperPoly<FPP>::multiply(const Vect<FPP,Cpu> x, const bool transpose/*=false*/, const bool conjugate/*=false*/)
+		Vect<FPP,Cpu> TransformHelperPoly<FPP>::multiply(const FPP* x, const bool transpose/*=false*/, const bool conjugate/*=false*/)
 		{
+			int d = L.getNbRow();
+			int K = this->size();
+			Vect<FPP, Cpu> v0(d*(K+1));
+			multiply(x, v0.getData(), transpose, conjugate);
+			return std::move(v0);
+		}
+	template<typename FPP>
+		void TransformHelperPoly<FPP>::multiply(const FPP* x, FPP* y, const bool transpose/*=false*/, const bool conjugate/*=false*/)
+		{
 //			std::cout << "TransformHelperPoly<FPP>::multiply(Vect)" << std::endl;
 			/**
 			 * Recurrence relation (k=1 to K):
@@ -22,25 +36,22 @@ namespace Faust
 			 * Fx = [v_{0, K} ; v_{1, K} ; v_{2, K}]
 			 */
 			int d = L.getNbRow();
-			Vect<FPP,Cpu> v1(d), v2, tmp;
+			Vect<FPP,Cpu> v2, tmp;
 			int K = this->size();
-			Vect<FPP, Cpu> v0(d*(K+1));
 			// seeing how basisChebyshev method is instantiating this class
 			// K can't be strictly lower than two
-			memcpy(v1.getData(), x.getData(), sizeof(FPP)*d);
-			v2 = L*x;
-			for(int i=2;i<=K;i++)
+			v2 = L*Vect<FPP,Cpu>(d, x); // L*x
+			memcpy(y, x, sizeof(FPP)*d);
+			memcpy(y+d, v2.getData(), sizeof(FPP)*d);
+			for(int i=3;i<=K+1;i++)
 			{
-				memcpy(v0.getData()+d*(i-2), v1.getData(), sizeof(FPP)*d);
-				tmp = v1;
-				v1 = v2;
+				int offset = d*(i-3);
 //				gemv(L, v2, tmp, FPP(1), FPP(-1));
-				v2 = twoL*v2;
-				v2 -= tmp;
+//				v2 = twoL*v2;
+				v2.multiplyLeft(twoL);
+				v2 -= y+offset;
+				memcpy(y+d*(i-1), v2.getData(), sizeof(FPP)*d);
 			}
-			memcpy(v0.getData()+d*(K-1), v1.getData(), sizeof(FPP)*d);
-			memcpy(v0.getData()+d*K, v2.getData(), sizeof(FPP)*d);
-			return v0;
 		}
 	// Slower method but kept commented here until further tests
@@ -80,7 +91,7 @@ namespace Faust
 //	}
 	template<typename FPP>
-	MatDense<FPP, Cpu> TransformHelperPoly<FPP>::multiply(MatDense<FPP,Cpu> X, const bool transpose/*=false*/, const bool conjugate/*=false*/)
+	MatDense<FPP, Cpu> TransformHelperPoly<FPP>::multiply(const MatDense<FPP,Cpu> &X, const bool transpose/*=false*/, const bool conjugate/*=false*/)
 	{
 //		std::cout << "TransformHelperPoly<FPP>::multiply(MatDense)" << std::endl;
 		int d = L.getNbRow();
@@ -227,7 +238,9 @@ namespace Faust
 		{
 			Vect<FPP,Cpu> _coeffs(K_plus_1, coeffs);
 			MatDense<FPP,Cpu> basisXi(d, K_plus_1);
+			// reshape a d_K_plus_1 vector into a d*K_plus_1 matrix
 			memcpy(basisXi.getData(), basisX+d_K_plus_1*i, sizeof(FPP)*d_K_plus_1);
+			// compute the linear combination
 			_coeffs.multiplyLeft(basisXi);
 			memcpy(out+i*d, _coeffs.getData(), sizeof(FPP)*d);
 		}
...
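
To make the new in-place loop concrete, here is a dependency-free sketch of the same block recurrence with a dense stand-in for L (the committed code uses MatSparse, Vect and the cached twoL = 2L). The point is the output layout: block k, i.e. y + k*d, receives T_k(L)x, so no intermediate vector has to be copied back at the end.

// Illustration only: same recurrence as the in-place multiply above,
// written against plain std::vector to show the memory layout of y.
#include <cstring>
#include <vector>

void chebyshev_blocks(const std::vector<std::vector<double>> &L, // d x d, symmetric
		const double *x, int K, double *y /* d*(K+1) entries */)
{
	int d = (int) L.size();
	auto matvec = [&](const double *v, double *w) {
		for(int r = 0; r < d; r++)
		{
			double s = 0;
			for(int c = 0; c < d; c++)
				s += L[r][c] * v[c];
			w[r] = s;
		}
	};
	std::vector<double> tmp(d);
	std::memcpy(y, x, sizeof(double) * d);  // block 0: T_0(L)x = x
	matvec(x, y + d);                       // block 1: T_1(L)x = L x
	for(int k = 2; k <= K; k++)             // block k: T_k(L)x = 2L T_{k-1}(L)x - T_{k-2}(L)x
	{
		matvec(y + d * (k - 1), tmp.data());
		for(int r = 0; r < d; r++)
			y[d * k + r] = 2.0 * tmp[r] - y[d * (k - 2) + r];
	}
}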
faust_TransformHelperPoly.hpp (full file, before this commit):

#include <cstdlib>
#include "faust_linear_algebra.h"
namespace Faust
{
template<typename FPP>
Vect<FPP,Cpu> TransformHelperPoly<FPP>::multiply(const Vect<FPP,Cpu> x, const bool transpose/*=false*/, const bool conjugate/*=false*/)
{
// std::cout << "TransformHelperPoly<FPP>::multiply(Vect)" << std::endl;
/**
* Recurrence relation (k=1 to K):
* v_{0,k} := [v_{0,k-1} ; v_{1, k-1} ] // concatenation
* v_{1,k} := v_{2,k-1}
* v_{2,k} := 2Lv_{2,k-1} - v_{1,k-1}
*
* First terms:
* v_{0,1} = []
* v_{1,1} = x
* v_{2,1} = Lx
*
* Fx = [v_{0, K} ; v_{1, K} ; v_{2, K}]
*/
int d = L.getNbRow();
Vect<FPP,Cpu> v1(d), v2, tmp;
int K = this->size();
Vect<FPP, Cpu> v0(d*(K+1));
// seeing how basisChebyshev method is instantiating this class
// K can't be strictly lower than two
memcpy(v1.getData(), x.getData(), sizeof(FPP)*d);
v2 = L*x;
for(int i=2;i<=K;i++)
{
memcpy(v0.getData()+d*(i-2), v1.getData(), sizeof(FPP)*d);
tmp = v1;
v1 = v2;
// gemv(L, v2, tmp, FPP(1), FPP(-1));
v2 = twoL*v2;
v2 -= tmp;
}
memcpy(v0.getData()+d*(K-1), v1.getData(), sizeof(FPP)*d);
memcpy(v0.getData()+d*K, v2.getData(), sizeof(FPP)*d);
return v0;
}
// Slower method but kept commented here until further tests
// template<typename FPP>
// MatDense<FPP, Cpu> TransformHelperPoly<FPP>::multiply(MatDense<FPP,Cpu> X, const bool transpose/*=false*/, const bool conjugate/*=false*/)
// {
//// std::cout << "TransformHelperPoly<FPP>::multiply(MatDense)" << std::endl;
// int d = L.getNbRow();
// int n = X.getNbCol();
// int K = this->size();
// MatDense<FPP, Cpu> V0(d*(K+1), n);
// MatDense<FPP,Cpu> V1(d, n), V2 = X, tmp;
// memcpy(V1.getData(), X.getData(), sizeof(FPP)*d*n);
// L.multiply(V2, 'N');
// for(int i=2;i<=K;i++)
// {
// memcpy(V0.getData()+d*n*(i-2), V1.getData(), sizeof(FPP)*d*n);
// tmp = V1;
// V1 = V2;
// twoL.multiply(V2, 'N');
// V2 -= tmp;
//// gemm(twoL, V2, tmp, (FPP)1.0, (FPP)-1.0, 'N', 'N');
//// V2 = tmp;
// }
// memcpy(V0.getData()+d*n*(K-1), V1.getData(), sizeof(FPP)*d*n);
// memcpy(V0.getData()+d*n*K, V2.getData(), sizeof(FPP)*d*n);
// // put blocks in proper order
// MatDense<FPP,Cpu> V0_ord(d*(K+1), n);
// #pragma omp parallel for
// for(int i=0;i<(K+1);i++)
// {
// auto i_offset = i*n*d;
// for(int j=0;j<n;j++)
// memcpy(V0_ord.getData()+(K+1)*d*j+d*i, V0.getData()+j*d+i_offset, sizeof(FPP)*d);
// }
// return V0_ord;
// }
template<typename FPP>
MatDense<FPP, Cpu> TransformHelperPoly<FPP>::multiply(MatDense<FPP,Cpu> X, const bool transpose/*=false*/, const bool conjugate/*=false*/)
{
// std::cout << "TransformHelperPoly<FPP>::multiply(MatDense)" << std::endl;
int d = L.getNbRow();
int K = this->size();
int n = X.getNbCol();
MatDense<FPP,Cpu> V0_ord(d*(K+1), n);
auto scale = (K+1)*d;
#pragma omp parallel for
for(int i=0;i<n;i++)
{
Vect<FPP,Cpu> x(d, X.getData()+i*d);
auto y = multiply(x);
memcpy(V0_ord.getData()+scale*i, y.getData(), sizeof(FPP)*scale);
}
return V0_ord;
}
template<typename FPP>
Vect<FPP, Cpu> TransformHelperPoly<FPP>::poly(MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs)
{
coeffs.multiplyLeft(basisX);
return coeffs;
}
template<typename FPP>
MatDense<FPP, Cpu> TransformHelperPoly<FPP>::poly(int n, MatDense<FPP,Cpu> & basisX, Vect<FPP, Cpu> coeffs)
{
int d = L.getNbRow();
MatDense<FPP,Cpu> Y(d, n);
this->poly(d, n, basisX.getData(), coeffs.getData(), Y.getData());
return Y;
}
template<typename FPP>
void TransformHelperPoly<FPP>::poly(int d, int n, const FPP* basisX, const FPP* coeffs, FPP* out)
{
int K = this->size();
Faust::poly(d, K, n, basisX, coeffs, out);
}
template<typename FPP>
TransformHelper<FPP, Cpu>* TransformHelperPoly<FPP>::polyFaust(const FPP* coeffs)
{
// Id = sp.eye(L.shape[1], format="csr")
// scoeffs = sp.hstack(tuple(Id*coeffs[i] for i in range(0, K+1)),
// format="csr")
MatSparse<FPP,Cpu> Id, Id1;
MatSparse<FPP,Cpu> coeffDiags;
auto d = L.getNbRow();
int K = this->size();
Id.resize(d, d, d);
Id.setEyes();
auto Id0 = Id;
Id0 *= coeffs[0];
// K >= 2
for(int i=1;i < K+1; i++)
{
Id1 = Id;
Id1 *= coeffs[i];
MatSparse<FPP,Cpu> coeffDiags;
coeffDiags.hstack(Id0, Id1);
Id0 = coeffDiags;
}
std::vector<MatGeneric<FPP,Cpu>*> facts(K+1);
facts.resize(K+1);
facts[0] = new MatSparse<FPP,Cpu>(Id0);
for(int i=1;i<=K;i++)
{
facts[i] = this->get_gen_fact_nonconst(i-1);
}
return new TransformHelper<FPP,Cpu>(facts, (FPP) 1.0,
/* optimizedCopy */ false,
/* cloning_fact */ false,
/* internal_call */ true);
}
template<typename FPP>
TransformHelper<FPP,Cpu>* basisChebyshev(MatSparse<FPP,Cpu>* L, int32_t K)
{
// assuming L is symmetric
MatSparse<FPP,Cpu> *T1, *T2;
auto d = L->getNbRow();
MatSparse<FPP,Cpu> Id, twoL, minus_Id, rR, R, zero;
std::vector<MatGeneric<FPP,Cpu>*> facts(K);
if(K == 0)
return TransformHelper<FPP,Cpu>::eyeFaust(d,d);
facts.resize(K); //normally it'd be K+1 but T0 is ignored
// build the chebyshev polynomials by factor
// K > 1 ignore the first one (for T0 0-degree), which is the identity
// Identity
Id.resize(d, d, d);
Id.setEyes();
// T1
T1 = new MatSparse<FPP,Cpu>();
T1->vstack(Id, *L);
facts[K-1] = T1;
if(K == 1)
return new TransformHelper<FPP,Cpu>(facts, (FPP)1.0,
/* optimizedCopy */ false,
/* cloning_fact */ false,
/* internal_call */ true);
// rR block
twoL = *L;
twoL *= FPP(2);
minus_Id.resize(d,d,d);
minus_Id.setEyes();
minus_Id *= FPP(-1);
rR.hstack(minus_Id, twoL);
// T2
Id.resize(2*d, 2*d, 2*d);
Id.setEyes();
T2 = new MatSparse<FPP,Cpu>();
T2->vstack(Id, rR);
facts[K-2] = T2;
// T3 to TK
#pragma omp parallel for private(Id, zero, R)
for(int i=3;i<K+1;i++)
{
Id.resize(i*d, i*d, i*d);
Id.setEyes();
zero.resize(0, d, (i-2)*d);
R.hstack(zero, rR);
auto Ti = new MatSparse<FPP,Cpu>();
Ti->vstack(Id, R);
facts[K-i] = Ti;
}
auto basisP = new TransformHelperPoly<FPP>(facts, (FPP)1.0,
/* optimizedCopy */ false,
/* cloning_fact */ false,
/* internal_call */ true);
basisP->L = *L;
basisP->twoL = *L;
basisP->twoL *= 2;
return basisP;
}
template<typename FPP>
void poly(int d, int K, int n, const FPP* basisX, const FPP* coeffs, FPP* out)
{
auto K_plus_1 = K+1;
auto d_K_plus_1 = d*K_plus_1;
#pragma omp parallel for
for(int i=0;i<n;i++)
{
Vect<FPP,Cpu> _coeffs(K_plus_1, coeffs);
MatDense<FPP,Cpu> basisXi(d, K_plus_1);
memcpy(basisXi.getData(), basisX+d_K_plus_1*i, sizeof(FPP)*d_K_plus_1);
_coeffs.multiplyLeft(basisXi);
memcpy(out+i*d, _coeffs.getData(), sizeof(FPP)*d);
}
}
}
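
Reading off the vstack/hstack calls in basisChebyshev above, the sparse factor built at step i (the code's Ti, with an empty zero block when i = 2) has the structure below. This is a reconstruction for readability, written in LaTeX, with F_i denoting the factor and T_k the Chebyshev polynomials.

% Structure of the factors assembled by basisChebyshev (reconstruction):
\[
F_1 = \begin{pmatrix} I_d \\ L \end{pmatrix},
\qquad
F_i = \begin{pmatrix} I_{id} \\ \begin{matrix} 0_{d\times(i-2)d} & -I_d & 2L \end{matrix} \end{pmatrix}
\in \mathbb{R}^{(i+1)d \times id}, \quad i \ge 2.
\]
% Since facts[K-i] stores F_i, the resulting Faust applies F_K \cdots F_1, so
\[
F_K F_{K-1} \cdots F_1\, x
= \begin{pmatrix} T_0(L)\,x \\ T_1(L)\,x \\ \vdots \\ T_K(L)\,x \end{pmatrix},
\qquad T_k(L) = 2L\,T_{k-1}(L) - T_{k-2}(L),\; T_1(L) = L,\; T_0(L) = I_d.
\]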