Commit fe2adbd4 authored by Berenger Bramas's avatar Berenger Bramas

Update exponential - use const expr instead of array for coef and write factor...

Update exponential - use constexpr functions instead of arrays for the coefficients and write the factor computation as a single expression
parent 8cc89dfa
......@@ -15,9 +15,8 @@
#include <iostream>
int main(int /*argc*/, char* /*argv*/ []) {
using RealType = double;
template <class RealType>
void compareExpTime(){
const size_t NbOverLoop = 5;
const size_t NbExp = 1000000;
......@@ -89,6 +88,14 @@ int main(int /*argc*/, char* /*argv*/ []) {
std::cout << "Vector low acc " << InaVecBestType<RealType>::GetName() << " for " << NbExp * NbOverLoop
<< " exp took " << timer.getElapsed() << "s (" << timer.getElapsed()/double(NbExp * NbOverLoop) << "s per exp)\n";
}
}
int main(int /*argc*/, char* /*argv*/ []) {
    // Prints a section heading on its own line and flushes it, so the
    // heading is visible before the timing run that follows starts.
    const auto announce = [](const char* heading) {
        std::cout << heading << std::endl;
    };

    // Run the exp timing comparison once per floating-point precision.
    announce("In Float:");
    compareExpTime<float>();

    announce("In Double:");
    compareExpTime<double>();

    return 0;
}
......@@ -54,16 +54,11 @@ for n = 2:10
disp(x0)
end
printf("constexpr static std::array<double,%d> GetCoefficient%d() {\n", n+1, n+1);
printf(" return {{\n");
for i = 1:(n+1)
sep=","
if i == n+1 then
sep=""
end
printf(" %.20e%s\n", -x0(i), sep);
for i = 1:(n+1)
printf("inline constexpr static double GetCoefficient%d_%d() {\n", n+1, i-1);
printf(" return %.20e;\n", -x0(i));
printf("}\n");
end
printf(" }};\n");
printf("}\n\n");
printf("\n");
end
......@@ -258,32 +258,28 @@ public:
#ifdef __INTEL_COMPILER
return _mm256_exp_pd(vec);
#else
static const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64()));
static const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64()));
static const __m256d COEFF_P5_X = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[8]));
static const __m256d COEFF_P5_Y = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[7]));
static const __m256d COEFF_P5_Z = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[6]));
static const __m256d COEFF_P5_A = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[5]));
static const __m256d COEFF_P5_B = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[4]));
static const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[3]));
static const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[2]));
static const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[1]));
static const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[0]));
const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E()));
const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64()));
const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64()));
const __m256d COEFF_P5_X = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_8()));
const __m256d COEFF_P5_Y = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_7()));
const __m256d COEFF_P5_Z = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_6()));
const __m256d COEFF_P5_A = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_5()));
const __m256d COEFF_P5_B = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_4()));
const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_3()));
const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_2()));
const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_1()));
const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_0()));
__m256d x = vec * COEFF_LOG2E;
const __m256d fractional_part = x - InaVecAVX(x).floor().vec;
__m256d factor = COEFF_P5_X;
factor = (factor * fractional_part + COEFF_P5_Y);
factor = (factor * fractional_part + COEFF_P5_Z);
factor = (factor * fractional_part + COEFF_P5_A);
factor = (factor * fractional_part + COEFF_P5_B);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m256d factor = ((((((((COEFF_P5_X * fractional_part + COEFF_P5_Y)
* fractional_part + COEFF_P5_Z) * fractional_part + COEFF_P5_A)
* fractional_part + COEFF_P5_B) * fractional_part + COEFF_P5_C)
* fractional_part + COEFF_P5_D) * fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......@@ -303,22 +299,21 @@ public:
}
inline InaVecAVX expLowAcc() const {
static const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64()));
static const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64()));
static const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient4()[3]));
static const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient4()[2]));
static const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient4()[1]));
static const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient4()[0]));
const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E()));
const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64()));
const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64()));
const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient4_3()));
const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient4_2()));
const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient4_1()));
const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient4_0()));
__m256d x = vec * COEFF_LOG2E;
const __m256d fractional_part = x - InaVecAVX(x).floor().vec;
__m256d factor = COEFF_P5_C;
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m256d factor = (((COEFF_P5_C * fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......
......@@ -270,26 +270,25 @@ public:
#ifdef __INTEL_COMPILER
return _mm256_exp_ps(vec);
#else
static const __m256 COEFF_LOG2E = _mm256_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m256 COEFF_A = _mm256_set1_ps(float(InaFastExp::CoeffA32()));
static const __m256 COEFF_B = _mm256_set1_ps(float(InaFastExp::CoeffB32()));
static const __m256 COEFF_P5_A = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[5]));
static const __m256 COEFF_P5_B = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[4]));
static const __m256 COEFF_P5_C = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[3]));
static const __m256 COEFF_P5_D = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[2]));
static const __m256 COEFF_P5_E = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[1]));
static const __m256 COEFF_P5_F = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[0]));
const __m256 COEFF_LOG2E = _mm256_set1_ps(float(InaFastExp::CoeffLog2E()));
const __m256 COEFF_A = _mm256_set1_ps(float(InaFastExp::CoeffA32()));
const __m256 COEFF_B = _mm256_set1_ps(float(InaFastExp::CoeffB32()));
const __m256 COEFF_P5_A = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_5()));
const __m256 COEFF_P5_B = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_4()));
const __m256 COEFF_P5_C = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_3()));
const __m256 COEFF_P5_D = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_2()));
const __m256 COEFF_P5_E = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_1()));
const __m256 COEFF_P5_F = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_0()));
__m256 x = vec * COEFF_LOG2E;
const __m256 fractional_part = x - InaVecAVX(x).floor().vec;
__m256 factor = COEFF_P5_A;
factor = (factor * fractional_part + COEFF_P5_B);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m256 factor = (((((COEFF_P5_A * fractional_part + COEFF_P5_B)
* fractional_part + COEFF_P5_C)
* fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......@@ -300,20 +299,18 @@ public:
}
inline InaVecAVX expLowAcc() const {
static const __m256 COEFF_LOG2E = _mm256_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m256 COEFF_A = _mm256_set1_ps(float(InaFastExp::CoeffA32()));
static const __m256 COEFF_B = _mm256_set1_ps(float(InaFastExp::CoeffB32()));
static const __m256 COEFF_P5_D = _mm256_set1_ps(float(InaFastExp::GetCoefficient3()[2]));
static const __m256 COEFF_P5_E = _mm256_set1_ps(float(InaFastExp::GetCoefficient3()[1]));
static const __m256 COEFF_P5_F = _mm256_set1_ps(float(InaFastExp::GetCoefficient3()[0]));
const __m256 COEFF_LOG2E = _mm256_set1_ps(float(InaFastExp::CoeffLog2E()));
const __m256 COEFF_A = _mm256_set1_ps(float(InaFastExp::CoeffA32()));
const __m256 COEFF_B = _mm256_set1_ps(float(InaFastExp::CoeffB32()));
const __m256 COEFF_P5_D = _mm256_set1_ps(float(InaFastExp::GetCoefficient3_2()));
const __m256 COEFF_P5_E = _mm256_set1_ps(float(InaFastExp::GetCoefficient3_1()));
const __m256 COEFF_P5_F = _mm256_set1_ps(float(InaFastExp::GetCoefficient3_0()));
__m256 x = vec * COEFF_LOG2E;
const __m256 fractional_part = x - InaVecAVX(x).floor().vec;
__m256 factor = COEFF_P5_D;
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m256 factor = ((COEFF_P5_D * fractional_part + COEFF_P5_E) * fractional_part + COEFF_P5_F);
x -= factor;
......
......@@ -280,34 +280,28 @@ public:
}
inline InaVecAVX512KNL exp() const {
static const __m512d COEFF_LOG2E = _mm512_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m512d COEFF_A = _mm512_set1_pd(double(InaFastExp::CoeffA64()));
static const __m512d COEFF_B = _mm512_set1_pd(double(InaFastExp::CoeffB64()));
static const __m512d COEFF_P5_V = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[9]));
static const __m512d COEFF_P5_X = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[8]));
static const __m512d COEFF_P5_Y = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[7]));
static const __m512d COEFF_P5_Z = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[6]));
static const __m512d COEFF_P5_A = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[5]));
static const __m512d COEFF_P5_B = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[4]));
static const __m512d COEFF_P5_C = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[3]));
static const __m512d COEFF_P5_D = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[2]));
static const __m512d COEFF_P5_E = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[1]));
static const __m512d COEFF_P5_F = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[0]));
const __m512d COEFF_LOG2E = _mm512_set1_pd(double(InaFastExp::CoeffLog2E()));
const __m512d COEFF_A = _mm512_set1_pd(double(InaFastExp::CoeffA64()));
const __m512d COEFF_B = _mm512_set1_pd(double(InaFastExp::CoeffB64()));
const __m512d COEFF_P5_X = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_8()));
const __m512d COEFF_P5_Y = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_7()));
const __m512d COEFF_P5_Z = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_6()));
const __m512d COEFF_P5_A = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_5()));
const __m512d COEFF_P5_B = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_4()));
const __m512d COEFF_P5_C = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_3()));
const __m512d COEFF_P5_D = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_2()));
const __m512d COEFF_P5_E = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_1()));
const __m512d COEFF_P5_F = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_0()));
__m512d x = vec * COEFF_LOG2E;
const __m512d fractional_part = x - InaVecAVX512KNL(x).floor().vec;
__m512d factor = COEFF_P5_V;
factor = (factor * fractional_part + COEFF_P5_X);
factor = (factor * fractional_part + COEFF_P5_Y);
factor = (factor * fractional_part + COEFF_P5_Z);
factor = (factor * fractional_part + COEFF_P5_A);
factor = (factor * fractional_part + COEFF_P5_B);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m512d factor = ((((((((COEFF_P5_X * fractional_part + COEFF_P5_Y)
* fractional_part + COEFF_P5_Z) * fractional_part + COEFF_P5_A)
* fractional_part + COEFF_P5_B) * fractional_part + COEFF_P5_C)
* fractional_part + COEFF_P5_D) * fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......@@ -325,22 +319,21 @@ public:
}
inline InaVecAVX512KNL expLowAcc() const {
static const __m512d COEFF_LOG2E = _mm512_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m512d COEFF_A = _mm512_set1_pd(double(InaFastExp::CoeffA64()));
static const __m512d COEFF_B = _mm512_set1_pd(double(InaFastExp::CoeffB64()));
static const __m512d COEFF_P5_C = _mm512_set1_pd(double(InaFastExp::GetCoefficient4()[3]));
static const __m512d COEFF_P5_D = _mm512_set1_pd(double(InaFastExp::GetCoefficient4()[2]));
static const __m512d COEFF_P5_E = _mm512_set1_pd(double(InaFastExp::GetCoefficient4()[1]));
static const __m512d COEFF_P5_F = _mm512_set1_pd(double(InaFastExp::GetCoefficient4()[0]));
const __m512d COEFF_LOG2E = _mm512_set1_pd(double(InaFastExp::CoeffLog2E()));
const __m512d COEFF_A = _mm512_set1_pd(double(InaFastExp::CoeffA64()));
const __m512d COEFF_B = _mm512_set1_pd(double(InaFastExp::CoeffB64()));
const __m512d COEFF_P5_C = _mm512_set1_pd(double(InaFastExp::GetCoefficient4_3()));
const __m512d COEFF_P5_D = _mm512_set1_pd(double(InaFastExp::GetCoefficient4_2()));
const __m512d COEFF_P5_E = _mm512_set1_pd(double(InaFastExp::GetCoefficient4_1()));
const __m512d COEFF_P5_F = _mm512_set1_pd(double(InaFastExp::GetCoefficient4_0()));
__m512d x = vec * COEFF_LOG2E;
const __m512d fractional_part = x - InaVecAVX512KNL(x).floor().vec;
__m512d factor = COEFF_P5_C;
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m512d factor = (((COEFF_P5_C * fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......
......@@ -305,26 +305,25 @@ public:
}
inline InaVecAVX512KNL exp() const {
static const __m512 COEFF_LOG2E = _mm512_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m512 COEFF_A = _mm512_set1_ps(float(InaFastExp::CoeffA32()));
static const __m512 COEFF_B = _mm512_set1_ps(float(InaFastExp::CoeffB32()));
static const __m512 COEFF_P5_A = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[5]));
static const __m512 COEFF_P5_B = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[4]));
static const __m512 COEFF_P5_C = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[3]));
static const __m512 COEFF_P5_D = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[2]));
static const __m512 COEFF_P5_E = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[1]));
static const __m512 COEFF_P5_F = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[0]));
const __m512 COEFF_LOG2E = _mm512_set1_ps(float(InaFastExp::CoeffLog2E()));
const __m512 COEFF_A = _mm512_set1_ps(float(InaFastExp::CoeffA32()));
const __m512 COEFF_B = _mm512_set1_ps(float(InaFastExp::CoeffB32()));
const __m512 COEFF_P5_A = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_5()));
const __m512 COEFF_P5_B = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_4()));
const __m512 COEFF_P5_C = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_3()));
const __m512 COEFF_P5_D = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_2()));
const __m512 COEFF_P5_E = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_1()));
const __m512 COEFF_P5_F = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_0()));
__m512 x = vec * COEFF_LOG2E;
const __m512 fractional_part = x - InaVecAVX512KNL(x).floor().vec;
__m512 factor = COEFF_P5_A;
factor = (factor * fractional_part + COEFF_P5_B);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m512 factor = (((((COEFF_P5_A * fractional_part + COEFF_P5_B)
* fractional_part + COEFF_P5_C)
* fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......@@ -334,20 +333,18 @@ public:
}
inline InaVecAVX512KNL expLowAcc() const {
static const __m512 COEFF_LOG2E = _mm512_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m512 COEFF_A = _mm512_set1_ps(float(InaFastExp::CoeffA32()));
static const __m512 COEFF_B = _mm512_set1_ps(float(InaFastExp::CoeffB32()));
static const __m512 COEFF_P5_D = _mm512_set1_ps(float(InaFastExp::GetCoefficient3()[2]));
static const __m512 COEFF_P5_E = _mm512_set1_ps(float(InaFastExp::GetCoefficient3()[1]));
static const __m512 COEFF_P5_F = _mm512_set1_ps(float(InaFastExp::GetCoefficient3()[0]));
const __m512 COEFF_LOG2E = _mm512_set1_ps(float(InaFastExp::CoeffLog2E()));
const __m512 COEFF_A = _mm512_set1_ps(float(InaFastExp::CoeffA32()));
const __m512 COEFF_B = _mm512_set1_ps(float(InaFastExp::CoeffB32()));
const __m512 COEFF_P5_D = _mm512_set1_ps(float(InaFastExp::GetCoefficient3_2()));
const __m512 COEFF_P5_E = _mm512_set1_ps(float(InaFastExp::GetCoefficient3_1()));
const __m512 COEFF_P5_F = _mm512_set1_ps(float(InaFastExp::GetCoefficient3_0()));
__m512 x = vec * COEFF_LOG2E;
const __m512 fractional_part = x - InaVecAVX512KNL(x).floor().vec;
__m512 factor = COEFF_P5_D;
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m512 factor = ((COEFF_P5_D * fractional_part + COEFF_P5_E) * fractional_part + COEFF_P5_F);
x -= factor;
......
This diff is collapsed.
......@@ -247,32 +247,28 @@ public:
#ifdef __INTEL_COMPILER
return _mm_exp_pd(vec);
#else
static const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
static const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
static const __m128d COEFF_P5_X = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[8]));
static const __m128d COEFF_P5_Y = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[7]));
static const __m128d COEFF_P5_Z = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[6]));
static const __m128d COEFF_P5_A = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[5]));
static const __m128d COEFF_P5_B = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[4]));
static const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[3]));
static const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[2]));
static const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[1]));
static const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[0]));
const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
const __m128d COEFF_P5_X = _mm_set1_pd(double(InaFastExp::GetCoefficient9_8()));
const __m128d COEFF_P5_Y = _mm_set1_pd(double(InaFastExp::GetCoefficient9_7()));
const __m128d COEFF_P5_Z = _mm_set1_pd(double(InaFastExp::GetCoefficient9_6()));
const __m128d COEFF_P5_A = _mm_set1_pd(double(InaFastExp::GetCoefficient9_5()));
const __m128d COEFF_P5_B = _mm_set1_pd(double(InaFastExp::GetCoefficient9_4()));
const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient9_3()));
const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient9_2()));
const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient9_1()));
const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient9_0()));
__m128d x = vec * COEFF_LOG2E;
const __m128d fractional_part = x - InaVecSSE3(x).floor().vec;
__m128d factor = COEFF_P5_X;
factor = (factor * fractional_part + COEFF_P5_Y);
factor = (factor * fractional_part + COEFF_P5_Z);
factor = (factor * fractional_part + COEFF_P5_A);
factor = (factor * fractional_part + COEFF_P5_B);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m128d factor = ((((((((COEFF_P5_X * fractional_part + COEFF_P5_Y)
* fractional_part + COEFF_P5_Z) * fractional_part + COEFF_P5_A)
* fractional_part + COEFF_P5_B) * fractional_part + COEFF_P5_C)
* fractional_part + COEFF_P5_D) * fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......@@ -286,22 +282,21 @@ public:
}
inline InaVecSSE3 expLowAcc() const {
static const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
static const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
static const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[3]));
static const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[2]));
static const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[1]));
static const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[0]));
const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient4_3()));
const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient4_2()));
const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient4_1()));
const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient4_0()));
__m128d x = vec * COEFF_LOG2E;
const __m128d fractional_part = x - InaVecSSE3(x).floor().vec;
__m128d factor = COEFF_P5_C;
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m128d factor = (((COEFF_P5_C * fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......
......@@ -254,26 +254,25 @@ public:
#ifdef __INTEL_COMPILER
return _mm_exp_ps(vec);
#else
static const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
static const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
static const __m128 COEFF_P5_A = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[5]));
static const __m128 COEFF_P5_B = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[4]));
static const __m128 COEFF_P5_C = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[3]));
static const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[2]));
static const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[1]));
static const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[0]));
const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
const __m128 COEFF_P5_A = _mm_set1_ps(float(InaFastExp::GetCoefficient6_5()));
const __m128 COEFF_P5_B = _mm_set1_ps(float(InaFastExp::GetCoefficient6_4()));
const __m128 COEFF_P5_C = _mm_set1_ps(float(InaFastExp::GetCoefficient6_3()));
const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient6_2()));
const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient6_1()));
const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient6_0()));
__m128 x = vec * COEFF_LOG2E;
const __m128 fractional_part = x - InaVecSSE3(x).floor().vec;
__m128 factor = COEFF_P5_A;
factor = (factor * fractional_part + COEFF_P5_B);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m128 factor = (((((COEFF_P5_A * fractional_part + COEFF_P5_B)
* fractional_part + COEFF_P5_C)
* fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......@@ -284,20 +283,18 @@ public:
}
inline InaVecSSE3 expLowAcc() const {
static const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
static const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
static const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient3()[2]));
static const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient3()[1]));
static const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient3()[0]));
const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient3_2()));
const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient3_1()));
const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient3_0()));
__m128 x = vec * COEFF_LOG2E;
const __m128 fractional_part = x - InaVecSSE3(x).floor().vec;
__m128 factor = COEFF_P5_D;
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m128 factor = ((COEFF_P5_D * fractional_part + COEFF_P5_E) * fractional_part + COEFF_P5_F);
x -= factor;
......
......@@ -36,32 +36,28 @@ public:
#ifdef __INTEL_COMPILER
return _mm_exp_pd(Parent::vec);
#else
static const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
static const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
static const __m128d COEFF_P5_X = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[8]));
static const __m128d COEFF_P5_Y = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[7]));
static const __m128d COEFF_P5_Z = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[6]));
static const __m128d COEFF_P5_A = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[5]));
static const __m128d COEFF_P5_B = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[4]));
static const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[3]));
static const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[2]));
static const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[1]));
static const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[0]));
const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
const __m128d COEFF_P5_X = _mm_set1_pd(double(InaFastExp::GetCoefficient9_8()));
const __m128d COEFF_P5_Y = _mm_set1_pd(double(InaFastExp::GetCoefficient9_7()));
const __m128d COEFF_P5_Z = _mm_set1_pd(double(InaFastExp::GetCoefficient9_6()));
const __m128d COEFF_P5_A = _mm_set1_pd(double(InaFastExp::GetCoefficient9_5()));
const __m128d COEFF_P5_B = _mm_set1_pd(double(InaFastExp::GetCoefficient9_4()));
const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient9_3()));
const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient9_2()));
const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient9_1()));
const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient9_0()));
__m128d x = Parent::vec * COEFF_LOG2E;
const __m128d fractional_part = x - InaVecSSE41(x).floor().vec;
__m128d factor = COEFF_P5_X;
factor = (factor * fractional_part + COEFF_P5_Y);
factor = (factor * fractional_part + COEFF_P5_Z);
factor = (factor * fractional_part + COEFF_P5_A);
factor = (factor * fractional_part + COEFF_P5_B);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m128d factor = ((((((((COEFF_P5_X * fractional_part + COEFF_P5_Y)
* fractional_part + COEFF_P5_Z) * fractional_part + COEFF_P5_A)
* fractional_part + COEFF_P5_B) * fractional_part + COEFF_P5_C)
* fractional_part + COEFF_P5_D) * fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......@@ -75,22 +71,21 @@ public:
}
inline InaVecSSE41<double> expLowAcc() const {
static const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
static const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
static const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[3]));
static const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[2]));
static const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[1]));
static const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[0]));
const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient4_3()));
const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient4_2()));
const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient4_1()));
const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient4_0()));
__m128d x = Parent::vec * COEFF_LOG2E;
const __m128d fractional_part = x - InaVecSSE41(x).floor().vec;
__m128d factor = COEFF_P5_C;
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
__m128d factor = (((COEFF_P5_C * fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
......
......@@ -37,26 +37,25 @@ public:
#ifdef __INTEL_COMPILER
return _mm_exp_ps(Parent::vec);
#else
static const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
static const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
static const __m128 COEFF_P5_A = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[5]));
static const __m128 COEFF_P5_B = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[4]));
static const __m128 COEFF_P5_C = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[3]));
static const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[2]));
static const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[1]));
static const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[0]));
const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
const __m128 COEFF_P5_A = _mm_set1_ps(float(InaFastExp::GetCoefficient6_5()));
const __m128 COEFF_P5_B = _mm_set1_ps(float(InaFastExp::GetCoefficient6_4()));
const __m128 COEFF_P5_C = _mm_set1_ps(float(InaFastExp::GetCoefficient6_3()));
const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient6_2()));
const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient6_1()));
const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient6_0()));