Commit fe2adbd4 authored by Berenger Bramas

Update exponential - use constexpr instead of array for coef and write factor...

Update exponential - use constexpr functions instead of an array for the coefficients, and write the factor computation as a single expression
parent 8cc89dfa
...@@ -15,9 +15,8 @@ ...@@ -15,9 +15,8 @@
#include <iostream> #include <iostream>
int main(int /*argc*/, char* /*argv*/ []) { template <class RealType>
using RealType = double; void compareExpTime(){
const size_t NbOverLoop = 5; const size_t NbOverLoop = 5;
const size_t NbExp = 1000000; const size_t NbExp = 1000000;
...@@ -89,6 +88,14 @@ int main(int /*argc*/, char* /*argv*/ []) { ...@@ -89,6 +88,14 @@ int main(int /*argc*/, char* /*argv*/ []) {
std::cout << "Vector low acc " << InaVecBestType<RealType>::GetName() << " for " << NbExp * NbOverLoop std::cout << "Vector low acc " << InaVecBestType<RealType>::GetName() << " for " << NbExp * NbOverLoop
<< " exp took " << timer.getElapsed() << "s (" << timer.getElapsed()/double(NbExp * NbOverLoop) << "s per exp)\n"; << " exp took " << timer.getElapsed() << "s (" << timer.getElapsed()/double(NbExp * NbOverLoop) << "s per exp)\n";
} }
}
// Program entry point: prints a label, then runs compareExpTime<float>(),
// prints a second label, then runs compareExpTime<double>(), and returns 0.
// The command-line arguments are intentionally unused (their names are
// commented out).
// NOTE(review): the doubled `return 0;` and `}` tokens on the last two lines
// look like the old and new columns of a side-by-side diff rendered on one
// line rather than real duplicated statements - confirm against the repository.
int main(int /*argc*/, char* /*argv*/ []) {
std::cout << "In Float:" << std::endl;
compareExpTime<float>();
std::cout << "In Double:" << std::endl;
compareExpTime<double>();
return 0; return 0;
} }
...@@ -54,16 +54,11 @@ for n = 2:10 ...@@ -54,16 +54,11 @@ for n = 2:10
disp(x0) disp(x0)
end end
printf("constexpr static std::array<double,%d> GetCoefficient%d() {\n", n+1, n+1); for i = 1:(n+1)
printf(" return {{\n"); printf("inline constexpr static double GetCoefficient%d_%d() {\n", n+1, i-1);
for i = 1:(n+1) printf(" return %.20e;\n", -x0(i));
sep="," printf("}\n");
if i == n+1 then
sep=""
end
printf(" %.20e%s\n", -x0(i), sep);
end end
printf(" }};\n"); printf("\n");
printf("}\n\n");
end end
...@@ -258,32 +258,28 @@ public: ...@@ -258,32 +258,28 @@ public:
#ifdef __INTEL_COMPILER #ifdef __INTEL_COMPILER
return _mm256_exp_pd(vec); return _mm256_exp_pd(vec);
#else #else
static const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E())); const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64())); const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64()));
static const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64())); const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64()));
static const __m256d COEFF_P5_X = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[8])); const __m256d COEFF_P5_X = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_8()));
static const __m256d COEFF_P5_Y = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[7])); const __m256d COEFF_P5_Y = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_7()));
static const __m256d COEFF_P5_Z = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[6])); const __m256d COEFF_P5_Z = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_6()));
static const __m256d COEFF_P5_A = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[5])); const __m256d COEFF_P5_A = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_5()));
static const __m256d COEFF_P5_B = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[4])); const __m256d COEFF_P5_B = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_4()));
static const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[3])); const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_3()));
static const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[2])); const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_2()));
static const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[1])); const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_1()));
static const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient9()[0])); const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient9_0()));
__m256d x = vec * COEFF_LOG2E; __m256d x = vec * COEFF_LOG2E;
const __m256d fractional_part = x - InaVecAVX(x).floor().vec; const __m256d fractional_part = x - InaVecAVX(x).floor().vec;
__m256d factor = COEFF_P5_X; __m256d factor = ((((((((COEFF_P5_X * fractional_part + COEFF_P5_Y)
factor = (factor * fractional_part + COEFF_P5_Y); * fractional_part + COEFF_P5_Z) * fractional_part + COEFF_P5_A)
factor = (factor * fractional_part + COEFF_P5_Z); * fractional_part + COEFF_P5_B) * fractional_part + COEFF_P5_C)
factor = (factor * fractional_part + COEFF_P5_A); * fractional_part + COEFF_P5_D) * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_B); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
...@@ -303,22 +299,21 @@ public: ...@@ -303,22 +299,21 @@ public:
} }
inline InaVecAVX expLowAcc() const { inline InaVecAVX expLowAcc() const {
static const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E())); const __m256d COEFF_LOG2E = _mm256_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64())); const __m256d COEFF_A = _mm256_set1_pd(double(InaFastExp::CoeffA64()));
static const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64())); const __m256d COEFF_B = _mm256_set1_pd(double(InaFastExp::CoeffB64()));
static const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient4()[3])); const __m256d COEFF_P5_C = _mm256_set1_pd(double(InaFastExp::GetCoefficient4_3()));
static const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient4()[2])); const __m256d COEFF_P5_D = _mm256_set1_pd(double(InaFastExp::GetCoefficient4_2()));
static const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient4()[1])); const __m256d COEFF_P5_E = _mm256_set1_pd(double(InaFastExp::GetCoefficient4_1()));
static const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient4()[0])); const __m256d COEFF_P5_F = _mm256_set1_pd(double(InaFastExp::GetCoefficient4_0()));
__m256d x = vec * COEFF_LOG2E; __m256d x = vec * COEFF_LOG2E;
const __m256d fractional_part = x - InaVecAVX(x).floor().vec; const __m256d fractional_part = x - InaVecAVX(x).floor().vec;
__m256d factor = COEFF_P5_C; __m256d factor = (((COEFF_P5_C * fractional_part + COEFF_P5_D)
factor = (factor * fractional_part + COEFF_P5_D); * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_E); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
......
...@@ -270,26 +270,25 @@ public: ...@@ -270,26 +270,25 @@ public:
#ifdef __INTEL_COMPILER #ifdef __INTEL_COMPILER
return _mm256_exp_ps(vec); return _mm256_exp_ps(vec);
#else #else
static const __m256 COEFF_LOG2E = _mm256_set1_ps(float(InaFastExp::CoeffLog2E())); const __m256 COEFF_LOG2E = _mm256_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m256 COEFF_A = _mm256_set1_ps(float(InaFastExp::CoeffA32())); const __m256 COEFF_A = _mm256_set1_ps(float(InaFastExp::CoeffA32()));
static const __m256 COEFF_B = _mm256_set1_ps(float(InaFastExp::CoeffB32())); const __m256 COEFF_B = _mm256_set1_ps(float(InaFastExp::CoeffB32()));
static const __m256 COEFF_P5_A = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[5])); const __m256 COEFF_P5_A = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_5()));
static const __m256 COEFF_P5_B = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[4])); const __m256 COEFF_P5_B = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_4()));
static const __m256 COEFF_P5_C = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[3])); const __m256 COEFF_P5_C = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_3()));
static const __m256 COEFF_P5_D = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[2])); const __m256 COEFF_P5_D = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_2()));
static const __m256 COEFF_P5_E = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[1])); const __m256 COEFF_P5_E = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_1()));
static const __m256 COEFF_P5_F = _mm256_set1_ps(float(InaFastExp::GetCoefficient6()[0])); const __m256 COEFF_P5_F = _mm256_set1_ps(float(InaFastExp::GetCoefficient6_0()));
__m256 x = vec * COEFF_LOG2E; __m256 x = vec * COEFF_LOG2E;
const __m256 fractional_part = x - InaVecAVX(x).floor().vec; const __m256 fractional_part = x - InaVecAVX(x).floor().vec;
__m256 factor = COEFF_P5_A; __m256 factor = (((((COEFF_P5_A * fractional_part + COEFF_P5_B)
factor = (factor * fractional_part + COEFF_P5_B); * fractional_part + COEFF_P5_C)
factor = (factor * fractional_part + COEFF_P5_C); * fractional_part + COEFF_P5_D)
factor = (factor * fractional_part + COEFF_P5_D); * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_E); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
...@@ -300,20 +299,18 @@ public: ...@@ -300,20 +299,18 @@ public:
} }
inline InaVecAVX expLowAcc() const { inline InaVecAVX expLowAcc() const {
static const __m256 COEFF_LOG2E = _mm256_set1_ps(float(InaFastExp::CoeffLog2E())); const __m256 COEFF_LOG2E = _mm256_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m256 COEFF_A = _mm256_set1_ps(float(InaFastExp::CoeffA32())); const __m256 COEFF_A = _mm256_set1_ps(float(InaFastExp::CoeffA32()));
static const __m256 COEFF_B = _mm256_set1_ps(float(InaFastExp::CoeffB32())); const __m256 COEFF_B = _mm256_set1_ps(float(InaFastExp::CoeffB32()));
static const __m256 COEFF_P5_D = _mm256_set1_ps(float(InaFastExp::GetCoefficient3()[2])); const __m256 COEFF_P5_D = _mm256_set1_ps(float(InaFastExp::GetCoefficient3_2()));
static const __m256 COEFF_P5_E = _mm256_set1_ps(float(InaFastExp::GetCoefficient3()[1])); const __m256 COEFF_P5_E = _mm256_set1_ps(float(InaFastExp::GetCoefficient3_1()));
static const __m256 COEFF_P5_F = _mm256_set1_ps(float(InaFastExp::GetCoefficient3()[0])); const __m256 COEFF_P5_F = _mm256_set1_ps(float(InaFastExp::GetCoefficient3_0()));
__m256 x = vec * COEFF_LOG2E; __m256 x = vec * COEFF_LOG2E;
const __m256 fractional_part = x - InaVecAVX(x).floor().vec; const __m256 fractional_part = x - InaVecAVX(x).floor().vec;
__m256 factor = COEFF_P5_D; __m256 factor = ((COEFF_P5_D * fractional_part + COEFF_P5_E) * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
......
...@@ -280,34 +280,28 @@ public: ...@@ -280,34 +280,28 @@ public:
} }
inline InaVecAVX512KNL exp() const { inline InaVecAVX512KNL exp() const {
static const __m512d COEFF_LOG2E = _mm512_set1_pd(double(InaFastExp::CoeffLog2E())); const __m512d COEFF_LOG2E = _mm512_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m512d COEFF_A = _mm512_set1_pd(double(InaFastExp::CoeffA64())); const __m512d COEFF_A = _mm512_set1_pd(double(InaFastExp::CoeffA64()));
static const __m512d COEFF_B = _mm512_set1_pd(double(InaFastExp::CoeffB64())); const __m512d COEFF_B = _mm512_set1_pd(double(InaFastExp::CoeffB64()));
static const __m512d COEFF_P5_V = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[9])); const __m512d COEFF_P5_X = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_8()));
static const __m512d COEFF_P5_X = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[8])); const __m512d COEFF_P5_Y = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_7()));
static const __m512d COEFF_P5_Y = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[7])); const __m512d COEFF_P5_Z = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_6()));
static const __m512d COEFF_P5_Z = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[6])); const __m512d COEFF_P5_A = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_5()));
static const __m512d COEFF_P5_A = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[5])); const __m512d COEFF_P5_B = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_4()));
static const __m512d COEFF_P5_B = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[4])); const __m512d COEFF_P5_C = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_3()));
static const __m512d COEFF_P5_C = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[3])); const __m512d COEFF_P5_D = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_2()));
static const __m512d COEFF_P5_D = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[2])); const __m512d COEFF_P5_E = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_1()));
static const __m512d COEFF_P5_E = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[1])); const __m512d COEFF_P5_F = _mm512_set1_pd(double(InaFastExp::GetCoefficient9_0()));
static const __m512d COEFF_P5_F = _mm512_set1_pd(double(InaFastExp::GetCoefficient9()[0]));
__m512d x = vec * COEFF_LOG2E; __m512d x = vec * COEFF_LOG2E;
const __m512d fractional_part = x - InaVecAVX512KNL(x).floor().vec; const __m512d fractional_part = x - InaVecAVX512KNL(x).floor().vec;
__m512d factor = COEFF_P5_V; __m512d factor = ((((((((COEFF_P5_X * fractional_part + COEFF_P5_Y)
factor = (factor * fractional_part + COEFF_P5_X); * fractional_part + COEFF_P5_Z) * fractional_part + COEFF_P5_A)
factor = (factor * fractional_part + COEFF_P5_Y); * fractional_part + COEFF_P5_B) * fractional_part + COEFF_P5_C)
factor = (factor * fractional_part + COEFF_P5_Z); * fractional_part + COEFF_P5_D) * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_A); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_B);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
...@@ -325,22 +319,21 @@ public: ...@@ -325,22 +319,21 @@ public:
} }
inline InaVecAVX512KNL expLowAcc() const { inline InaVecAVX512KNL expLowAcc() const {
static const __m512d COEFF_LOG2E = _mm512_set1_pd(double(InaFastExp::CoeffLog2E())); const __m512d COEFF_LOG2E = _mm512_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m512d COEFF_A = _mm512_set1_pd(double(InaFastExp::CoeffA64())); const __m512d COEFF_A = _mm512_set1_pd(double(InaFastExp::CoeffA64()));
static const __m512d COEFF_B = _mm512_set1_pd(double(InaFastExp::CoeffB64())); const __m512d COEFF_B = _mm512_set1_pd(double(InaFastExp::CoeffB64()));
static const __m512d COEFF_P5_C = _mm512_set1_pd(double(InaFastExp::GetCoefficient4()[3])); const __m512d COEFF_P5_C = _mm512_set1_pd(double(InaFastExp::GetCoefficient4_3()));
static const __m512d COEFF_P5_D = _mm512_set1_pd(double(InaFastExp::GetCoefficient4()[2])); const __m512d COEFF_P5_D = _mm512_set1_pd(double(InaFastExp::GetCoefficient4_2()));
static const __m512d COEFF_P5_E = _mm512_set1_pd(double(InaFastExp::GetCoefficient4()[1])); const __m512d COEFF_P5_E = _mm512_set1_pd(double(InaFastExp::GetCoefficient4_1()));
static const __m512d COEFF_P5_F = _mm512_set1_pd(double(InaFastExp::GetCoefficient4()[0])); const __m512d COEFF_P5_F = _mm512_set1_pd(double(InaFastExp::GetCoefficient4_0()));
__m512d x = vec * COEFF_LOG2E; __m512d x = vec * COEFF_LOG2E;
const __m512d fractional_part = x - InaVecAVX512KNL(x).floor().vec; const __m512d fractional_part = x - InaVecAVX512KNL(x).floor().vec;
__m512d factor = COEFF_P5_C; __m512d factor = (((COEFF_P5_C * fractional_part + COEFF_P5_D)
factor = (factor * fractional_part + COEFF_P5_D); * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_E); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
......
...@@ -305,26 +305,25 @@ public: ...@@ -305,26 +305,25 @@ public:
} }
inline InaVecAVX512KNL exp() const { inline InaVecAVX512KNL exp() const {
static const __m512 COEFF_LOG2E = _mm512_set1_ps(float(InaFastExp::CoeffLog2E())); const __m512 COEFF_LOG2E = _mm512_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m512 COEFF_A = _mm512_set1_ps(float(InaFastExp::CoeffA32())); const __m512 COEFF_A = _mm512_set1_ps(float(InaFastExp::CoeffA32()));
static const __m512 COEFF_B = _mm512_set1_ps(float(InaFastExp::CoeffB32())); const __m512 COEFF_B = _mm512_set1_ps(float(InaFastExp::CoeffB32()));
static const __m512 COEFF_P5_A = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[5])); const __m512 COEFF_P5_A = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_5()));
static const __m512 COEFF_P5_B = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[4])); const __m512 COEFF_P5_B = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_4()));
static const __m512 COEFF_P5_C = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[3])); const __m512 COEFF_P5_C = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_3()));
static const __m512 COEFF_P5_D = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[2])); const __m512 COEFF_P5_D = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_2()));
static const __m512 COEFF_P5_E = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[1])); const __m512 COEFF_P5_E = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_1()));
static const __m512 COEFF_P5_F = _mm512_set1_ps(float(InaFastExp::GetCoefficient6()[0])); const __m512 COEFF_P5_F = _mm512_set1_ps(float(InaFastExp::GetCoefficient6_0()));
__m512 x = vec * COEFF_LOG2E; __m512 x = vec * COEFF_LOG2E;
const __m512 fractional_part = x - InaVecAVX512KNL(x).floor().vec; const __m512 fractional_part = x - InaVecAVX512KNL(x).floor().vec;
__m512 factor = COEFF_P5_A; __m512 factor = (((((COEFF_P5_A * fractional_part + COEFF_P5_B)
factor = (factor * fractional_part + COEFF_P5_B); * fractional_part + COEFF_P5_C)
factor = (factor * fractional_part + COEFF_P5_C); * fractional_part + COEFF_P5_D)
factor = (factor * fractional_part + COEFF_P5_D); * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_E); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
...@@ -334,20 +333,18 @@ public: ...@@ -334,20 +333,18 @@ public:
} }
inline InaVecAVX512KNL expLowAcc() const { inline InaVecAVX512KNL expLowAcc() const {
static const __m512 COEFF_LOG2E = _mm512_set1_ps(float(InaFastExp::CoeffLog2E())); const __m512 COEFF_LOG2E = _mm512_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m512 COEFF_A = _mm512_set1_ps(float(InaFastExp::CoeffA32())); const __m512 COEFF_A = _mm512_set1_ps(float(InaFastExp::CoeffA32()));
static const __m512 COEFF_B = _mm512_set1_ps(float(InaFastExp::CoeffB32())); const __m512 COEFF_B = _mm512_set1_ps(float(InaFastExp::CoeffB32()));
static const __m512 COEFF_P5_D = _mm512_set1_ps(float(InaFastExp::GetCoefficient3()[2])); const __m512 COEFF_P5_D = _mm512_set1_ps(float(InaFastExp::GetCoefficient3_2()));
static const __m512 COEFF_P5_E = _mm512_set1_ps(float(InaFastExp::GetCoefficient3()[1])); const __m512 COEFF_P5_E = _mm512_set1_ps(float(InaFastExp::GetCoefficient3_1()));
static const __m512 COEFF_P5_F = _mm512_set1_ps(float(InaFastExp::GetCoefficient3()[0])); const __m512 COEFF_P5_F = _mm512_set1_ps(float(InaFastExp::GetCoefficient3_0()));
__m512 x = vec * COEFF_LOG2E; __m512 x = vec * COEFF_LOG2E;
const __m512 fractional_part = x - InaVecAVX512KNL(x).floor().vec; const __m512 fractional_part = x - InaVecAVX512KNL(x).floor().vec;
__m512 factor = COEFF_P5_D; __m512 factor = ((COEFF_P5_D * fractional_part + COEFF_P5_E) * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
......
This diff is collapsed.
...@@ -247,32 +247,28 @@ public: ...@@ -247,32 +247,28 @@ public:
#ifdef __INTEL_COMPILER #ifdef __INTEL_COMPILER
return _mm_exp_pd(vec); return _mm_exp_pd(vec);
#else #else
static const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E())); const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64())); const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
static const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64())); const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
static const __m128d COEFF_P5_X = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[8])); const __m128d COEFF_P5_X = _mm_set1_pd(double(InaFastExp::GetCoefficient9_8()));
static const __m128d COEFF_P5_Y = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[7])); const __m128d COEFF_P5_Y = _mm_set1_pd(double(InaFastExp::GetCoefficient9_7()));
static const __m128d COEFF_P5_Z = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[6])); const __m128d COEFF_P5_Z = _mm_set1_pd(double(InaFastExp::GetCoefficient9_6()));
static const __m128d COEFF_P5_A = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[5])); const __m128d COEFF_P5_A = _mm_set1_pd(double(InaFastExp::GetCoefficient9_5()));
static const __m128d COEFF_P5_B = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[4])); const __m128d COEFF_P5_B = _mm_set1_pd(double(InaFastExp::GetCoefficient9_4()));
static const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[3])); const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient9_3()));
static const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[2])); const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient9_2()));
static const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[1])); const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient9_1()));
static const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient9()[0])); const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient9_0()));
__m128d x = vec * COEFF_LOG2E; __m128d x = vec * COEFF_LOG2E;
const __m128d fractional_part = x - InaVecSSE3(x).floor().vec; const __m128d fractional_part = x - InaVecSSE3(x).floor().vec;
__m128d factor = COEFF_P5_X; __m128d factor = ((((((((COEFF_P5_X * fractional_part + COEFF_P5_Y)
factor = (factor * fractional_part + COEFF_P5_Y); * fractional_part + COEFF_P5_Z) * fractional_part + COEFF_P5_A)
factor = (factor * fractional_part + COEFF_P5_Z); * fractional_part + COEFF_P5_B) * fractional_part + COEFF_P5_C)
factor = (factor * fractional_part + COEFF_P5_A); * fractional_part + COEFF_P5_D) * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_B); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_C);
factor = (factor * fractional_part + COEFF_P5_D);
factor = (factor * fractional_part + COEFF_P5_E);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
...@@ -286,22 +282,21 @@ public: ...@@ -286,22 +282,21 @@ public:
} }
inline InaVecSSE3 expLowAcc() const { inline InaVecSSE3 expLowAcc() const {
static const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E())); const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
static const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64())); const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
static const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64())); const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
static const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[3])); const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient4_3()));
static const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[2])); const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient4_2()));
static const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[1])); const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient4_1()));
static const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient4()[0])); const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient4_0()));
__m128d x = vec * COEFF_LOG2E; __m128d x = vec * COEFF_LOG2E;
const __m128d fractional_part = x - InaVecSSE3(x).floor().vec; const __m128d fractional_part = x - InaVecSSE3(x).floor().vec;
__m128d factor = COEFF_P5_C; __m128d factor = (((COEFF_P5_C * fractional_part + COEFF_P5_D)
factor = (factor * fractional_part + COEFF_P5_D); * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_E); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;
......
...@@ -254,26 +254,25 @@ public: ...@@ -254,26 +254,25 @@ public:
#ifdef __INTEL_COMPILER #ifdef __INTEL_COMPILER
return _mm_exp_ps(vec); return _mm_exp_ps(vec);
#else #else
static const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E())); const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
static const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32())); const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
static const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32())); const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
static const __m128 COEFF_P5_A = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[5])); const __m128 COEFF_P5_A = _mm_set1_ps(float(InaFastExp::GetCoefficient6_5()));
static const __m128 COEFF_P5_B = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[4])); const __m128 COEFF_P5_B = _mm_set1_ps(float(InaFastExp::GetCoefficient6_4()));
static const __m128 COEFF_P5_C = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[3])); const __m128 COEFF_P5_C = _mm_set1_ps(float(InaFastExp::GetCoefficient6_3()));
static const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[2])); const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient6_2()));
static const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[1])); const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient6_1()));
static const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient6()[0])); const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient6_0()));
__m128 x = vec * COEFF_LOG2E; __m128 x = vec * COEFF_LOG2E;
const __m128 fractional_part = x - InaVecSSE3(x).floor().vec; const __m128 fractional_part = x - InaVecSSE3(x).floor().vec;
__m128 factor = COEFF_P5_A; __m128 factor = (((((COEFF_P5_A * fractional_part + COEFF_P5_B)
factor = (factor * fractional_part + COEFF_P5_B); * fractional_part + COEFF_P5_C)
factor = (factor * fractional_part + COEFF_P5_C); * fractional_part + COEFF_P5_D)
factor = (factor * fractional_part + COEFF_P5_D); * fractional_part + COEFF_P5_E)
factor = (factor * fractional_part + COEFF_P5_E); * fractional_part + COEFF_P5_F);
factor = (factor * fractional_part + COEFF_P5_F);
x -= factor; x -= factor;