Commit deef67ed authored by Berenger Bramas

Core update -- use zero upper only in horizontal operations, but not in...

Core update -- use zero upper (_mm256_zeroupper) only in the horizontal operations, but not in exp-avx, because there it triggers a bug in GCC (some registers are not saved and are therefore overwritten)
parent 5e86182f
......@@ -234,16 +234,18 @@ public:
// Horizontal operation
/**
 * Horizontal sum of the 256-bit double vector `vec`.
 *
 * The register is split into its upper and lower 128-bit halves, the halves
 * are added lane by lane, and the two remaining lanes are added together.
 *
 * @return the sum of all lanes of `vec` as a scalar double.
 */
inline double horizontalSum() const {
    // Read both 128-bit halves of vec BEFORE clearing the upper YMM lanes,
    // so that no 256-bit value is needed once _mm256_zeroupper() has run.
    const __m128d valupper = _mm256_extractf128_pd(vec, 1);
    const __m128d rest = _mm256_castpd256_pd128(vec);
    // Only 128-bit operations follow from here on.
    // NOTE(review): presumably issued to avoid AVX/SSE transition penalties - confirm.
    _mm256_zeroupper();
    // Lane-wise add of the two halves: {v0 + v2, v1 + v3}.
    const __m128d valval = _mm_add_pd(valupper, rest);
    // Swap the two lanes and add, so lane 0 holds the complete sum.
    const __m128d res = _mm_add_pd(_mm_permute_pd(valval, 1), valval);
    return _mm_cvtsd_f64(res);
}
/**
 * Horizontal product of the 256-bit double vector `vec`.
 *
 * The register is split into its upper and lower 128-bit halves, the halves
 * are multiplied lane by lane, and the two remaining lanes are multiplied
 * together.
 *
 * @return the product of all lanes of `vec` as a scalar double.
 */
inline double horizontalMul() const {
    // Read both 128-bit halves of vec BEFORE clearing the upper YMM lanes,
    // so that no 256-bit value is needed once _mm256_zeroupper() has run.
    const __m128d valupper = _mm256_extractf128_pd(vec, 1);
    const __m128d rest = _mm256_castpd256_pd128(vec);
    // Only 128-bit operations follow from here on.
    // NOTE(review): presumably issued to avoid AVX/SSE transition penalties - confirm.
    _mm256_zeroupper();
    // Lane-wise multiply of the two halves: {v0 * v2, v1 * v3}.
    const __m128d valval = _mm_mul_pd(valupper, rest);
    // Swap the two lanes and multiply, so lane 0 holds the complete product.
    const __m128d res = _mm_mul_pd(_mm_permute_pd(valval, 1), valval);
    return _mm_cvtsd_f64(res);
}
......@@ -284,8 +286,8 @@ public:
x = (COEFF_A * x + COEFF_B);
__m128d valupper = _mm256_extractf128_pd(x, 1);
_mm256_zeroupper();
__m128d vallower = _mm256_castpd256_pd128(x);
// Removed because it makes GCC bugging:_mm256_zeroupper();
alignas(64) long long int allvalint[VecLength] = { _mm_cvtsd_si64(vallower),
_mm_cvtsd_si64(_mm_shuffle_pd(vallower, vallower, 1)),
......@@ -318,8 +320,8 @@ public:
x = (COEFF_A * x + COEFF_B);
__m128d valupper = _mm256_extractf128_pd(x, 1);
_mm256_zeroupper();
__m128d vallower = _mm256_castpd256_pd128(x);
// Removed because it makes GCC bugging:_mm256_zeroupper();
alignas(64) long long int allvalint[VecLength] = { _mm_cvtsd_si64(vallower),
_mm_cvtsd_si64(_mm_shuffle_pd(vallower, vallower, 1)),
......
......@@ -242,9 +242,10 @@ public:
// Horizontal operation
inline float horizontalSum() const {
const __m128 valupper = _mm256_extractf128_ps(vec, 1);
_mm256_zeroupper(); // Could be moved after the _mm256_extractf128_ps
const __m128 rest = _mm256_extractf128_ps(vec, 0);
_mm256_zeroupper();
const __m128 valval = _mm_add_ps(valupper,
_mm256_extractf128_ps(vec, 0));
rest);
__m128 valsum = _mm_add_ps(_mm_permute_ps(valval, 0x1B), valval);
__m128 res = _mm_add_ps(_mm_permute_ps(valsum, 0xB1), valsum);
return _mm_cvtss_f32(res);
......@@ -252,9 +253,10 @@ public:
inline float horizontalMul() const {
const __m128 valupper = _mm256_extractf128_ps(vec, 1);
_mm256_zeroupper(); // Could be moved after the _mm256_extractf128_ps
const __m128 rest = _mm256_extractf128_ps(vec, 0);
_mm256_zeroupper();
const __m128 valval = _mm_mul_ps(valupper,
_mm256_extractf128_ps(vec, 0));
rest);
__m128 valsum = _mm_mul_ps(_mm_permute_ps(valval, 0x1B), valval);
__m128 res = _mm_mul_ps(_mm_permute_ps(valsum, 0xB1), valsum);
return _mm_cvtss_f32(res);
......
......@@ -252,8 +252,9 @@ public:
__m256d val = low + high;
const __m128d valupper = _mm256_extractf128_pd(val, 1);
_mm256_zeroupper(); // Could be moved after the _mm256_castpd256_pd128
const __m128d valval = _mm_add_pd(valupper, _mm256_castpd256_pd128(val));
const __m128d rest = _mm256_castpd256_pd128(val);
// Not in 512 _mm256_zeroupper();
const __m128d valval = _mm_add_pd(valupper, rest);
const __m128d res = _mm_add_pd(_mm_permute_pd(valval, 1), valval);
return _mm_cvtsd_f64(res);
#endif
......@@ -268,8 +269,9 @@ public:
__m256d val = low * high;
const __m128d valupper = _mm256_extractf128_pd(val, 1);
_mm256_zeroupper(); // Could be moved after the _mm256_castpd256_pd128
const __m128d valval = _mm_mul_pd(valupper, _mm256_castpd256_pd128(val));
const __m128d rest = _mm256_castpd256_pd128(val);
// Not in 512 _mm256_zeroupper();
const __m128d valval = _mm_mul_pd(valupper, rest);
const __m128d res = _mm_mul_pd(_mm_permute_pd(valval, 1), valval);
return _mm_cvtsd_f64(res);
#endif
......
......@@ -273,9 +273,10 @@ public:
__m256 val = low + high;
const __m128 valupper = _mm256_extractf128_ps(val, 1);
_mm256_zeroupper(); // Could be moved after the _mm256_extractf128_ps
const __m128 rest = _mm256_extractf128_ps(val, 0);
// Not in 512 _mm256_zeroupper();
const __m128 valval = _mm_add_ps(valupper,
_mm256_extractf128_ps(val, 0));
rest);
__m128 valsum = _mm_add_ps(_mm_permute_ps(valval, 0x1B), valval);
__m128 res = _mm_add_ps(_mm_permute_ps(valsum, 0xB1), valsum);
return _mm_cvtss_f32(res);
......@@ -291,9 +292,10 @@ public:
__m256 val = low * high;
const __m128 valupper = _mm256_extractf128_ps(val, 1);
_mm256_zeroupper(); // Could be moved after the _mm256_extractf128_ps
const __m128 rest = _mm256_extractf128_ps(val, 0);
// Not in 512 _mm256_zeroupper();
const __m128 valval = _mm_mul_ps(valupper,
_mm256_extractf128_ps(val, 0));
rest);
__m128 valsum = _mm_mul_ps(_mm_permute_ps(valval, 0x1B), valval);
__m128 res = _mm_mul_ps(_mm_permute_ps(valsum, 0xB1), valsum);
return _mm_cvtss_f32(res);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment