Commit a3f28416 authored by Bramas, Berenger (bbramas)'s avatar Bramas, Berenger (bbramas)

Merge branch 'altivec-power8' into 'master'

Add AltiVec support (VMX, IBM POWER8)

Targeting the OpenPOWER architecture, we start porting the inastemp SIMD classes to AltiVec.

It is currently a work in progress.


* [x]   Manage load/store (especially not aligned)
* [x]   Manage little/big endian (in the load/store and the sign of values)

* [x]  Build with XL compiler

See merge request !3
parents 53dbe324 b10cb45b
......@@ -72,32 +72,52 @@ set(INASTEMP_VERSION "${INASTEMP_MAJOR_VERSION}.${INASTEMP_MINOR_VERSION}.${INA
#===========================================================================
# Options
#===========================================================================
# Per-architecture vector-type selection.
# ppc64le only provides the ALTIVEC type; every other processor goes
# through the x86 SSE/AVX detection path.
# NOTE: GetCpuInfos relies on x86 cpuid, so it is only invoked on x86 —
# the scraped diff showed stale pre-branch copies of these settings,
# removed here because both branches fully re-set them.
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc64le")
    if($ENV{VERBOSE})
        MESSAGE(STATUS "Main -- compile for ppc64le architecture")
    endif()
    # Ask compiler capacities
    include(GetCompilerInfos)
    GetCompilerInfos()
    # All types from worse to best (ADD-NEW-HERE)
    set(ALL_TYPES "ALTIVEC")
    # No cpuid-style rules on POWER: compiler support alone decides
    set(ALTIVEC_CPU_RULES "")
    set(INASTEMP_USE_SCALAR ON)
    set(INASTEMP_CXX_FLAGS "-std=c++11")
else()
    if($ENV{VERBOSE})
        MESSAGE(STATUS "Main -- compile for x86 architecture")
    endif()
    # Ask CPU capacities
    include(GetCpuInfos)
    GetCpuInfos()
    # Ask compiler capacities
    include(GetCompilerInfos)
    GetCompilerInfos()
    # All types from worse to best (ADD-NEW-HERE)
    set(ALL_TYPES "SSE3;SSSE3;SSE41;SSE42;AVX;AVX2;AVX512COMMON;AVX512KNL;AVX512SKL")
    set(INASTEMP_USE_SCALAR ON)
    set(INASTEMP_CXX_FLAGS "-std=c++11")
    # Set custom cpu <=> vec rules (maybe ADD-NEW-HERE if needed)
    set(AVX512COMMON_CPU_RULES "AVX512F;AVX512ER")
    set(AVX512KNL_CPU_RULES "AVX512F;AVX512ER;AVX512PF")
    set(AVX512SKL_CPU_RULES "AVX512F;AVX512ER;AVX512VL;AVX512BW;AVX512DQ")
    # Dependencies between types (maybe ADD-NEW-HERE if needed)
    set(SSSE3_DEP "SSE3")
    set(SSE41_DEP "SSSE3")
    set(SSE42_DEP "SSE41")
    set(AVX2_DEP "AVX")
    set(AVX512KNL_DEP "AVX512COMMON")
    set(AVX512SKL_DEP "AVX512COMMON")
endif()
# Enforce rules
set(ALL_TYPES_REVERSE ${ALL_TYPES})
......@@ -177,6 +197,11 @@ if(NOT INASTEMP_AS_SUBPROJECT)
# Strict warning set for the library's own build (not exported to users)
set(INASTEMP_CXX_FLAGS "${INASTEMP_CXX_FLAGS} -fpic -Wextra -Wnon-virtual-dtor -Wshadow -Wpointer-arith -Wcast-qual -Wconversion -Wall -Wno-sign-conversion -pedantic -Woverloaded-virtual -Wpointer-arith -Wcast-qual -Wconversion -Wno-error")
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc64le")
# Built-in (AltiVec) functions do not correctly mark their
# variables/parameters as used, so silence the resulting spurious warnings
set(INASTEMP_CXX_FLAGS "${INASTEMP_CXX_FLAGS} -Wno-unused-but-set-variable -Wno-unused-but-set-parameter")
endif()
# On 64-bit hosts force 64-bit code generation
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(INASTEMP_CXX_FLAGS "${INASTEMP_CXX_FLAGS} -m64")
endif()
......
// Compile-time capability probe: this translation unit only needs to
// COMPILE to prove the toolchain supports AltiVec vector types and the
// vec_add / vec_abs / vec_rsqrt intrinsics for both float and double.
// NOTE(review): the vectors are deliberately left uninitialized — the
// result is never inspected, only the compilation outcome matters.
#include <altivec.h>
int main(){
{
// double-precision lanes (__vector double needs VSX support)
__vector double res0d;
__vector double res1d;
__vector double res2d = vec_add(res0d, res1d);
res2d = vec_abs (res0d);
res2d = vec_rsqrt (res0d);
}
{
// single-precision lanes
__vector float res0;
__vector float res1;
__vector float res2 = vec_add(res0, res1);
res2 = vec_abs (res0);
res2 = vec_rsqrt (res0);
}
return 0;
}
......@@ -44,58 +44,77 @@ endmacro(GetCompilerInfosCore)
###########################################################################################
macro(GetCompilerInfos)
SET( ARCH_NATIVE_FLAG "-march=native" CACHE STRING "Additional flag for the compiler capacities detection" )
# (ADD-NEW-HERE for each compilers)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
if(APPLE) # INTEL APPLE
set(SSE3_FLAGS "-msse3 ${ARCH_NATIVE_FLAG}")
set(SSSE3_FLAGS "-mssse3 ${ARCH_NATIVE_FLAG}")
set(SSE41_FLAGS "-msse4 -msse4.1 ${ARCH_NATIVE_FLAG}")
set(SSE42_FLAGS "-msse4 -msse4.2 ${ARCH_NATIVE_FLAG}")
set(AVX_FLAGS "-mAVX ${ARCH_NATIVE_FLAG}")
set(AVX2_FLAGS "-march=core-avx2 ${ARCH_NATIVE_FLAG}")
set(AVX512COMMON_FLAGS "-xCOMMON-AVX512 ${ARCH_NATIVE_FLAG}")
set(AVX512KNL_FLAGS "-xCOMMON-AVX512 -xMIC-AVX512 ${ARCH_NATIVE_FLAG}")
set(AVX512SKL_FLAGS "-xCOMMON-AVX512 -xCORE-AVX512 ${ARCH_NATIVE_FLAG}")
else() # INTEL LINUX
set(SSE3_FLAGS "-msse3 ${ARCH_NATIVE_FLAG}")
set(SSSE3_FLAGS "-mssse3 ${ARCH_NATIVE_FLAG}")
set(SSE41_FLAGS "-msse4 -msse4.1 ${ARCH_NATIVE_FLAG}")
set(SSE42_FLAGS "-msse4 -msse4.2 ${ARCH_NATIVE_FLAG}")
set(AVX_FLAGS "-march=core-avx-i ${ARCH_NATIVE_FLAG}")
set(AVX2_FLAGS "-march=core-avx2 ${ARCH_NATIVE_FLAG}")
set(AVX512COMMON_FLAGS "-xCOMMON-AVX512 ${ARCH_NATIVE_FLAG}")
set(AVX512KNL_FLAGS "-xCOMMON-AVX512 -xMIC-AVX512 ${ARCH_NATIVE_FLAG}")
set(AVX512SKL_FLAGS "-xCOMMON-AVX512 -xCORE-AVX512 ${ARCH_NATIVE_FLAG}")
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc64le")
# POWERPC
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
SET( ARCH_NATIVE_FLAG "-mcpu=pwr8" CACHE STRING "Additional flag for the compiler capacities detection such as -mcpu=power8 for example" )
set(ALTIVEC_FLAGS "-faltivec ${ARCH_NATIVE_FLAG}")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL" OR CMAKE_CXX_COMPILER_ID STREQUAL "VisualAge" OR CMAKE_CXX_COMPILER_ID STREQUAL "zOS")
SET( ARCH_NATIVE_FLAG "-qarch=pwr8" CACHE STRING "Additional flag for the compiler capacities detection such as -mcpu=power8 for example" )
set(ALTIVEC_FLAGS "-qaltivec ${ARCH_NATIVE_FLAG}")
else()
SET( ARCH_NATIVE_FLAG "-mcpu=native" CACHE STRING "Additional flag for the compiler capacities detection such as -mcpu=power8 for example" )
set(ALTIVEC_FLAGS "-maltivec -mabi=altivec -mvsx ${ARCH_NATIVE_FLAG}")
endif()
set(ALL_TYPES "ALTIVEC")
else()
if(APPLE) # GCC APPLE
set(SSE3_FLAGS "-msse3 ${ARCH_NATIVE_FLAG}")
set(SSSE3_FLAGS "-mssse3 ${ARCH_NATIVE_FLAG}")
set(SSE41_FLAGS "-msse4 -msse4.1 ${ARCH_NATIVE_FLAG}")
set(SSE42_FLAGS "-msse4 -msse4.2 ${ARCH_NATIVE_FLAG}")
set(AVX_FLAGS "-mavx ${ARCH_NATIVE_FLAG}")
set(AVX2_FLAGS "-mavx2 ${ARCH_NATIVE_FLAG}")
set(AVX512COMMON_FLAGS "-mavx512f -mavx512er -mavx512cd ${ARCH_NATIVE_FLAG}")
set(AVX512KNL_FLAGS "-mavx512f -mavx512pf -mavx512er -mavx512cd ${ARCH_NATIVE_FLAG}")
set(AVX512SKL_FLAGS "-mavx512f -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq ${ARCH_NATIVE_FLAG}")
else() # GCC LINUX
set(SSE3_FLAGS "-msse3 ${ARCH_NATIVE_FLAG}")
set(SSSE3_FLAGS "-mssse3 ${ARCH_NATIVE_FLAG}")
set(SSE41_FLAGS "-msse4 -msse4.1 ${ARCH_NATIVE_FLAG}")
set(SSE42_FLAGS "-msse4 -msse4.2 ${ARCH_NATIVE_FLAG}")
set(AVX_FLAGS "-mavx ${ARCH_NATIVE_FLAG}")
set(AVX2_FLAGS "-mavx2 ${ARCH_NATIVE_FLAG}")
set(AVX512COMMON_FLAGS "-mavx512f -mavx512er -mavx512cd ${ARCH_NATIVE_FLAG}")
set(AVX512KNL_FLAGS "-mavx512f -mavx512pf -mavx512er -mavx512cd ${ARCH_NATIVE_FLAG}")
set(AVX512SKL_FLAGS "-mavx512f -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq ${ARCH_NATIVE_FLAG}")
endif(APPLE)
endif()
# X86
SET( ARCH_NATIVE_FLAG "-march=native" CACHE STRING "Additional flag for the compiler capacities detection" )
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
if(APPLE) # INTEL APPLE
set(SSE3_FLAGS "-msse3 ${ARCH_NATIVE_FLAG}")
set(SSSE3_FLAGS "-mssse3 ${ARCH_NATIVE_FLAG}")
set(SSE41_FLAGS "-msse4 -msse4.1 ${ARCH_NATIVE_FLAG}")
set(SSE42_FLAGS "-msse4 -msse4.2 ${ARCH_NATIVE_FLAG}")
set(AVX_FLAGS "-mAVX ${ARCH_NATIVE_FLAG}")
set(AVX2_FLAGS "-march=core-avx2 ${ARCH_NATIVE_FLAG}")
set(AVX512COMMON_FLAGS "-xCOMMON-AVX512 ${ARCH_NATIVE_FLAG}")
set(AVX512KNL_FLAGS "-xCOMMON-AVX512 -xMIC-AVX512 ${ARCH_NATIVE_FLAG}")
set(AVX512SKL_FLAGS "-xCOMMON-AVX512 -xCORE-AVX512 ${ARCH_NATIVE_FLAG}")
else() # INTEL LINUX
set(SSE3_FLAGS "-msse3 ${ARCH_NATIVE_FLAG}")
set(SSSE3_FLAGS "-mssse3 ${ARCH_NATIVE_FLAG}")
set(SSE41_FLAGS "-msse4 -msse4.1 ${ARCH_NATIVE_FLAG}")
set(SSE42_FLAGS "-msse4 -msse4.2 ${ARCH_NATIVE_FLAG}")
set(AVX_FLAGS "-march=core-avx-i ${ARCH_NATIVE_FLAG}")
set(AVX2_FLAGS "-march=core-avx2 ${ARCH_NATIVE_FLAG}")
set(AVX512COMMON_FLAGS "-xCOMMON-AVX512 ${ARCH_NATIVE_FLAG}")
set(AVX512KNL_FLAGS "-xCOMMON-AVX512 -xMIC-AVX512 ${ARCH_NATIVE_FLAG}")
set(AVX512SKL_FLAGS "-xCOMMON-AVX512 -xCORE-AVX512 ${ARCH_NATIVE_FLAG}")
endif()
else()
if(APPLE) # GCC APPLE
set(SSE3_FLAGS "-msse3 ${ARCH_NATIVE_FLAG}")
set(SSSE3_FLAGS "-mssse3 ${ARCH_NATIVE_FLAG}")
set(SSE41_FLAGS "-msse4 -msse4.1 ${ARCH_NATIVE_FLAG}")
set(SSE42_FLAGS "-msse4 -msse4.2 ${ARCH_NATIVE_FLAG}")
set(AVX_FLAGS "-mavx ${ARCH_NATIVE_FLAG}")
set(AVX2_FLAGS "-mavx2 ${ARCH_NATIVE_FLAG}")
set(AVX512COMMON_FLAGS "-mavx512f -mavx512er -mavx512cd ${ARCH_NATIVE_FLAG}")
set(AVX512KNL_FLAGS "-mavx512f -mavx512pf -mavx512er -mavx512cd ${ARCH_NATIVE_FLAG}")
set(AVX512SKL_FLAGS "-mavx512f -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq ${ARCH_NATIVE_FLAG}")
else() # GCC LINUX
set(SSE3_FLAGS "-msse3 ${ARCH_NATIVE_FLAG}")
set(SSSE3_FLAGS "-mssse3 ${ARCH_NATIVE_FLAG}")
set(SSE41_FLAGS "-msse4 -msse4.1 ${ARCH_NATIVE_FLAG}")
set(SSE42_FLAGS "-msse4 -msse4.2 ${ARCH_NATIVE_FLAG}")
set(AVX_FLAGS "-mavx ${ARCH_NATIVE_FLAG}")
set(AVX2_FLAGS "-mavx2 ${ARCH_NATIVE_FLAG}")
set(AVX512COMMON_FLAGS "-mavx512f -mavx512er -mavx512cd ${ARCH_NATIVE_FLAG}")
set(AVX512KNL_FLAGS "-mavx512f -mavx512pf -mavx512er -mavx512cd ${ARCH_NATIVE_FLAG}")
set(AVX512SKL_FLAGS "-mavx512f -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq ${ARCH_NATIVE_FLAG}")
endif(APPLE)
endif()
# (ADD-NEW-HERE)
set(ALL_TYPES "SSE3;SSSE3;SSE41;SSE42;AVX;AVX2;AVX512COMMON;AVX512KNL;AVX512SKL")
# (ADD-NEW-HERE)
set(ALL_TYPES "SSE3;SSSE3;SSE41;SSE42;AVX;AVX2;AVX512COMMON;AVX512KNL;AVX512SKL")
endif()
if($ENV{VERBOSE})
foreach(TYPE ${ALL_TYPES})
......
......@@ -272,6 +272,80 @@ inline void InaVecAVX512KNL_exp(const double inVal[], double outVal[]) {
}
#endif
#ifdef INASTEMP_USE_ALTIVEC
#include "ALTIVEC/InaVecALTIVECDouble.hpp"
#include "ALTIVEC/InaVecALTIVECFloat.hpp"
// Vectorized fast exponential, single precision (4 lanes).
// Approximates exp(v) as 2^(v*log2(e)): the fractional part of the
// exponent is corrected by a degree-5 polynomial (GetCoefficient6_*),
// and the integer part is injected into the float's exponent bits via
// the CoeffA32/CoeffB32 scaling + int truncation + bit reinterpret
// (classic fast-exp trick; see InaFastExp for the coefficients).
// Reads 4 floats from inVal, writes 4 floats to outVal.
inline void InaVecALTIVEC_exp(const float inVal[], float outVal[]) {
// vec_xl: VSX load that tolerates unaligned addresses
__vector float vec = vec_xl(0, inVal);
const __vector float COEFF_LOG2E = vec_splats(float(InaFastExp::CoeffLog2E()));
const __vector float COEFF_A = vec_splats(float(InaFastExp::CoeffA32()));
const __vector float COEFF_B = vec_splats(float(InaFastExp::CoeffB32()));
const __vector float COEFF_P5_A = vec_splats(float(InaFastExp::GetCoefficient6_5()));
const __vector float COEFF_P5_B = vec_splats(float(InaFastExp::GetCoefficient6_4()));
const __vector float COEFF_P5_C = vec_splats(float(InaFastExp::GetCoefficient6_3()));
const __vector float COEFF_P5_D = vec_splats(float(InaFastExp::GetCoefficient6_2()));
const __vector float COEFF_P5_E = vec_splats(float(InaFastExp::GetCoefficient6_1()));
const __vector float COEFF_P5_F = vec_splats(float(InaFastExp::GetCoefficient6_0()));
__vector float x = vec * COEFF_LOG2E;
// Horner evaluation of the correction polynomial on frac(x)
const __vector float fractional_part = x - vec_floor(x);
__vector float factor = (((((COEFF_P5_A * fractional_part + COEFF_P5_B)
* fractional_part + COEFF_P5_C)
* fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
// vec_cts truncates to signed int; the resulting integer lanes are the
// bit pattern of the desired float result, hence the reinterpret below
__vector int castedInteger = vec_cts(COEFF_A * x + COEFF_B, 0);
vec = reinterpret_cast<__vector float>(castedInteger);
vec_xst(vec, 0, outVal);
}
// Vectorized fast exponential, double precision (2 lanes).
// Same fast-exp scheme as the float overload, with a degree-3 polynomial
// (GetCoefficient4_*) and 64-bit exponent-bit injection (CoeffA64/B64).
// Reads 2 doubles from inVal, writes 2 doubles to outVal.
inline void InaVecALTIVEC_exp(const double inVal[], double outVal[]) {
__vector double vec = vec_xl(0, &inVal[0]);
const __vector double COEFF_LOG2E = vec_splats(double(InaFastExp::CoeffLog2E()));
const __vector double COEFF_A = vec_splats(double(InaFastExp::CoeffA64()));
const __vector double COEFF_B = vec_splats(double(InaFastExp::CoeffB64()));
const __vector double COEFF_P5_C = vec_splats(double(InaFastExp::GetCoefficient4_3()));
const __vector double COEFF_P5_D = vec_splats(double(InaFastExp::GetCoefficient4_2()));
const __vector double COEFF_P5_E = vec_splats(double(InaFastExp::GetCoefficient4_1()));
const __vector double COEFF_P5_F = vec_splats(double(InaFastExp::GetCoefficient4_0()));
__vector double x = vec * COEFF_LOG2E;
const __vector double fractional_part = x - vec_floor(x);
// Horner evaluation of the correction polynomial on frac(x)
__vector double factor = (((COEFF_P5_C * fractional_part + COEFF_P5_D)
* fractional_part + COEFF_P5_E)
* fractional_part + COEFF_P5_F);
x -= factor;
x = COEFF_A * x + COEFF_B;
// TODO find conversion function
//__vector long castedInteger = vec_cts(x, 0);
//return reinterpret_cast<__vector double>(castedInteger);
// Workaround for the missing double->int64 vector conversion: spill the
// vector to 16-byte-aligned memory (vec_st requires alignment), truncate
// each lane to long on the scalar unit, then reload the two 64-bit
// integers and reinterpret their bits as the double result.
alignas(16) double tmpptr[2];
vec_st( reinterpret_cast<__vector unsigned int>(x), 0, reinterpret_cast<unsigned int*>(tmpptr));
alignas(16) long ltmpptr[2];
ltmpptr[0] = long(tmpptr[0]);
ltmpptr[1] = long(tmpptr[1]);
vec = reinterpret_cast<__vector double>(vec_xl(0, ltmpptr));
// Store via an unsigned-int reinterpret (no direct vec_xst overload used
// for __vector double here — presumably a compiler-support workaround;
// TODO confirm against the XL/GCC intrinsics actually available)
vec_xst(reinterpret_cast<__vector unsigned int>(vec), 0, reinterpret_cast<unsigned int*>(outVal));
}
#endif
template <class VecType>
void GenericExpInavec(const size_t NbOverLoop, const size_t NbExp){
using RealType = typename VecType::RealType;
......@@ -406,6 +480,34 @@ void compareExpTime(const size_t NbOverLoop, const size_t NbExp){
}
std::cout << "\n";
#endif
#ifdef INASTEMP_USE_ALTIVEC
GenericExpInavec<InaVecALTIVEC<RealType>>(NbOverLoop, NbExp);
{
// Raw SIMD
const int VecLength = InaVecALTIVEC<RealType>::VecLength;
// Note : we increase the length of the vector to avoid checking the loop size
std::unique_ptr< RealType[] > resSimd(new RealType[NbExp + VecLength]);
InaTimer timer;
for (size_t idxLoop = 0; idxLoop < NbOverLoop; ++idxLoop) {
for (size_t idx = 0; idx < NbExp; idx += VecLength) {
alignas(64) RealType bufferX[VecLength];
// Copy value into a buffer since we do it on the fly
for (size_t idxX = 0; idxX < VecLength; ++idxX) {
bufferX[idxX] = static_cast<RealType>((idx + idxX) % 200);
}
InaVecALTIVEC_exp(bufferX, &resSimd[idx]);
}
}
timer.stop();
std::cout << "Vector " << "ALTIVEC" << " for " << NbExp * NbOverLoop
<< " exp took " << timer.getElapsed() << "s (" << timer.getElapsed()/double(NbExp * NbOverLoop) << "s per exp)\n";
}
std::cout << "\n";
#endif
}
int main(int /*argc*/, char* /*argv*/ []) {
......
......@@ -573,7 +573,136 @@ void InaVecAVX512KNL_ScalarGemmInaV2(const double* __restrict__ A, const double*
}
#endif
#ifdef INASTEMP_USE_ALTIVEC
#include "ALTIVEC/InaVecALTIVECDouble.hpp"
#include "ALTIVEC/InaVecALTIVECFloat.hpp"
// Blocked single-precision GEMM micro-kernel using raw AltiVec intrinsics:
// C += A * B for square matrices of dimension `size` (column-panel layout).
// A-panels are copied TRANSPOSED (CopyMatT) so a column of A loads
// contiguously; B is staged one BlockSize-wide slab at a time (CopyMat).
// All panel sizes must be multiples of the vector length (static_asserts),
// and `size` must be a multiple of every panel size (runtime asserts).
// NOTE(review): vec_xl(0, &panelB[idxCol*PanelSizeK + idxK]) loads
// BlockSize consecutive floats of B rather than splatting one value —
// this relies on CopyMat's layout; confirm against the SSE/AVX variants.
template <size_t PanelSizeK, size_t PanelSizeiA,
size_t PanelSizejB, size_t VecTypeLength>
void InaVecALTIVEC_ScalarGemmInaV2(const float* __restrict__ A, const float* __restrict__ B,
float* __restrict__ C, const size_t size){
const int BlockSize = VecTypeLength;
static_assert(PanelSizeK >= BlockSize, "PanelSizeK must be greater than block");
static_assert(PanelSizeiA >= BlockSize, "PanelSizeiA must be greater than block");
static_assert(PanelSizejB >= BlockSize, "PanelSizejB must be greater than block");
static_assert((PanelSizeK/BlockSize)*BlockSize == PanelSizeK, "PanelSizeK must be a multiple of block");
static_assert((PanelSizeiA/BlockSize)*BlockSize == PanelSizeiA, "PanelSizeiA must be a multiple of block");
static_assert((PanelSizejB/BlockSize)*BlockSize == PanelSizejB, "PanelSizejB must be a multiple of block");
// Restrict to a multiple of panelsize for simplicity
assert((size/PanelSizeK)*PanelSizeK == size);
assert((size/PanelSizeiA)*PanelSizeiA == size);
assert((size/PanelSizejB)*PanelSizejB == size);
for(size_t ip = 0 ; ip < size ; ip += PanelSizeiA){
for(size_t jp = 0 ; jp < size ; jp += PanelSizejB){
for(size_t kp = 0 ; kp < size ; kp += PanelSizeK){
// Panels live on the stack, 64-byte aligned for vector loads
alignas(64) float panelA[PanelSizeiA*PanelSizeK];
alignas(64) float panelB[PanelSizeK*BlockSize];
for(size_t jb = 0 ; jb < PanelSizejB ; jb += BlockSize){
CopyMat<float, BlockSize>(panelB, PanelSizeK, &B[jp*size + kp], size);
for(size_t ib = 0 ; ib < PanelSizeiA ; ib += BlockSize){
// A-panel is filled only on the first jb pass and reused after
if(jb == 0){
CopyMatT<float, BlockSize>(&panelA[ib], PanelSizeiA, PanelSizeK,
&A[(ib+ip)*size + kp], size);
}
// One accumulator vector per output column of the block
__vector float sum[BlockSize];
for(size_t idxCol = 0 ; idxCol < BlockSize ; ++idxCol){
sum[idxCol] = vec_splats(0.f);
}
for(size_t idxK = 0 ; idxK < PanelSizeK ; ++idxK){
const __vector float valA = vec_xl(0, &panelA[idxK*PanelSizeiA + ib]);
for(size_t idxCol = 0 ; idxCol < BlockSize ; ++idxCol){
sum[idxCol] += valA * vec_xl(0, &panelB[idxCol*PanelSizeK + idxK]);
}
}
// Accumulate the block into C (read-modify-write per column)
float* __restrict__ ptrC = &C[(jp+jb)*size + ip + ib];
for(size_t idxCol = 0 ; idxCol < BlockSize ; ++idxCol){
__vector float res = sum[idxCol] + vec_xl(0, &ptrC[idxCol*size]);
vec_xst(res, 0, &ptrC[idxCol*size]);
}
}
}
}
}
}
}
// Blocked double-precision GEMM micro-kernel using raw AltiVec/VSX
// intrinsics: C += A * B for square matrices of dimension `size`.
// Same panel scheme as the float overload (A transposed via CopyMatT,
// B staged in BlockSize-wide slabs via CopyMat).
// NOTE(review): as in the float version, the panelB vec_xl loads
// BlockSize consecutive doubles — confirm this matches CopyMat's layout.
template <size_t PanelSizeK, size_t PanelSizeiA,
size_t PanelSizejB, size_t VecTypeLength>
void InaVecALTIVEC_ScalarGemmInaV2(const double* __restrict__ A, const double* __restrict__ B,
double* __restrict__ C, const size_t size){
const int BlockSize = VecTypeLength;
static_assert(PanelSizeK >= BlockSize, "PanelSizeK must be greater than block");
static_assert(PanelSizeiA >= BlockSize, "PanelSizeiA must be greater than block");
static_assert(PanelSizejB >= BlockSize, "PanelSizejB must be greater than block");
static_assert((PanelSizeK/BlockSize)*BlockSize == PanelSizeK, "PanelSizeK must be a multiple of block");
static_assert((PanelSizeiA/BlockSize)*BlockSize == PanelSizeiA, "PanelSizeiA must be a multiple of block");
static_assert((PanelSizejB/BlockSize)*BlockSize == PanelSizejB, "PanelSizejB must be a multiple of block");
// Restrict to a multiple of panelsize for simplicity
assert((size/PanelSizeK)*PanelSizeK == size);
assert((size/PanelSizeiA)*PanelSizeiA == size);
assert((size/PanelSizejB)*PanelSizejB == size);
for(size_t ip = 0 ; ip < size ; ip += PanelSizeiA){
for(size_t jp = 0 ; jp < size ; jp += PanelSizejB){
for(size_t kp = 0 ; kp < size ; kp += PanelSizeK){
// Panels live on the stack, 64-byte aligned for vector loads
alignas(64) double panelA[PanelSizeiA*PanelSizeK];
alignas(64) double panelB[PanelSizeK*BlockSize];
for(size_t jb = 0 ; jb < PanelSizejB ; jb += BlockSize){
CopyMat<double, BlockSize>(panelB, PanelSizeK, &B[jp*size + kp], size);
for(size_t ib = 0 ; ib < PanelSizeiA ; ib += BlockSize){
// A-panel is filled only on the first jb pass and reused after
if(jb == 0){
CopyMatT<double, BlockSize>(&panelA[ib], PanelSizeiA, PanelSizeK,
&A[(ib+ip)*size + kp], size);
}
// One accumulator vector per output column of the block
__vector double sum[BlockSize];
for(size_t idxCol = 0 ; idxCol < BlockSize ; ++idxCol){
sum[idxCol] = vec_splats(0.);
}
for(size_t idxK = 0 ; idxK < PanelSizeK ; ++idxK){
const __vector double valA = vec_xl(0, &panelA[idxK*PanelSizeiA + ib]);
for(size_t idxCol = 0 ; idxCol < BlockSize ; ++idxCol){
sum[idxCol] += valA * vec_xl(0, &panelB[idxCol*PanelSizeK + idxK]);
}
}
double* __restrict__ ptrC = &C[(jp+jb)*size + ip + ib];
for(size_t idxCol = 0 ; idxCol < BlockSize ; ++idxCol){
__vector double res = sum[idxCol] + vec_xl(0, &ptrC[idxCol*size]);
// Store through an unsigned-int reinterpret — presumably a
// workaround for a missing __vector double vec_xst overload;
// TODO confirm against the toolchain's intrinsic set
vec_xst( reinterpret_cast<__vector unsigned int>(res), 0, reinterpret_cast<unsigned int*>(&ptrC[idxCol*size]));
}
}
}
}
}
}
}
#endif
///////////////////////////////////////////////////////////////////////////////////
......@@ -879,6 +1008,27 @@ void compareGemmTime(const size_t NbOverLoop, const size_t matDim){
<< " took " << timer.getElapsed() << "s (" << (double(NbOverLoop*nbFlops)/timer.getElapsed())/1E9 << "GFlop/s)\n";
}
#endif
/////////////////////////////////////////////////////////////
#ifdef INASTEMP_USE_ALTIVEC
ComputeGemmIna<InaVecALTIVEC<RealType>, RealType, PanelSizeA, PanelSizeB, PanelSizeK>(NbOverLoop, matDim, nbFlops, A.get(), B.get());
{
std::unique_ptr< RealType[] > CIna(new RealType[matDim*matDim]);
memset(CIna.get(), 0, sizeof(RealType)*matDim*matDim);
InaTimer timer;
for(size_t idxLoop = 0 ; idxLoop < NbOverLoop ; ++idxLoop){
InaVecALTIVEC_ScalarGemmInaV2<PanelSizeK, PanelSizeA, PanelSizeB,
InaVecALTIVEC<RealType>::VecLength>(A.get(), B.get(), CIna.get(), matDim);
}
timer.stop();
std::cout << "Vector V2 " << "ALTIVEC" << " for size " << matDim
<< " took " << timer.getElapsed() << "s (" << (double(NbOverLoop*nbFlops)/timer.getElapsed())/1E9 << "GFlop/s)\n";
}
#endif
}
int main(int /*argc*/, char* /*argv*/ []) {
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -26,6 +26,7 @@
#cmakedefine INASTEMP_USE_AVX512SKL
#cmakedefine INASTEMP_USE_ALTIVEC
// Inform about best one
#define INASTEMP_@INASTEMP_BESTTYPE@_IS_BEST
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment