Commit b126bba0 authored by Berenger Bramas's avatar Berenger Bramas

Merge branch 'use-inastemp-submodule' into 'develop'

Use inastemp

See merge request solverstack/ScalFMM!15
parents 2c5546a4 d9cc3c50
[submodule "CMakeModules/morse_cmake"] [submodule "CMakeModules/morse_cmake"]
path = CMakeModules/morse_cmake path = CMakeModules/morse_cmake
url = https://gitlab.inria.fr/solverstack/morse_cmake.git url = https://gitlab.inria.fr/solverstack/morse_cmake.git
[submodule "inastemp"]
path = inastemp
url = https://gitlab.inria.fr/coulaud/inastemp
This diff is collapsed.
###########################################################################################
# Berenger Bramas Inria
# Companion of CMakeModules/getCpuInfos.cpp.
#
# GetCpuInfos() compiles and runs that C++ file at configure time and creates one CMake
# variable per "NAME=VALUE" entry printed by the binary (entries are separated by ';').
# For example, if the binary prints:
#   SSE3=TRUE;AVX=FALSE
# the macro defines:
#   CPUOPTION_SSE3 = TRUE
#   CPUOPTION_AVX  = FALSE
# and CPU_OPTIONS holds the complete output (leading/trailing whitespace removed).
#
# The binary should return 0 on success; if it does not compile or does not return 0,
# a warning is printed and no CPUOPTION_* variable is created.
#
# This is a macro (not a function) so that CPU_OPTIONS and every CPUOPTION_<name> are
# created directly in the caller's scope. As a side effect the helper variables
# GetCpuInfosFile, RUN_RESULT_VAR, COMPILE_RESULT_VAR, comp and run are visible to the
# caller as well.
###########################################################################################
macro(GetCpuInfos)
  # The original CPP file
  set(GetCpuInfosFile "${PROJECT_SOURCE_DIR}/CMakeModules/getCpuInfos.cpp")

  # Fatal error if the file does not exist.
  # The path is quoted so that a source tree containing spaces or ';' is handled.
  if(NOT EXISTS "${GetCpuInfosFile}")
    message(FATAL_ERROR "The GetCpuInfosFile does not exist (${GetCpuInfosFile})")
  endif()

  # Compile and execute the file
  try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR
    "${CMAKE_BINARY_DIR}" "${GetCpuInfosFile}"
    COMPILE_OUTPUT_VARIABLE comp
    RUN_OUTPUT_VARIABLE run)

  # If it has successfully compiled and run
  if(COMPILE_RESULT_VAR AND (RUN_RESULT_VAR EQUAL 0))
    # Drop surrounding whitespace: a trailing newline printed by the binary would
    # otherwise become part of the last value ("FALSE\n" is not a false constant).
    string(STRIP "${run}" run)
    set(CPU_OPTIONS ${run})

    # For each "NAME=VALUE" entry (unquoted on purpose: split the output on ';')
    foreach(optionNode ${run})
      # Tolerate whitespace or line breaks around an individual entry
      string(STRIP "${optionNode}" optionNodeTrimmed)
      # Get name and value
      string(REPLACE "=" ";" optionNameAndValue "${optionNodeTrimmed}")
      list(LENGTH optionNameAndValue optionLength)
      # If we get both
      if(optionLength EQUAL 2)
        list(GET optionNameAndValue 0 optionName)
        list(GET optionNameAndValue 1 optionValue)
        # Create the cmake variable
        set(CPUOPTION_${optionName} ${optionValue})
      else()
        message(WARNING "GetCpuInfosFile wrong format for ${optionNode}.")
      endif()
    endforeach()

    # Output the sentence from the binary
    message(STATUS "CPUOPTION : ${CPU_OPTIONS}")
  else()
    message(WARNING "GetCpuInfosFile did not return correctly.")
  endif()
endmacro()
// Minimal translation unit exercising the 512-bit __m512 vector type.
// The program produces no output and always returns 0, so it is presumably used as a
// compile probe by the build system (TODO confirm against the CMake caller).
#include "immintrin.h"
int main() {
// The vector code is only compiled when the compiler predefines __MIC__;
// otherwise main() reduces to `return 0`.
#ifdef __MIC__
// Both operands are left uninitialized: only the fact that the declaration and the
// `+=` operator on __m512 compile is exercised, the computed value is never read.
__m512 tx, ty ;
tx += ty ;
#endif
return 0;
}
// Minimal translation unit exercising the 256-bit __m256d vector type.
// The program produces no output and always returns 0, so it is presumably used as a
// compile probe by the build system (TODO confirm against the CMake caller).
#include "immintrin.h"
int main() {
// Both operands are left uninitialized: only the fact that the declaration and the
// `+=` operator on __m256d compile is exercised, the computed value is never read.
__m256d tx, ty ;
tx += ty ;
return 0;
}
// Minimal translation unit exercising the 128-bit __m128d vector type together with
// the SSE family headers. The program produces no output and always returns 0, so it
// is presumably used as a compile probe by the build system (TODO confirm against the
// CMake caller).
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#ifdef __SSSE3__
#include <tmmintrin.h> // SSSE3
#endif
// The compiler-defined macro for SSE4.1 is __SSE4_1__; the former spelling
// __SSSE4_1__ is never defined, so <smmintrin.h> could never be included.
#ifdef __SSE4_1__
#include <smmintrin.h> // SSE4
#endif

int main() {
    // Both operands are left uninitialized: only the fact that the declaration and
    // the `+=` operator on __m128d compile is exercised, the value is never read.
    __m128d tx, ty ;
    tx += ty ;
    return 0;
}
// Minimal translation unit calling the horizontal-add intrinsics on 256-bit and
// 128-bit vectors, in double and single precision. The program produces no output and
// always returns 0, so it is presumably used as a compile probe by the build system
// (TODO confirm against the CMake caller).
#include <x86intrin.h>
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4
#include <immintrin.h> // AVX
int main(){
// 256-bit variants. All operands are left uninitialized: the results are never read,
// only the availability of the types and intrinsics is exercised.
{
__m256d res0d, res1d;
res0d = _mm256_hadd_pd(res0d, res1d);
__m256 res0, res1;
res0 = _mm256_hadd_ps(res0, res1);
}
// 128-bit variants, in a separate scope so the variable names can be reused.
{
__m128d res0d, res1d;
res0d = _mm_hadd_pd(res0d, res1d);
__m128 res0, res1;
res0 = _mm_hadd_ps(res0, res1);
}
return 0;
}
// Minimal translation unit calling the horizontal-add intrinsics on 512-bit (only
// under __MIC__), 256-bit and 128-bit vectors, in double and single precision. The
// program produces no output and always returns 0, so it is presumably used as a
// compile probe by the build system (TODO confirm against the CMake caller).
#include <x86intrin.h>
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4
#include <immintrin.h> // AVX
int main(){
// 512-bit variants: the scope is empty unless the compiler predefines __MIC__.
{
#ifdef __MIC__
// NOTE(review): _mm512_hadd_pd / _mm512_hadd_ps are not declared anywhere in this
// file; verify that the targeted toolchain really provides them.
__m512d res0d, res1d;
res0d = _mm512_hadd_pd(res0d, res1d);
__m512 res0, res1;
res0 = _mm512_hadd_ps(res0, res1);
#endif
}
// 256-bit variants. All operands are left uninitialized: the results are never read,
// only the availability of the types and intrinsics is exercised.
{
__m256d res0d, res1d;
res0d = _mm256_hadd_pd(res0d, res1d);
__m256 res0, res1;
res0 = _mm256_hadd_ps(res0, res1);
}
// 128-bit variants, in a separate scope so the variable names can be reused.
{
__m128d res0d, res1d;
res0d = _mm_hadd_pd(res0d, res1d);
__m128 res0, res1;
res0 = _mm_hadd_ps(res0, res1);
}
return 0;
}
// Minimal translation unit calling the 128-bit horizontal-add intrinsics in double and
// single precision. The program produces no output and always returns 0, so it is
// presumably used as a compile probe by the build system (TODO confirm against the
// CMake caller).
#include <x86intrin.h>
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#ifdef __SSSE3__
#include <tmmintrin.h> // SSSE3
#endif
// The compiler-defined macro for SSE4.1 is __SSE4_1__; the former spelling
// __SSSE4_1__ is never defined, so <smmintrin.h> could never be included.
#ifdef __SSE4_1__
#include <smmintrin.h> // SSE4
#endif

int main(){
    // All operands are left uninitialized: the results are never read, only the
    // availability of the types and of the _mm_hadd_* intrinsics is exercised.
    __m128d res0d, res1d;
    res0d = _mm_hadd_pd(res0d, res1d);
    __m128 res0, res1;
    res0 = _mm_hadd_ps(res0, res1);
    return 0;
}
This diff is collapsed.
...@@ -30,7 +30,7 @@ The following are optional: ...@@ -30,7 +30,7 @@ The following are optional:
### Get and Build ScalFMM ### Get and Build ScalFMM
To use last development states of ScalFMM, please clone the develop To use last development states of ScalFMM, please clone the develop
branch. Note that ScalFMM contains a git submodule `morse_cmake`. branch. Note that ScalFMM contains two git submodules `morse_cmake` and `inastemp`.
To get sources please use these commands: To get sources please use these commands:
``` bash ``` bash
git clone --recursive git@gitlab.inria.fr:solverstack/ScalFMM.git -b develop git clone --recursive git@gitlab.inria.fr:solverstack/ScalFMM.git -b develop
......
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
#define FADAPTCHEBKERNEL_HPP #define FADAPTCHEBKERNEL_HPP
#include "Kernels/Chebyshev/FChebSymKernel.hpp" #include "Kernels/Chebyshev/FChebSymKernel.hpp"
#include "InastempCompileConfig.h"
#include "FComputeClassDescriptor.hpp"
#ifdef _OPENMP #ifdef _OPENMP
#include <omp.h> #include <omp.h>
...@@ -69,8 +68,8 @@ public: ...@@ -69,8 +68,8 @@ public:
const ContainerClass* const particles, const ContainerClass* const particles,
const SymbolicData * const /*source_symb*/) const SymbolicData * const /*source_symb*/)
{ {
using ComputeClass = typename ComputeClassDescriptor<FReal>::type; using ComputeClass = InaVecBestType<FReal>;
constexpr std::size_t FRealCount = ComputeClassDescriptor<FReal>::count; constexpr int FRealCount = ComputeClass::VecLength;
// Target cell: local // Target cell: local
const FReal localCellWidth(FBase::BoxWidth / FReal(1 << symb->getLevel())); const FReal localCellWidth(FBase::BoxWidth / FReal(1 << symb->getLevel()));
...@@ -82,57 +81,35 @@ public: ...@@ -82,57 +81,35 @@ public:
FChebTensor<FReal,ORDER>::setRoots(localCellCenter, localCellWidth, X); FChebTensor<FReal,ORDER>::setRoots(localCellCenter, localCellWidth, X);
// Particles attributes // Particles attributes
const ComputeClass * const posX = (const ComputeClass * const) particles->getPositions()[0]; const FReal * const posX = particles->getPositions()[0];
const ComputeClass * const posY = (const ComputeClass * const) particles->getPositions()[1]; const FReal * const posY = particles->getPositions()[1];
const ComputeClass * const posZ = (const ComputeClass * const) particles->getPositions()[2]; const FReal * const posZ = particles->getPositions()[2];
const ComputeClass * const physicalValues = (const ComputeClass * const) particles->getPhysicalValues(); const FReal * const physicalValues = particles->getPhysicalValues();
const FReal* pX = particles->getPositions()[0]; // const FReal* pX = particles->getPositions()[0];
const FReal* pY = particles->getPositions()[1]; // const FReal* pY = particles->getPositions()[1];
const FReal* pZ = particles->getPositions()[2]; // const FReal* pZ = particles->getPositions()[2];
const FReal* pV = particles->getPhysicalValues(); // const FReal* pV = particles->getPhysicalValues();
// apply P2L // apply P2L
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){ for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
for (unsigned int m = 0; m<FBase::nnodes; ++m) { for (unsigned int m = 0; m<FBase::nnodes; ++m) {
ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX()); ComputeClass XX = ComputeClass(X[m].getX());
ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY()); ComputeClass XY = ComputeClass(X[m].getY());
ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ()); ComputeClass XZ = ComputeClass(X[m].getZ());
std::size_t idxPart = 0;
// Compute using vectorization for all but the last array elements // Compute using vectorization for all but the last array elements
ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>(); ComputeClass tmpLocalExp = ComputeClass::GetZero();
for (; for (std::size_t idxPart = 0 ; idxPart < particles->getNbParticles() ; idxPart += FRealCount)
idxPart < ((particles->getNbParticles())
/ FRealCount);
++idxPart)
{ {
tmpLocalExp += tmpLocalExp +=
FBase::MatrixKernel->evaluate( FBase::MatrixKernel->evaluate(
XX, XY, XZ, XX, XY, XZ,
posX[idxPart], posY[idxPart], posZ[idxPart]) ComputeClass(&posX[idxPart]), ComputeClass(&posY[idxPart]), ComputeClass(&posZ[idxPart]))
* physicalValues[idxPart]; * physicalValues[idxPart];
} }
local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp); local->get(idxRhs)[m] += (tmpLocalExp.horizontalSum());
// Compute the last array elements one by one if they exist
if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) {
auto Xx = X[m].getX();
auto Xy = X[m].getY();
auto Xz = X[m].getZ();
for(idxPart = FRealCount * (particles->getNbParticles() / FRealCount);
idxPart < static_cast<std::size_t>(particles->getNbParticles());
++idxPart)
{
local->get(idxRhs)[m] +=
FBase::MatrixKernel->evaluate(
Xx, Xy, Xz,
pX[idxPart], pY[idxPart], pZ[idxPart])
* pV[idxPart];
}
}
} }
}// NVALS }// NVALS
} }
...@@ -143,8 +120,8 @@ public: ...@@ -143,8 +120,8 @@ public:
ContainerClass* const particles, ContainerClass* const particles,
const SymbolicData * const /*source_symb*/) const SymbolicData * const /*source_symb*/)
{ {
using ComputeClass = typename ComputeClassDescriptor<FReal>::type; using ComputeClass = InaVecBestType<FReal>;
constexpr std::size_t FRealCount = ComputeClassDescriptor<FReal>::count; constexpr int FRealCount = ComputeClass::VecLength;
// Source cell: pole // Source cell: pole
const FReal poleCellWidth(FBase::BoxWidth / FReal(1 << symb->getLevel())); const FReal poleCellWidth(FBase::BoxWidth / FReal(1 << symb->getLevel()));
...@@ -156,44 +133,45 @@ public: ...@@ -156,44 +133,45 @@ public:
FChebTensor<FReal,ORDER>::setRoots(poleCellCenter, poleCellWidth, Y); FChebTensor<FReal,ORDER>::setRoots(poleCellCenter, poleCellWidth, Y);
// read positions // read positions
const ComputeClass* const posX = (const ComputeClass* const)(particles->getPositions()[0]); const FReal* const posX = (particles->getPositions()[0]);
const ComputeClass* const posY = (const ComputeClass* const)(particles->getPositions()[1]); const FReal* const posY = (particles->getPositions()[1]);
const ComputeClass* const posZ = (const ComputeClass* const)(particles->getPositions()[2]); const FReal* const posZ = (particles->getPositions()[2]);
// get potential // get potential
ComputeClass* const physVal = (ComputeClass* const)(particles->getPhysicalValues()); FReal* const physVal = (particles->getPhysicalValues());
ComputeClass* const potentials = (ComputeClass* const)(particles->getPotentials()); FReal* const potentials = (particles->getPotentials());
ComputeClass* const fx = (ComputeClass* const)(particles->getForcesX()); FReal* const fx = (particles->getForcesX());
ComputeClass* const fy = (ComputeClass* const)(particles->getForcesY()); FReal* const fy = (particles->getForcesY());
ComputeClass* const fz = (ComputeClass* const)(particles->getForcesZ()); FReal* const fz = (particles->getForcesZ());
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){ for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
// apply M2P // apply M2P
for (unsigned int n=0; n<FBase::nnodes; ++n){ for (unsigned int n=0; n<FBase::nnodes; ++n){
ComputeClass MultipoleExpansion = ComputeClass MultipoleExpansion =
FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]); ComputeClass(pole->get(idxRhs)[n]);
ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX()); ComputeClass YX = ComputeClass(Y[n].getX());
ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY()); ComputeClass YY = ComputeClass(Y[n].getY());
ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ()); ComputeClass YZ = ComputeClass(Y[n].getZ());
for(std::size_t idxPart = 0; for(std::size_t idxPart = 0;
idxPart < ( (particles->getNbParticles() + FRealCount - 1) idxPart < particles->getNbParticles();
/ FRealCount); idxPart += FRealCount)
++idxPart)
{ {
ComputeClass Kxy[1]; ComputeClass Kxy[1];
ComputeClass dKxy[3]; ComputeClass dKxy[3];
FBase::MatrixKernel->evaluateBlockAndDerivative( FBase::MatrixKernel->evaluateBlockAndDerivative(
posX[idxPart], posY[idxPart], posZ[idxPart], ComputeClass(&posX[idxPart]),
ComputeClass(&posY[idxPart]),
ComputeClass(&posZ[idxPart]),
YX, YY, YZ, YX, YY, YZ,
Kxy,dKxy); Kxy,dKxy);
potentials[idxPart] += Kxy[0] * MultipoleExpansion; (ComputeClass(&potentials[idxPart]) + Kxy[0] * MultipoleExpansion).storeInArray(&potentials[idxPart]);
fx[idxPart] += dKxy[0] * physVal[idxPart] * MultipoleExpansion; (ComputeClass(&fx[idxPart]) + dKxy[0] * physVal[idxPart] * MultipoleExpansion).storeInArray(&fx[idxPart]);
fy[idxPart] += dKxy[1] * physVal[idxPart] * MultipoleExpansion; (ComputeClass(&fy[idxPart]) + dKxy[1] * physVal[idxPart] * MultipoleExpansion).storeInArray(&fy[idxPart]);
fz[idxPart] += dKxy[2] * physVal[idxPart] * MultipoleExpansion; (ComputeClass(&fz[idxPart]) + dKxy[2] * physVal[idxPart] * MultipoleExpansion).storeInArray(&fz[idxPart]);
} }
}// Particles }// Particles
}// NVALS }// NVALS
......
...@@ -4,11 +4,10 @@ ...@@ -4,11 +4,10 @@
#include <cassert> #include <cassert>
#include "Kernels/Uniform/FUnifKernel.hpp" #include "Kernels/Uniform/FUnifKernel.hpp"
#include "InastempCompileConfig.h"
#include "Utils/FMath.hpp" #include "Utils/FMath.hpp"
#include "FComputeClassDescriptor.hpp"
#include <fstream> #include <fstream>
...@@ -114,8 +113,8 @@ public: ...@@ -114,8 +113,8 @@ public:
const ContainerClass* const particles, const ContainerClass* const particles,
const SymbolicData * const /*source_symb*/) const SymbolicData * const /*source_symb*/)
{ {
using ComputeClass = typename ComputeClassDescriptor<FReal>::type; using ComputeClass = InaVecBestType<FReal>;
constexpr std::size_t FRealCount = ComputeClassDescriptor<FReal>::count; constexpr int FRealCount = ComputeClass::VecLength;
// Target cell: local // Target cell: local
const FReal localCellWidth(FBase::BoxWidth / FReal(1 << symb->getLevel())); const FReal localCellWidth(FBase::BoxWidth / FReal(1 << symb->getLevel()));
...@@ -127,55 +126,36 @@ public: ...@@ -127,55 +126,36 @@ public:
FUnifTensor<FReal,ORDER>::setRoots(localCellCenter, localCellWidth, X); FUnifTensor<FReal,ORDER>::setRoots(localCellCenter, localCellWidth, X);
// Particles attributes // Particles attributes
const ComputeClass * const posX = (const ComputeClass * const) particles->getPositions()[0]; const FReal * const posX = particles->getPositions()[0];
const ComputeClass * const posY = (const ComputeClass * const) particles->getPositions()[1]; const FReal * const posY = particles->getPositions()[1];
const ComputeClass * const posZ = (const ComputeClass * const) particles->getPositions()[2]; const FReal * const posZ = particles->getPositions()[2];
const ComputeClass * const physicalValues = (const ComputeClass * const) particles->getPhysicalValues(); const FReal * const physicalValues = particles->getPhysicalValues();
const FReal* pX = particles->getPositions()[0]; // const FReal* pX = particles->getPositions()[0];
const FReal* pY = particles->getPositions()[1]; // const FReal* pY = particles->getPositions()[1];
const FReal* pZ = particles->getPositions()[2]; // const FReal* pZ = particles->getPositions()[2];
const FReal* pV = particles->getPhysicalValues(); // const FReal* pV = particles->getPhysicalValues();
// apply P2L // apply P2L
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){ for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
for (unsigned int m = 0; m < FBase::nnodes; ++m) { for (unsigned int m = 0; m < FBase::nnodes; ++m) {
ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX()); ComputeClass XX = ComputeClass(X[m].getX());
ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY()); ComputeClass XY = ComputeClass(X[m].getY());
ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ()); ComputeClass XZ = ComputeClass(X[m].getZ());
ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>(); ComputeClass tmpLocalExp = ComputeClass::GetZero();
// Compute using vectorization for all but the last array elements // Compute using vectorization for all but the last array elements
std::size_t idxPart = 0;
for (; idxPart < (particles->getNbParticles() / FRealCount); for (std::size_t idxPart = 0 ; idxPart < particles->getNbParticles() ; idxPart += FRealCount)
++idxPart)
{ {
tmpLocalExp += tmpLocalExp +=
FBase::MatrixKernel->evaluate( FBase::MatrixKernel->evaluate(
XX, XY, XZ, XX, XY, XZ,
posX[idxPart], posY[idxPart], posZ[idxPart]) ComputeClass(&posX[idxPart]), ComputeClass(&posY[idxPart]), ComputeClass(&posZ[idxPart]))
* physicalValues[idxPart]; * physicalValues[idxPart];
} }
local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp); local->get(idxRhs)[m] += (tmpLocalExp.horizontalSum());
// Compute the last array elements one by one if they exist
if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) {
auto Xx = X[m].getX();
auto Xy = X[m].getY();
auto Xz = X[m].getZ();
for(idxPart = FRealCount * (particles->getNbParticles() / FRealCount);
idxPart < static_cast<std::size_t>(particles->getNbParticles());
++idxPart)
{
local->get(idxRhs)[m] +=
FBase::MatrixKernel->evaluate(
Xx, Xy, Xz,
pX[idxPart], pY[idxPart], pZ[idxPart])
* pV[idxPart];
}
}
} }
}// NVALS }// NVALS
} }
...@@ -191,8 +171,8 @@ public: ...@@ -191,8 +171,8 @@ public:
ContainerClass* const particles, ContainerClass* const particles,
const SymbolicData * const /*target_symb*/) const SymbolicData * const /*target_symb*/)
{ {
using ComputeClass = typename ComputeClassDescriptor<FReal>::type; using ComputeClass = InaVecBestType<FReal>;
constexpr std::size_t FRealCount = ComputeClassDescriptor<FReal>::count; constexpr int FRealCount = ComputeClass::VecLength;
// Source cell: pole // Source cell: pole
const FReal poleCellWidth(FBase::BoxWidth / FReal(1 << symb->getLevel())); const FReal poleCellWidth(FBase::BoxWidth / FReal(1 << symb->getLevel()));
...@@ -204,16 +184,16 @@ public: ...@@ -204,16 +184,16 @@ public:
FUnifTensor<FReal,ORDER>::setRoots(poleCellCenter, poleCellWidth, Y); FUnifTensor<FReal,ORDER>::setRoots(poleCellCenter, poleCellWidth, Y);
// read positions // read positions
const ComputeClass* const posX = (const ComputeClass* const)(particles->getPositions()[0]); const FReal* const posX = (particles->getPositions()[0]);
const ComputeClass* const posY = (const ComputeClass* const)(particles->getPositions()[1]); const FReal* const posY = (particles->getPositions()[1]);
const ComputeClass* const posZ = (const ComputeClass* const)(particles->getPositions()[2]); const FReal* const posZ = (particles->getPositions()[2]);
// get potential // get potential
ComputeClass* const physVal = (ComputeClass* const)(particles->getPhysicalValues()); FReal* const physVal = (particles->getPhysicalValues());
ComputeClass* const potentials = (ComputeClass* const)(particles->getPotentials()); FReal* const potentials = (particles->getPotentials());
ComputeClass* const fx = (ComputeClass* const)(particles->getForcesX()); FReal* const fx = (particles->getForcesX());
ComputeClass* const fy = (ComputeClass* const)(particles->getForcesY()); FReal* const fy = (particles->getForcesY());
ComputeClass* const fz = (ComputeClass* const)(particles->getForcesZ()); FReal* const fz = (particles->getForcesZ());
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){ for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
...@@ -221,29 +201,30 @@ public: ...@@ -221,29 +201,30 @@ public:
for (unsigned int n=0; n<FBase::nnodes; ++n){ for (unsigned int n=0; n<FBase::nnodes; ++n){
ComputeClass MultipoleExpansion = ComputeClass MultipoleExpansion =
FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]); ComputeClass(pole->get(idxRhs)[n]);
ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX()); ComputeClass YX = ComputeClass(Y[n].getX());
ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY()); ComputeClass YY = ComputeClass(Y[n].getY());
ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ()); ComputeClass YZ = ComputeClass(Y[n].getZ());
for(std::size_t idxPart = 0; for(std::size_t idxPart = 0;
idxPart < ( (particles->getNbParticles() + FRealCount - 1) idxPart < particles->getNbParticles();
/ FRealCount); idxPart += FRealCount)
++idxPart)
{ {
ComputeClass Kxy[1]; ComputeClass Kxy[1];
ComputeClass dKxy[3]; ComputeClass dKxy[3];
FBase::MatrixKernel->evaluateBlockAndDerivative( FBase::MatrixKernel->evaluateBlockAndDerivative(
posX[idxPart], posY[idxPart], posZ[idxPart], ComputeClass(&posX[idxPart]),
ComputeClass(&posY[idxPart]),
ComputeClass(&posZ[idxPart]),
YX, YY, YZ, YX, YY, YZ,
Kxy,dKxy); Kxy,dKxy);
potentials[idxPart] += Kxy[0] * MultipoleExpansion; (ComputeClass(&potentials[idxPart]) + Kxy[0] * MultipoleExpansion).storeInArray(&potentials[idxPart]);
fx[idxPart] += dKxy[0] * physVal[idxPart] * MultipoleExpansion; (ComputeClass(&fx[idxPart]) + dKxy[0] * physVal[idxPart] * MultipoleExpansion).storeInArray(&fx[idxPart]);
fy[idxPart] += dKxy[1] * physVal[idxPart] * MultipoleExpansion; (ComputeClass(&fy[idxPart]) + dKxy[1] * physVal[idxPart] * MultipoleExpansion).storeInArray(&fy[idxPart]);
fz[idxPart] += dKxy[2] * physVal[idxPart] * MultipoleExpansion; (ComputeClass(&fz[idxPart]) + dKxy[2] * physVal[idxPart] * MultipoleExpansion).storeInArray(&fz[idxPart]);
} }
......
#ifndef FCOMPUTECLASSDESCRIPTOR_HPP
#define FCOMPUTECLASSDESCRIPTOR_HPP
/**
 * Compile-time description of the type used for computations on FReal values.
 *
 * Each specialization exposes:
 *   - type  : the selected compute type (a vector type, or the scalar type itself);
 *   - count : the number associated with it (1 for the scalar fallback).
 *
 * The selection is driven by the SCALFMM_USE_SSE / SCALFMM_USE_AVX / SCALFMM_USE_AVX2
 * macros, tested in that order: when several are defined, the first match wins.
 * NOTE(review): SCALFMM_USE_AVX2 selects the 512-bit __m512 / __m512d types; confirm
 * that this naming is intended.
 * The vector types are not declared here; whoever defines one of the macros is
 * expected to have included the matching intrinsics header beforehand.
 *
 * The primary template is intentionally empty: only double and float are supported.
 */
template <typename FReal>
struct ComputeClassDescriptor
{
};

/** Double precision descriptor. */
template <>
struct ComputeClassDescriptor<double>
{
#if 0 // never taken: lets every real branch be an #elif, for easy macro reordering
#elif defined(SCALFMM_USE_SSE)
    typedef __m128d type;
    enum { count = 2 };
#elif defined(SCALFMM_USE_AVX)
    typedef __m256d type;
    enum { count = 4 };
#elif defined(SCALFMM_USE_AVX2)
    typedef __m512d type;
    enum { count = 8 };
#else
    // Scalar fallback when no vectorization macro is defined
    typedef double type;
    enum { count = 1 };
#endif
};

/** Single precision descriptor. */
template <>
struct ComputeClassDescriptor<float>
{
#if 0 // never taken: lets every real branch be an #elif, for easy macro reordering
#elif defined(SCALFMM_USE_SSE)
    typedef __m128 type;
    enum { count = 4 };
#elif defined(SCALFMM_USE_AVX)
    typedef __m256 type;
    enum { count = 8 };
#elif defined(SCALFMM_USE_AVX2)
    typedef __m512 type;
    enum { count = 16 };
#else
    // Scalar fallback when no vectorization macro is defined
    typedef float type;
    enum { count = 1 };
#endif
};