diff --git a/.gitmodules b/.gitmodules index e1e8c690c3b08b1f2a8c52f94e9aae76828eaffb..2b1f354a63acc05660f82038e3c15c1b0170cc94 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "CMakeModules/morse_cmake"] path = CMakeModules/morse_cmake url = https://gitlab.inria.fr/solverstack/morse_cmake.git +[submodule "inastemp"] + path = inastemp + url = https://gitlab.mpcdf.mpg.de/bbramas/inastemp.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f0a4397ee7f70be74868b0800e6fe406871318ca..df83536abe2f8487f9a229eb21365269e6c3e1a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,10 +22,6 @@ SET(SCALFMM_CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/) # # Adds the CMAKE_DEPENDENT_OPTION command INCLUDE(CMakeDependentOption) -# Add to check CPU info -include(GetCpuInfos) -GetCpuInfos() -# #=========================================================================== # Version Number #=========================================================================== @@ -89,18 +85,8 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_ option( SCALFMM_USE_EZTRACE "Set to ON to compile with eztrace framwork" OFF ) option( SCALFMM_USE_STARPU "Set to ON to build SCALFMM with StarPU" OFF ) option( SCALFMM_BUILD_UTILS "Set to ON to build utils Tests" OFF ) - # - # VECTORISATION - # - if( APPLE ) # to fix problem with GCC and avx - CMAKE_DEPENDENT_OPTION( SCALFMM_USE_SSE "Set to ON to compile with SSE support (and use intrinsec SSE P2P)" ON "CPUOPTION_SSE3;NOT CPUOPTION_AVX2" OFF ) - CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX "Set to ON to compile with AVX support (and use intrinsec AVX P2P)" OFF "CPUOPTION_AVX; NOT CPUOPTION_AVX2" OFF ) - else(APPLE) - CMAKE_DEPENDENT_OPTION( SCALFMM_USE_SSE "Set to ON to compile with SSE support (and use intrinsec SSE P2P)" ON "CPUOPTION_SSE3;NOT CPUOPTION_AVX;NOT CPUOPTION_AVX2" OFF ) - CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX "Set to ON to compile with AVX support (and use intrinsec AVX P2P)" ON "CPUOPTION_AVX; NOT CPUOPTION_AVX2" OFF ) - endif(APPLE) - CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX2 "Set to ON to compile with AVX support (and use intrinsec AVX2 P2P)" ON "CPUOPTION_AVX2" OFF ) - + + if( SCALFMM_ONLY_DEVEL ) if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") option( SCALFMM_USE_OMP4 "Set to ON to disable the gcc/intel omp4" OFF ) @@ -160,63 +146,21 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_ set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -m64") endif() ############################################################################## - # Compile options # + # Inastemp # ############################################################################## - # -xHost -mfpmath=sse - # -Wall Wnosign-conversion - # - # Set a fixed template depth - # Compilers don't use the same default for template-depth, we can enforce the same one everywhere. - # The magic number comes from GCC's default: https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html#C_002b_002b-Dialect-Options - set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -ftemplate-depth=900") - if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - # INTEL - IF (APPLE) - IF( CPUOPTION_SSE42 ) - set(SSE_FLAGS "-msse4 -mfpmath=sse") # -mtune=native -march=native - ELSEIF (CPUOPTION_SSE3) - set(SSE_FLAGS "-msse3 -mfpmath=sse") # -mtune=native -march=native - ENDIF (CPUOPTION_SSE42) - else(APPLE) - set(AVX_FLAGS "-fp-model source -march=native -axCORE-AVX2,CORE-AVX-I,AVX") #-mavx - set(AVX2_FLAGS "-march=native -axCORE-AVX2,CORE-AVX-I,AVX") #-march=core-avx2 - set(SSE_FLAGS "-axSSE4.2 -march=native") - endif(APPLE) - set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -fma -align -finline-functions") - #-Wshadow -Wpointer-arith -Wcast-qual -Wconversion -Wall -Wnosign-conversion ") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") - set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -mcpu=power8 -mtune=power8") - else() #if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # NOT INTEL - if(NOT SCALFMM_USE_MPI) - include(CheckCCompilerFlag) - check_c_compiler_flag(-Wzero-as-null-pointer-constant HAS_WZERO_NULL_PTR_FLAG) - if(HAS_WZERO_NULL_PTR_FLAG) - set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -Wzero-as-null-pointer-constant") - endif() - else() - include(CheckCCompilerFlag) - check_c_compiler_flag(-Wno-literal-suffix HAS_NO_LITERAL_SUFFIX_FLAG) - if(HAS_NO_LITERAL_SUFFIX_FLAG) - set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -Wno-literal-suffix") - endif() - endif() - IF (APPLE) - # set(SSE_FLAGS "-msse4 -mfpmath=sse") # -mtune=native -march=native - IF( CPUOPTION_SSE42 ) - set(SSE_FLAGS "-msse4 -mfpmath=sse") # -mtune=native -march=native - ELSEIF (CPUOPTION_SSE3) - set(SSE_FLAGS "-msse3 -mfpmath=sse") # -mtune=native -march=native - ENDIF (CPUOPTION_SSE42) - set(AVX_FLAGS "-mtune=native -march=avx") - set(AVX2_FLAGS "-mtune=native -march=native -mmic") - else(APPLE) - set(SSE_FLAGS "-mtune=native -march=native") - set(AVX_FLAGS "-mtune=native -march=native") - set(AVX2_FLAGS "-mtune=native -march=native -mmic") - endif(APPLE) - endif() + set(INASTEMP_JUST_LIB TRUE) + # add the cmakelist directory + add_subdirectory(inastemp) + # use the filled variables from inastemp + INCLUDE_DIRECTORIES( + ${INASTEMP_BINARY_DIR}/Src + ${INASTEMP_SOURCE_DIR}/Src + ${INASTEMP_INCLUDE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/inastemp/Src + ) + # propagate the flags to be able to compile + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${INASTEMP_CXX_FLAGS}") ############################################################################## # FUSE list # @@ -637,129 +581,13 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_ list(APPEND FUSE_LIST "STARPU") ################################################################## - # Use SSE # + # FUSE # ################################################################## - message( STATUS "SCALFMM_USE_SSE = ${SCALFMM_USE_SSE}" ) - if( SCALFMM_USE_SSE ) - if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp) - message(FATAL_ERROR "The CompileTestSseFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp)" ) - endif() - message( STATUS "SSE_FLAGS ${SSE_FLAGS} -- ${CMAKE_CXX_FLAGS} ") - try_compile(COMPILE_SSE ${CMAKE_CURRENT_BINARY_DIR} - ${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp - COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${SSE_FLAGS}" - OUTPUT_VARIABLE COMPILE_SSE_OUTPUT) - - if(${COMPILE_SSE}) - set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} ${SSE_FLAGS}") - - try_compile(COMPILE_RESULT_VAR ${CMAKE_CURRENT_BINARY_DIR} - ${SCALFMM_CMAKE_MODULE_PATH}/checkSSEpe.cpp - COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${SSE_FLAGS}") - if( NOT ${COMPILE_RESULT_VAR}) - set(__SSEPE_INTEL_COMPILER ON) - endif() - #set(SCALFMM_USE_AVX OFF) - else(${COMPILE_SSE}) - message(FATAL_ERROR "SSE NOT SUPPORTED ; Set SCALFMM_USE_SSE to OFF \n Output from test is : ${COMPILE_SSE_OUTPUT}") - endif(${COMPILE_SSE}) - endif() list(APPEND FUSE_LIST "SSE") - - ################################################################## - # Use AVX # - ################################################################## - - message(STATUS "SCALFMM_USE_AVX = ${SCALFMM_USE_AVX}") - if(SCALFMM_USE_AVX) - if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp) - - message(WARNING "SCALFMM_CMAKE_MODULE_PATH ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp" ) - message(FATAL_ERROR "The CompileTestAvxFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)" ) - endif() - - try_compile(COMPILE_AVX ${CMAKE_CURRENT_BINARY_DIR} - ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp - COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}" - OUTPUT_VARIABLE COMPILE_AVX_OUTPUT) - if(${COMPILE_AVX}) - message(STATUS "%%%%%%%%%%%% COMPILE_AVX = ${COMPILE_AVX} %%%%< ${AVX_FLAGS}") - - set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} ${AVX_FLAGS}") - message(STATUS "%%%%%%%%%%%% SCALFMM_CXX_FLAGS = ${SCALFMM_CXX_FLAGS}") - #set( SCALFMM_USE_SSE OFF FORCE) # ne marche pas - try_compile(COMPILE_RESULT_AVSPE ${CMAKE_CURRENT_BINARY_DIR} - ${SCALFMM_CMAKE_MODULE_PATH}/checkAVXpe.cpp - COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}") - if( NOT ${COMPILE_RESULT_AVSPE}) - - - set(__AVXPE_INTEL_COMPILER ON) - endif() - - message(STATUS ${CMAKE_CXX_FLAGS} ) - else(${COMPILE_AVX}) - message(FATAL_ERROR "AVX NOT SUPPORTED ; Set SCALFMM_USE_AVX to OFF \n Output from test is : ${COMPILE_AVX_OUTPUT} ") - endif(${COMPILE_AVX}) - endif(SCALFMM_USE_AVX) list(APPEND FUSE_LIST "AVX") - # - # Error if both SCALFMM_USE_AVX AND SCALFMM_USE_SSE are set - # - if( SCALFMM_USE_AVX AND SCALFMM_USE_SSE) - message(FATAL_ERROR "Check SCALFMM_USE_SSE or SCALFMM_USE_AVX BUT NOT BOTH. ") - endif(SCALFMM_USE_AVX AND SCALFMM_USE_SSE) - ################################################################## - # Use AVX2 # - ################################################################## + list(APPEND FUSE_LIST "MIC") - message(STATUS "SCALFMM_USE_AVX2 = ${SCALFMM_USE_AVX2}") - if(SCALFMM_USE_AVX2) - if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx2.cpp) - message(FATAL_ERROR "The CompileTestSseFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)" ) - endif() - - try_compile(COMPILE_AVX2 ${CMAKE_CURRENT_BINARY_DIR} - ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx2.cpp - COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX2_FLAGS}" - OUTPUT_VARIABLE COMPILE_AVX2_OUTPUT) - if(${COMPILE_AVX2}) - set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} ${AVX2_FLAGS}") - #set( SCALFMM_USE_SSE OFF FORCE) # ne marche pas - try_compile(COMPILE_RESULT_AVSPE ${CMAKE_CURRENT_BINARY_DIR} - ${SCALFMM_CMAKE_MODULE_PATH}/checkAVX2pe.cpp - COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX2_FLAGS}") - if( NOT ${COMPILE_RESULT_AVSPE}) - set(__AVX2PE_INTEL_COMPILER ON) - endif() - - message(STATUS ${CMAKE_CXX_FLAGS} ) - else(${COMPILE_AVX2}) - message(FATAL_ERROR "AVX2 NOT SUPPORTED ; Set SCALFMM_USE_AVX2 to OFF \n Output from test is : ${COMPILE_AVX_OUTPUT} ") - endif(${COMPILE_AVX2}) - endif(SCALFMM_USE_AVX2) - list(APPEND FUSE_LIST "AVX2") - # - # Error if both SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE are set - # - if( SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE) - message(FATAL_ERROR "Check SCALFMM_USE_SSE or SCALFMM_USE_AVX2 BUT NOT BOTH. ") - endif(SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE) - ################################################################## - # Use native MIC compilation # - ################################################################## -# If( SCALFMM_USE_MIC_NATIVE ) -# include(CheckCCompilerFlag) -# check_c_compiler_flag(-mmic HAS_MMIC_FLAG) -# if(NOT HAS_MMIC_FLAG) -# message(FATAL_ERROR "MIC NOT SUPPORTED ; Set SCALFMM_USE_MIC_NATIVE to OFF") -# endif() -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmic") -# else() -# #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xhost") -# endif() -# list(APPEND FUSE_LIST "MIC") ################################################################## # # Set EZTRACE diff --git a/CMakeModules/GetCpuInfos.cmake b/CMakeModules/GetCpuInfos.cmake deleted file mode 100644 index e6fd0611e7fb064341681745a0ab64d9a6495702..0000000000000000000000000000000000000000 --- a/CMakeModules/GetCpuInfos.cmake +++ /dev/null @@ -1,52 +0,0 @@ -########################################################################################### -# Berenger Bramas Inria -# This goes with the getCpuInfos.cpp -# This will create one CMAKE value per output option from the cpp file. -# For example the output of the CPP file can be: -# SSE3=TRUE;AVX=FALSE -# Then it will create: -# CPUOPTION_SSE3 = TRUE -# CPUOPTION_AVX = FALSE -# -# The binary should return 0 on success. -########################################################################################### -macro(GetCpuInfos) -# The original CPP file -set(GetCpuInfosFile "${PROJECT_SOURCE_DIR}/CMakeModules/getCpuInfos.cpp") - -# Fatal error if the file does not exist -if(NOT EXISTS ${GetCpuInfosFile}) - message(FATAL_ERROR "The GetCpuInfosFile does not exist (${GetCpuInfosFile})") -endif() - -# Compile and execute the file -try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR - ${CMAKE_BINARY_DIR} ${GetCpuInfosFile} # [CMAKE_FLAGS <Flags>] [COMPILE_DEFINITIONS <flags>] - COMPILE_OUTPUT_VARIABLE comp - RUN_OUTPUT_VARIABLE run) - -# If it has successfuly compiled an run -if(COMPILE_RESULT_VAR AND (RUN_RESULT_VAR EQUAL 0) ) - set( CPU_OPTIONS ${run} ) - # For each value - foreach(optionNode ${run}) - # Get name and value - string(REPLACE "=" ";" optionNameAndValue ${optionNode}) - list(LENGTH optionNameAndValue optionLength) - # If we get both - if(optionLength EQUAL 2) - list(GET optionNameAndValue 0 optionName) - list(GET optionNameAndValue 1 optionValue) - # create cmake variable - set(CPUOPTION_${optionName} ${optionValue}) - else() - message(WARNING "GetCpuInfosFile wrong format for ${optionNode}.") - endif() - endforeach() - # output the sentence from the binrary - message(STATUS "CPUOPTION : ${CPU_OPTIONS}") -else() - message(WARNING "GetCpuInfosFile did not return correctly.") -endif() - -endmacro(GetCpuInfos) diff --git a/CMakeModules/checkAVX2pe.cpp b/CMakeModules/checkAVX2pe.cpp deleted file mode 100644 index 8cf8b1c7d888b39e92d72f8aa3fea7d6ba77c366..0000000000000000000000000000000000000000 --- a/CMakeModules/checkAVX2pe.cpp +++ /dev/null @@ -1,12 +0,0 @@ - -#include "immintrin.h" - - -int main() { -#ifdef __MIC__ - __m512 tx, ty ; - tx += ty ; -#endif - return 0; -} - diff --git a/CMakeModules/checkAVXpe.cpp b/CMakeModules/checkAVXpe.cpp deleted file mode 100644 index 8f4838ac740f8b747296379fefb763d71ace2b53..0000000000000000000000000000000000000000 --- a/CMakeModules/checkAVXpe.cpp +++ /dev/null @@ -1,9 +0,0 @@ - -#include "immintrin.h" - - -int main() { - __m256d tx, ty ; - tx += ty ; - return 0; -} diff --git a/CMakeModules/checkSSEpe.cpp b/CMakeModules/checkSSEpe.cpp deleted file mode 100644 index fe26f30df6d45ce4baa67bf71fdcf89be797a43e..0000000000000000000000000000000000000000 --- a/CMakeModules/checkSSEpe.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include <xmmintrin.h> // SSE -#include <emmintrin.h> //SSE2 -#include <pmmintrin.h> //SSE3 -#ifdef __SSSE3__ -#include <tmmintrin.h> //SSSE3 -#endif -#ifdef __SSSE4_1__ -#include <smmintrin.h> // SSE4 -#endif - -int main() { - __m128d tx, ty ; - tx += ty ; - return 0; -} diff --git a/CMakeModules/compileTestAvx.cpp b/CMakeModules/compileTestAvx.cpp deleted file mode 100644 index fc9a21b637ed21e64b8e9aa586304bfb5dd29aa1..0000000000000000000000000000000000000000 --- a/CMakeModules/compileTestAvx.cpp +++ /dev/null @@ -1,27 +0,0 @@ - -#include <x86intrin.h> -#include <xmmintrin.h> // SSE -#include <emmintrin.h> // SSE2 -#include <pmmintrin.h> // SSE3 -#include <tmmintrin.h> // SSSE3 -#include <smmintrin.h> // SSE4 - -#include <immintrin.h> // AVX - -int main(){ - { - __m256d res0d, res1d; - res0d = _mm256_hadd_pd(res0d, res1d); - - __m256 res0, res1; - res0 = _mm256_hadd_ps(res0, res1); - } - { - __m128d res0d, res1d; - res0d = _mm_hadd_pd(res0d, res1d); - - __m128 res0, res1; - res0 = _mm_hadd_ps(res0, res1); - } - return 0; -} diff --git a/CMakeModules/compileTestAvx2.cpp b/CMakeModules/compileTestAvx2.cpp deleted file mode 100644 index cc5138c77dbba62a31952b1e7a8af5fc6bb07b3f..0000000000000000000000000000000000000000 --- a/CMakeModules/compileTestAvx2.cpp +++ /dev/null @@ -1,37 +0,0 @@ - -#include <x86intrin.h> -#include <xmmintrin.h> // SSE -#include <emmintrin.h> // SSE2 -#include <pmmintrin.h> // SSE3 -#include <tmmintrin.h> // SSSE3 -#include <smmintrin.h> // SSE4 - -#include <immintrin.h> // AVX - -int main(){ - { - #ifdef __MIC__ - __m512d res0d, res1d; - res0d = _mm512_hadd_pd(res0d, res1d); - - __m512 res0, res1; - res0 = _mm512_hadd_ps(res0, res1); - #endif - } - { - __m256d res0d, res1d; - res0d = _mm256_hadd_pd(res0d, res1d); - - __m256 res0, res1; - res0 = _mm256_hadd_ps(res0, res1); - } - { - __m128d res0d, res1d; - res0d = _mm_hadd_pd(res0d, res1d); - - __m128 res0, res1; - res0 = _mm_hadd_ps(res0, res1); - } - return 0; -} - diff --git a/CMakeModules/compileTestIntel.cpp b/CMakeModules/compileTestIntel.cpp deleted file mode 100644 index 3db518c378cc0e36a067bea93d7cf0b45a5ff222..0000000000000000000000000000000000000000 --- a/CMakeModules/compileTestIntel.cpp +++ /dev/null @@ -1,13 +0,0 @@ -int main(){ - - int i ; -#ifdef __INTEL_COMPILER - - i = 0; - -#else - -#error 'Not Intel Compiler " - -#endif -} diff --git a/CMakeModules/compileTestSse.cpp b/CMakeModules/compileTestSse.cpp deleted file mode 100644 index 0ff249bb5bd69662efb5f7956c2a9c980b54e8fc..0000000000000000000000000000000000000000 --- a/CMakeModules/compileTestSse.cpp +++ /dev/null @@ -1,20 +0,0 @@ - -#include <x86intrin.h> -#include <xmmintrin.h> // SSE -#include <emmintrin.h> // SSE2 -#include <pmmintrin.h> // SSE3 -#ifdef __SSSE3__ -#include <tmmintrin.h> //SSSE3 -#endif -#ifdef __SSSE4_1__ -#include <smmintrin.h> // SSE4 -#endif -int main(){ - __m128d res0d, res1d; - res0d = _mm_hadd_pd(res0d, res1d); - - __m128 res0, res1; - res0 = _mm_hadd_ps(res0, res1); - - return 0; -} diff --git a/CMakeModules/getCpuInfos.cpp b/CMakeModules/getCpuInfos.cpp deleted file mode 100644 index 4cbd3143cb373f01739af20ec4d7ea13c8389495..0000000000000000000000000000000000000000 --- a/CMakeModules/getCpuInfos.cpp +++ /dev/null @@ -1,338 +0,0 @@ -/////////////////////////////////////////////////////////////////////////// -// Berenger Bramas INRIA - 2014 -// Code provided under GNU Lesser General Public License -// -// -// This file ask the cpuid to get access to CPU properties. -// The file contains 3 mains parts: -// × First part is a wrapper in case we are on Windows or Linux -// × Second part is several call to the function and fill of an list -// × Third is out of scope, it prints the state and the properties -// in a strict format in order to post process -/////////////////////////////////////////////////////////////////////////// - - - -/////////////////////////////////////////////////////////////////////////// -// Part 1: -// Defines cpuid: -// × A Wrapper if we are on windows -// × A call to assembly else -/////////////////////////////////////////////////////////////////////////// - -enum RegistersNum { - EaxRegister = 0, - EbxRegister, - EcxRegister, - EdxRegister -}; - -#ifdef _WIN32 - -// On windows __cpuid exists: http://msdn.microsoft.com/en-us/library/hskdteyh(v=vs.90).aspx -// void __cpuid(int CPUInfo[4],int InfoType); -// we would like to have the same name for not windows -#define cpuid __cpuid - -#elif _ARCH_PPC -#error("PPC") -#else - -// Else we have to ask the CPU directly by executin cpuid. -// eax should contains the information querry argument. -// Then we have to take the results from the different registers. -// -// From : http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html -// -// asm ( assembler template -// : output operands // optional -// : input operands // optional -// : list of clobbered registers // optional -// ); -// -// +---+--------------------+ -// | r | Register(s) | -// +---+--------------------+ -// | a | %eax, %ax, %al | -// | b | %ebx, %bx, %bl | -// | c | %ecx, %cx, %cl | -// | d | %edx, %dx, %dl | -// | S | %esi, %si | -// | D | %edi, %di | -// +---+--------------------+ -// - - -// GCC Inline Assembly but with the same prototype as windows -void cpuid(int CPUInfo[4],int InfoType){ - __asm__ __volatile__ ( - "cpuid": // Execute this instruction - "=a" (CPUInfo[EaxRegister]), // Store eax in 0 - "=b" (CPUInfo[EbxRegister]), // Store ebx in 1 - "=c" (CPUInfo[EcxRegister]), // Store ecx in 2 - "=d" (CPUInfo[EdxRegister]) : // Store edx in 3 - "a" (InfoType) // Input InfoType in eax before instruction - ); -} - -#endif - -#ifndef _ARCH_PPC -bool CPUInfoGetEAX(const int CPUInfo[4], const int position){ - return (CPUInfo[EaxRegister] & ((int)1 << position)) != 0; -} - -bool CPUInfoGetEBX(const int CPUInfo[4], const int position){ - return (CPUInfo[EbxRegister] & ((int)1 << position)) != 0; -} - -bool CPUInfoGetECX(const int CPUInfo[4], const int position){ - return (CPUInfo[EcxRegister] & ((int)1 << position)) != 0; -} - -bool CPUInfoGetEDX(const int CPUInfo[4], const int position){ - return (CPUInfo[EdxRegister] & ((int)1 << position)) != 0; -} - -/////////////////////////////////////////////////////////////////////////// -// Part 2: -// Call the cpuid function and ask for particular information. -// In our case we want to use these information to print it (and later use it -// in a CMake file). -// So you can change this file to get more informations and do something else with them. -// -// From 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -// Or recommanded : AMD CPUID_Specification.pdf -// We know in part CPUID—CPU Identification that a call to the cpuid instruction fill the registers -// with the cpu property. -/////////////////////////////////////////////////////////////////////////// - -#include <string> -#include <list> - -struct CpuProperty { - CpuProperty(const char inName[], const bool IsEnable) - : name(inName), enabled(IsEnable){ - } - - std::string name; - bool enabled; -}; - -std::list<CpuProperty> getProperties(){ - std::list<CpuProperty> properties; - - // To store the registers value - int info[4]; - - // Basic CPUID Information - cpuid(info, 0); - // The largest CPUID standard-function input value supported by the processor implementation. - const int limitStandardFunction = info[EaxRegister]; - - // Extended Function CPUID Information - cpuid(info, 0x80000000); - // The largest CPUID extended-function input value supported by the processor implementation - int limitExtendedFunction = info[EaxRegister]; - - // Detect Instruction Set - if (limitStandardFunction >= 1){ - cpuid(info,0x00000001); // Basic CPUID Information - /* - 0x00000001 - EDX : - 31:29 Reserved. - 28 HTT: Hyper-Threading Technology. Indicates either that there is more than one thread per CPU core - or more than one CPU core per processor. AMD currently does not support more than one thread per - CPU core. See “Legacy Method” on page 23. - 27 Reserved. - 26 SSE2: SSE2 extensions. See Appendix D “CPUID Feature Sets” in APM3. - 25 SSE: SSE extensions. See Appendix D “CPUID Feature Sets” in APM3 appendix and “64-Bit Media - Programming” in APM1. - 24 FXSR: FXSAVE and FXRSTOR instructions. See “FXSAVE” and “FXRSTOR” in APM4. - 23 MMX: MMXTM instructions. See Appendix D “CPUID Feature Sets” in APM3 and “128-Bit Media - and Scientific Programming” in APM1. - 22:20 Reserved. - 19 18 Reserved. - 17 PSE36: Page-size extensions. The PDE[20:13] supplies physical address [39:32]. See “Page Translation and Protection” in APM2. - 16 PAT: Page attribute table. PCD, PWT, and PATi are used to alter memory type. See “Page-Attribute Table Mechanism” in APM2. - 15 CMOV: Conditional move instructions, CMOV, FCMOV. See “CMOV”, “FCMOV” in APM3. - 14 MCA: Machine check architecture, MCG_CAP. See “Machine Check Mechanism” in APM2. - 13 PGE: Page global extension, CR4.PGE. See “Page Translation and Protection” in APM2. - 12 MTRR: Memory-type range registers. MTRRcap supported. See “Page Translation and Protection” in APM2. - 11 SysEnterSysExit: SYSENTER and SYSEXIT instructions. See “SYSENTER”, “SYSEXIT“ in APM3. - 10 Reserved - 9 APIC. Advanced programmable interrupt controller (APIC) exists and is enabled. See “Exceptions - and Interrupts” in APM2. - 8 CMPXCHG8B: CMPXCHG8B instruction. See “CMPXCHG8B” in APM3. - 7 MCE: Machine check exception, CR4.MCE. See “Machine Check Mechanism” in APM2. - 6 PAE: Physical-address extensions (PAE), support for physical addresses ≥ 32b. Number of physical - address bits above 32b is implementation specific. See “Page Translation and Protection” in APM2. - 5 MSR: AMD model-specific registers (MSRs), with RDMSR and WRMSR instructions. See “Model - Specific Registers” in APM2. - 4 TSC: Time stamp counter. RDTSC and RDTSCP instruction support. See “Debug and Performance - Resources” in APM2. - 3 PSE: Page-size extensions (4 MB pages). See “Page Translation and Protection” in APM2. - 2 DE: Debugging extensions, I/O breakpoints, CR4.DE. See “Debug and Performance Resources” in - APM2. - 1 VME: Virtual-mode enhancements, CR4.VME, CR4.PVI, software interrupt indirection, expansion - of the TSS with the software, indirection bitmap, EFLAGS.VIF, EFLAGS.VIP. See “System - Resources” in APM2. - 0 FPU: x87 floating point unit on-chip. See “x87 Floating Point Programming” in APM1 - */ - - properties.push_back(CpuProperty("MMX", CPUInfoGetEDX(info, 23))); - properties.push_back(CpuProperty("SSE", CPUInfoGetEDX(info, 25))); - properties.push_back(CpuProperty("SSE2", CPUInfoGetEDX(info, 26))); - - /* - 0x00000001 - ECX : - 0 SSE3 Streaming SIMD Extensions 3 (SSE3). A value of 1 indicates the processor supports this - technology. - 1 PCLMULQDQ PCLMULQDQ. A value of 1 indicates the processor supports the PCLMULQDQ instruction - 2 DTES64 64-bit DS Area. A value of 1 indicates the processor supports DS area using 64-bit layout - 3 MONITOR MONITOR/MWAIT. A value of 1 indicates the processor supports this feature. - 4 DS-CPL CPL Qualified Debug Store. A value of 1 indicates the processor supports the extensions to the - Debug Store feature to allow for branch message storage qualified by CPL. - 5 VMX Virtual Machine Extensions. A value of 1 indicates that the processor supports this technology - 6 SMX Safer Mode Extensions. A value of 1 indicates that the processor supports this technology. See - Chapter 5, “Safer Mode Extensions Reference”. - 7 EIST Enhanced Intel SpeedStep® technology. A value of 1 indicates that the processor supports this - technology. - 8 TM2 Thermal Monitor 2. A value of 1 indicates whether the processor supports this technology. - 9 SSSE3 A value of 1 indicates the presence of the Supplemental Streaming SIMD Extensions 3 (SSSE3). A - value of 0 indicates the instruction extensions are not present in the processor - 10 CNXT-ID L1 Context ID. A value of 1 indicates the L1 data cache mode can be set to either adaptive mode - or shared mode. A value of 0 indicates this feature is not supported. See definition of the - IA32_MISC_ENABLE MSR Bit 24 (L1 Data Cache Context Mode) for details. - 11 SDBG A value of 1 indicates the processor supports IA32_DEBUG_INTERFACE MSR for silicon debug. - 12 FMA A value of 1 indicates the processor supports FMA extensions using YMM state. - 13 CMPXCHG16B CMPXCHG16B Available. A value of 1 indicates that the feature is available. See the - “CMPXCHG8B/CMPXCHG16B—Compare and Exchange Bytes” section in this chapter for a - description. - 14 xTPR Update - Control - xTPR Update Control. A value of 1 indicates that the processor supports changing - IA32_MISC_ENABLE[bit 23]. - 15 PDCM Perfmon and Debug Capability: A value of 1 indicates the processor supports the performance - and debug feature indication MSR IA32_PERF_CAPABILITIES. - 16 Reserved Reserved - 17 PCID Process-context identifiers. A value of 1 indicates that the processor supports PCIDs and that - software may set CR4.PCIDE to 1. - 18 DCA A value of 1 indicates the processor supports the ability to prefetch data from a memory mapped - device. - 19 SSE4.1 A value of 1 indicates that the processor supports SSE4.1. - 20 SSE4.2 A value of 1 indicates that the processor supports SSE4.2. - 21 x2APIC A value of 1 indicates that the processor supports x2APIC feature. - 22 MOVBE A value of 1 indicates that the processor supports MOVBE instruction. - 23 POPCNT A value of 1 indicates that the processor supports the POPCNT instruction. - 24 TSC-Deadline A value of 1 indicates that the processor’s local APIC timer supports one-shot operation using a - TSC deadline value. - 25 AESNI A value of 1 indicates that the processor supports the AESNI instruction extensions. - 26 XSAVE A value of 1 indicates that the processor supports the XSAVE/XRSTOR processor extended states - feature, the XSETBV/XGETBV instructions, and XCR0. - 27 OSXSAVE A value of 1 indicates that the OS has set CR4.OSXSAVE[bit 18] to enable the XSAVE feature set. - 28 AVX A value of 1 indicates the processor supports the AVX instruction extensions. - 29 F16C A value of 1 indicates that processor supports 16-bit floating-point conversion instructions. - 30 RDRAND A value of 1 indicates that processor supports RDRAND instruction. - 31 Not Used Always returns 0. - */ - properties.push_back(CpuProperty("SSE3", CPUInfoGetECX(info, 0))); - properties.push_back(CpuProperty("SSSE3", CPUInfoGetECX(info, 9))); - properties.push_back(CpuProperty("SSE41", CPUInfoGetECX(info, 19))); - properties.push_back(CpuProperty("SSE42", CPUInfoGetECX(info, 20))); - properties.push_back(CpuProperty("AVX", CPUInfoGetECX(info, 28))); - properties.push_back(CpuProperty("FMA3", CPUInfoGetECX(info, 12))); - } - - if (limitExtendedFunction >= 0x80000001){ - cpuid(info,0x80000001); // Extended Function CPUID Information - /* - 0x80000001 - EDX : - 31 3DNow: 3DNow!TM instructions. See Appendix D “Instruction Subsets and CPUID Feature Sets” in APM3. - 30 3DNowExt: AMD extensions to 3DNow! instructions. See Appendix D “Instruction Subsets and - CPUID Feature Sets” in APM3. - 29 LM: Long mode. See “Processor Initialization and Long-Mode Activation” in APM2. - 28 Reserved. - 27 RDTSCP: RDTSCP instruction. See “RDTSCP” in APM3. - 26 Page1GB: 1-GB large page support. See “1-GB Paging Support” in APM2. - 25 FFXSR: FXSAVE and FXRSTOR instruction optimizations. See “FXSAVE” and “FXRSTOR” in APM4. - 24 FXSR: FXSAVE and FXRSTOR instructions. Same as CPUID Fn0000_0001_EDX[FXSR]. - 23 MMX: MMXTM instructions. Same as CPUID Fn0000_0001_EDX[MMX]. - 22 MmxExt: AMD extensions to MMX instructions. See Appendix D “Instruction Subsets and CPUID - Feature Sets” in APM3 and “128-Bit Media and Scientific Programming” in APM1. - 21 Reserved. - 20 NX: No-execute page protection. See “Page Translation and Protection” in APM2. - 19:18 Reserved. - 17 PSE36: Page-size extensions. Same as CPUID Fn0000_0001_EDX[PSE36]. - 16 PAT: Page attribute table. Same as CPUID Fn0000_0001_EDX[PAT]. - 15 CMOV: Conditional move instructions. Same as CPUID Fn0000_0001_EDX[CMOV] - 14 MCA: Machine check architecture. Same as CPUID Fn0000_0001_EDX[MCA]. - 13 PGE: Page global extension. Same as CPUID Fn0000_0001_EDX[PGE]. - 12 MTRR: Memory-type range registers. Same as CPUID Fn0000_0001_EDX[MTRR]. - 11 SysCallSysRet: SYSCALL and SYSRET instructions. See “SYSCALL” and “SYSRET” in APM3. - 10 Reserved. - 9 APIC. Advanced programmable interrupt controller. Same as CPUID Fn0000_0001_EDX[APIC]. - 8 CMPXCHG8B: CMPXCHG8B instruction. Same as CPUID Fn0000_0001_EDX[CMPXCHG8B]. - 7 MCE: Machine check exception. Same as CPUID Fn0000_0001_EDX[MCE]. - 6 PAE: Physical-address extensions. Same as CPUID Fn0000_0001_EDX[PAE]. - 5 MSR: AMD model-specific registers. Same as CPUID Fn0000_0001_EDX[MSR]. - 4 TSC: Time stamp counter. Same as CPUID Fn0000_0001_EDX[TSC]. - 3 PSE: Page-size extensions. Same as CPUID Fn0000_0001_EDX[PSE]. - 2 DE: Debugging extensions. Same as CPUID Fn0000_0001_EDX[DE]. - 1 VME: Virtual-mode enhancements. Same as CPUID Fn0000_0001_EDX[VME]. - 0 FPU: x87 floating-point unit on-chip. Same as CPUID Fn0000_0001_EDX[FPU]. - */ - properties.push_back(CpuProperty("x64", CPUInfoGetEDX(info, 29))); - /* - 0x80000001 - ECX : - 31:14 Reserved. - 13 WDT: Watchdog timer support. - 12 SKINIT: SKINIT, STGI, and DEV support. - 11:10 Reserved. - 9 OSVW: OS visible workaround. Indicates OS-visible workaround support. See “OS Visible Work- - around (OSVW) Information” in APM2. - 8 3DNowPrefetch: PREFETCH and PREFETCHW instruction support. See “PREFETCH” and “PREFETCHW” in APM3. - 7 MisAlignSse: Misaligned SSE mode. See “Misaligned Access Support Added for SSE Instructions” - in APM1. - 6 SSE4A: EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support. See “EXTRQ”, - “INSERTQ”, “MOVNTSS”, and “MOVNTSD” in APM4. - 5 ABM: Advanced bit manipulation. LZCNT instruction support. See “LZCNT” in APM3. - 4 AltMovCr8: LOCK MOV CR0 means MOV CR8. See “MOV(CRn)” in APM3. - 3 ExtApicSpace: This bit indicates the presence of extended APIC register space starting at offset - 400h from the “APIC Base Address Register,” as specified in the BKDG.. - 2 SVM: Secure virtual machine feature. See “Secure Virtual Machine” in APM2. - 1 CmpLegacy: Core multi-processing legacy mode. See “Legacy Method” on page 23. - 0 LahfSahf: LAHF and SAHF instruction support in 64-bit mode. See “LAHF” and “SAHF” in APM3. - */ - properties.push_back(CpuProperty("SSE4a", CPUInfoGetECX(info, 6))); - properties.push_back(CpuProperty("FMA4", CPUInfoGetECX(info, 16))); - properties.push_back(CpuProperty("XOP", CPUInfoGetECX(info, 11))); - } - - return properties; -} -#else -// POWER (IBM) -#endif - - -/////////////////////////////////////////////////////////////////////////// -// Part 3: -// Print the information in a format to use it with CMake -/////////////////////////////////////////////////////////////////////////// - -#include <iostream> - -int main(){ - const std::list<CpuProperty> properties = getProperties(); - - const std::list<CpuProperty>::const_iterator endIterProperties = properties.end(); - for(std::list<CpuProperty>::const_iterator iterProperties = properties.begin() - ; iterProperties != endIterProperties - ; ++iterProperties){ - // Print the status - std::cout << (*iterProperties).name << "=" << ((*iterProperties).enabled?"TRUE":"FALSE") << ";"; - } - - return 0; -} diff --git a/README.md b/README.md index a8a8f525e55f6bbd260402481f6d898075142353..7c6c11fb63784ea5c3d2d2cd926d8fcec501806f 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ The following are optional: ### Get and Build ScalFMM To use last development states of ScalFMM, please clone the develop - branch. Note that ScalFMM contains a git submodule `morse_cmake`. + branch. Note that ScalFMM contains two git submodules `morse_cmake` and `inastemp`. To get sources please use these commands: ``` bash git clone --recursive git@gitlab.inria.fr:solverstack/ScalFMM.git -b develop diff --git a/Src/Adaptive/FAdaptChebKernel.hpp b/Src/Adaptive/FAdaptChebKernel.hpp index 1c3f753436b458b108f27503eb58221686b0b86b..2ed9b00e26e568b280f1e9a8d1e2b00b3fac9649 100644 --- a/Src/Adaptive/FAdaptChebKernel.hpp +++ b/Src/Adaptive/FAdaptChebKernel.hpp @@ -95,13 +95,13 @@ public: // apply P2L for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){ for (unsigned int m = 0; m<FBase::nnodes; ++m) { - ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX()); - ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY()); - ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ()); + ComputeClass XX = ComputeClass(X[m].getX()); + ComputeClass XY = ComputeClass(X[m].getY()); + ComputeClass XZ = ComputeClass(X[m].getZ()); std::size_t idxPart = 0; // Compute using vectorization for all but the last array elements - ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>(); + ComputeClass tmpLocalExp = ComputeClass(0); for (; idxPart < ((particles->getNbParticles()) / FRealCount); @@ -114,7 +114,7 @@ public: * physicalValues[idxPart]; } - local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp); + local->get(idxRhs)[m] += ComputeClass(tmpLocalExp); // Compute the last array elements one by one if they exist if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) { @@ -172,11 +172,11 @@ public: for (unsigned int n=0; n<FBase::nnodes; ++n){ ComputeClass MultipoleExpansion = - FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]); + ComputeClass(pole->get(idxRhs)[n]); - ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX()); - ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY()); - ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ()); + ComputeClass YX = ComputeClass(Y[n].getX()); + ComputeClass YY = ComputeClass(Y[n].getY()); + ComputeClass YZ = ComputeClass(Y[n].getZ()); for(std::size_t idxPart = 0; idxPart < ( (particles->getNbParticles() + FRealCount - 1) diff --git a/Src/Adaptive/FAdaptUnifKernel.hpp b/Src/Adaptive/FAdaptUnifKernel.hpp index 16f7254e1212c8bd49c91937bb26b1af3cf3ccfd..75063ba960d135598deba4b5faefd9f8999953e5 100644 --- a/Src/Adaptive/FAdaptUnifKernel.hpp +++ b/Src/Adaptive/FAdaptUnifKernel.hpp @@ -140,11 +140,11 @@ public: // apply P2L for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){ for (unsigned int m = 0; m < FBase::nnodes; ++m) { - ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX()); - ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY()); - ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ()); + ComputeClass XX = ComputeClass(X[m].getX()); + ComputeClass XY = ComputeClass(X[m].getY()); + ComputeClass XZ = ComputeClass(X[m].getZ()); - ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>(); + ComputeClass tmpLocalExp = ComputeClass(0); // Compute using vectorization for all but the last array elements std::size_t idxPart = 0; for (; idxPart < (particles->getNbParticles() / FRealCount); @@ -157,7 +157,7 @@ public: * physicalValues[idxPart]; } - local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp); + local->get(idxRhs)[m] += FReal(tmpLocalExp); // Compute the last array elements one by one if they exist if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) { @@ -221,11 +221,11 @@ public: for (unsigned int n=0; n<FBase::nnodes; ++n){ ComputeClass MultipoleExpansion = - FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]); + ComputeClass(pole->get(idxRhs)[n]); - ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX()); - ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY()); - ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ()); + ComputeClass YX = ComputeClass(Y[n].getX()); + ComputeClass YY = ComputeClass(Y[n].getY()); + ComputeClass YZ = ComputeClass(Y[n].getZ()); for(std::size_t idxPart = 0; idxPart < ( (particles->getNbParticles() + FRealCount - 1) diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp index eee694ac1aadfcd7b912fd9cd6a7913952489d8a..3f0b1f9f298c6089c903a6e9dd224339071b3067 100644 --- a/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp +++ b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp @@ -89,7 +89,7 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal> const ValueClass diffx = (xt-xs); const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); - return FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz); + return ValueClass(1) / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz); } // evaluate interaction (blockwise) @@ -110,7 +110,7 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal> const ValueClass diffx = (xt-xs); const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); - const ValueClass one_over_r = FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz); + const ValueClass one_over_r = ValueClass(1) / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz); const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r; @@ -176,9 +176,9 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{ const ValueClass diffx = (xt-xs); const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); - return FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx + - FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy + - FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz); + return ValueClass(1) / FMath::Sqrt(ValueClass(LX)*diffx*diffx + + ValueClass(LY)*diffy*diffy + + ValueClass(LZ)*diffz*diffz); } void setCoeff(const FReal& a, const FReal& b, const FReal& c) {LX= a*a ; LY = b*b ; LZ = c *c;} @@ -208,16 +208,16 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{ const ValueClass diffx = (xt-xs); const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); - const ValueClass one_over_rL = FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx + - FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy + - FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz); + const ValueClass one_over_rL = ValueClass(1) / (ValueClass(LX)*diffx*diffx + + ValueClass(LY)*diffy*diffy + + ValueClass(LZ)*diffz*diffz); const ValueClass one_over_rL3 = one_over_rL*one_over_rL*one_over_rL; block[0] = one_over_rL; - blockDerivative[0] = FMath::ConvertTo<ValueClass,FReal>(LX) * one_over_rL3 * diffx; - blockDerivative[1] = FMath::ConvertTo<ValueClass,FReal>(LY)* one_over_rL3 * diffy; - blockDerivative[2] = FMath::ConvertTo<ValueClass,FReal>(LZ)* one_over_rL3 * diffz; + blockDerivative[0] = ValueClass(LX) * one_over_rL3 * diffx; + blockDerivative[1] = ValueClass(LY)* one_over_rL3 * diffy; + blockDerivative[2] = ValueClass(LZ)* one_over_rL3 * diffz; } @@ -283,7 +283,7 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal> const ValueClass diffx = (xt-xs); const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); - return FMath::One<ValueClass>() / FReal(diffx*diffx+diffy*diffy+diffz*diffz); + return ValueClass(1) / FReal(diffx*diffx+diffy*diffy+diffz*diffz); } // evaluate interaction (blockwise) @@ -305,12 +305,12 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal> const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz); - const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2); + const ValueClass one_over_r2 = ValueClass(1) / (r2); const ValueClass one_over_r4 = one_over_r2*one_over_r2; block[0] = one_over_r2; - const ValueClass coef = FMath::ConvertTo<ValueClass,FReal>(-2.) * one_over_r4; + const ValueClass coef = ValueClass(-2.) * one_over_r4; blockDerivative[0] = coef * diffx; blockDerivative[1] = coef * diffy; blockDerivative[2] = coef * diffz; @@ -382,7 +382,7 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal> const ValueClass diffz = (zt-zs); const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz); const ValueClass r3 = r*r*r; - const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3); + const ValueClass one_over_r6 = ValueClass(1) / (r3*r3); //return one_over_r6 * one_over_r6; //return one_over_r6; return one_over_r6 * one_over_r6 - one_over_r6; @@ -409,12 +409,12 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal> const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz); const ValueClass r2 = r*r; const ValueClass r3 = r2*r; - const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3); + const ValueClass one_over_r6 = ValueClass(1) / (r3*r3); const ValueClass one_over_r8 = one_over_r6 / (r2); block[0] = one_over_r6 * one_over_r6 - one_over_r6; - const FReal coef = FMath::ConvertTo<ValueClass,FReal>(12.0)*one_over_r6*one_over_r8 - FMath::ConvertTo<ValueClass,FReal>(6.0)*one_over_r8; + const FReal coef = ValueClass(12.0)*one_over_r6*one_over_r8 - ValueClass(6.0)*one_over_r8; blockDerivative[0]= coef * diffx; blockDerivative[1]= coef * diffy; blockDerivative[2]= coef * diffz; @@ -493,7 +493,7 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal> const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz); - return FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth)); + return ValueClass(1) / (r2 + ValueClass(CoreWidth)); } // evaluate interaction (blockwise) @@ -515,13 +515,13 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal> const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz); - const ValueClass one_over_a_plus_r2 = FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth)); + const ValueClass one_over_a_plus_r2 = ValueClass(1) / (r2 + ValueClass(CoreWidth)); const ValueClass one_over_a_plus_r2_squared = one_over_a_plus_r2*one_over_a_plus_r2; block[0] = one_over_a_plus_r2; // TODO Fix derivative - const ValueClass coef = FMath::ConvertTo<ValueClass,FReal>(-2.) * one_over_a_plus_r2_squared; + const ValueClass coef = ValueClass(-2.) * one_over_a_plus_r2_squared; blockDerivative[0] = coef * diffx; blockDerivative[1] = coef * diffy; blockDerivative[2] = coef * diffz; diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp index 97894794ca2900d89fcce38bd5865174614a2cd3..38f7104099a17615ddfbc0e035ad5ff85ab692e8 100644 --- a/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp +++ b/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp @@ -120,13 +120,13 @@ struct FInterpMatrixKernelGauss : FAbstractCorrelationKernel<FReal> { const ValueClass diff[3] = {(x1-x2),(y1-y2),(z1-z2)}; - ValueClass dist2 = FMath::Zero<ValueClass>(); + ValueClass dist2 = ValueClass(0.); for(int d=0; d<3; ++d){ - const ValueClass distX = diff[d] / FMath::ConvertTo<ValueClass,FReal>(lengthScale_); + const ValueClass distX = diff[d] / ValueClass(lengthScale_); dist2 += distX*distX; } - return FMath::Exp(FMath::ConvertTo<ValueClass,FReal>(-0.5)*dist2); + return FMath::Exp(ValueClass(-0.5)*dist2); } @@ -145,7 +145,7 @@ struct FInterpMatrixKernelGauss : FAbstractCorrelationKernel<FReal> ValueClass block[1], ValueClass blockDerivative[3]) const { block[0]=this->evaluate(x1,y1,z1,x2,y2,z2); - const ValueClass lengthScaleOpt = FMath::ConvertTo<ValueClass,FReal>(-1/(lengthScale_*lengthScale_)); + const ValueClass lengthScaleOpt = ValueClass(-1/(lengthScale_*lengthScale_)); blockDerivative[0] = block[0]*(x1-x2) * lengthScaleOpt; blockDerivative[1] = block[0]*(y1-y2) * lengthScaleOpt; blockDerivative[2] = block[0]*(z1-z2) * lengthScaleOpt; diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp index 7365e70805926ff5381751934beb4d77048430bd..3b86102f213b485bc75139d1f6249bde0ba94a33 100644 --- a/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp +++ b/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp @@ -161,7 +161,7 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal> const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz; - const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2)); + const ValueClass one_over_r = ValueClass(1)/FMath::Sqrt(r2 + ValueClass(_CoreWidth2)); const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r; ValueClass ri,rj; @@ -192,7 +192,7 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal> const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz; - const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2)); + const ValueClass one_over_r = ValueClass(1)/FMath::Sqrt(r2 + ValueClass(_CoreWidth2)); const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r; const ValueClass r[3] = {diffx,diffy,diffz}; @@ -219,14 +219,14 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal> const ValueClass diffy = (yt-ys); const ValueClass diffz = (zt-zs); const ValueClass r2[3] = {diffx*diffx,diffy*diffy,diffz*diffz}; - const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2[0] + r2[1] + r2[2] + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2)); + const ValueClass one_over_r2 = ValueClass(1) / (r2[0] + r2[1] + r2[2] + ValueClass(_CoreWidth2)); const ValueClass one_over_r = FMath::Sqrt(one_over_r2); const ValueClass one_over_r3 = one_over_r2*one_over_r; const ValueClass r[3] = {diffx,diffy,diffz}; - const ValueClass Three = FMath::ConvertTo<ValueClass,FReal>(3.); - const ValueClass MinusOne = - FMath::One<ValueClass>(); + const ValueClass Three = ValueClass(3.); + const ValueClass MinusOne = - ValueClass(1); for(unsigned int d=0;d<NCMP;++d){ unsigned int i = indexTab[d]; diff --git a/Src/Kernels/P2P/FP2P.hpp b/Src/Kernels/P2P/FP2P.hpp index 93c7a8ab964780d601c4607b48379c9537672d1f..ffd98abe0e9114bb62f3927dec225d94f27438aa 100644 --- a/Src/Kernels/P2P/FP2P.hpp +++ b/Src/Kernels/P2P/FP2P.hpp @@ -131,14 +131,14 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, ComputeClass*const sourcesPotentials = (ComputeClass*)inNeighbors[idxNeighbors]->getPotentials(); for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){ - const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]); - const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]); - const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]); - const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]); - ComputeClass tfx = FMath::Zero<ComputeClass>(); - ComputeClass tfy = FMath::Zero<ComputeClass>(); - ComputeClass tfz = FMath::Zero<ComputeClass>(); - ComputeClass tpo = FMath::Zero<ComputeClass>(); + const ComputeClass tx = ComputeClass(&targetsX[idxTarget]); + const ComputeClass ty = ComputeClass(&targetsY[idxTarget]); + const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]); + const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]); + ComputeClass tfx = ComputeClass(0.); + ComputeClass tfy = ComputeClass(0.); + ComputeClass tfz = ComputeClass(0.); + ComputeClass tpo = ComputeClass(0.); for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){ ComputeClass Kxy[1]; @@ -146,7 +146,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz, sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource], Kxy,dKxy); - const ComputeClass mutual_coeff = FMath::ConvertTo<ComputeClass, FReal>(MatrixKernel->getMutualCoefficient());; // 1 if symmetric; -1 if antisymmetric + const ComputeClass mutual_coeff = ComputeClass(MatrixKernel->getMutualCoefficient());; // 1 if symmetric; -1 if antisymmetric const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]); @@ -165,10 +165,10 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, sourcesPotentials[idxSource] += mutual_coeff * Kxy[0] * tv; } - targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx); - targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy); - targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz); - targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo); + targetsForcesX[idxTarget] += tfx.horizontalSum(); + targetsForcesY[idxTarget] += tfy.horizontalSum(); + targetsForcesZ[idxTarget] += tfz.horizontalSum(); + targetsPotentials[idxTarget] += tpo.horizontalSum(); } } } @@ -201,14 +201,14 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix ComputeClass*const sourcesPotentials = (ComputeClass*)targetsPotentials; for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){ - const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]); - const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]); - const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]); - const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]); - ComputeClass tfx = FMath::Zero<ComputeClass>(); - ComputeClass tfy = FMath::Zero<ComputeClass>(); - ComputeClass tfz = FMath::Zero<ComputeClass>(); - ComputeClass tpo = FMath::Zero<ComputeClass>(); + const ComputeClass tx = ComputeClass(&targetsX[idxTarget]); + const ComputeClass ty = ComputeClass(&targetsY[idxTarget]); + const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]); + const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]); + ComputeClass tfx = ComputeClass(0.); + ComputeClass tfy = ComputeClass(0.); + ComputeClass tfz = ComputeClass(0.); + ComputeClass tpo = ComputeClass(0.); for(FSize idxSource = (idxTarget+NbFRealInComputeClass)/NbFRealInComputeClass ; idxSource < nbParticlesSources ; ++idxSource){ ComputeClass Kxy[1]; @@ -216,7 +216,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz, sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource], Kxy,dKxy); - const ComputeClass mutual_coeff = FMath::ConvertTo<ComputeClass, FReal>(MatrixKernel->getMutualCoefficient()); // 1 if symmetric; -1 if antisymmetric + const ComputeClass mutual_coeff = ComputeClass(MatrixKernel->getMutualCoefficient()); // 1 if symmetric; -1 if antisymmetric const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]); @@ -227,18 +227,18 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix tfx += dKxy[0]; tfy += dKxy[1]; tfz += dKxy[2]; - tpo = FMath::FMAdd(Kxy[0],sourcesPhysicalValues[idxSource],tpo); + tpo += Kxy[0]*sourcesPhysicalValues[idxSource]; sourcesForcesX[idxSource] -= dKxy[0]; sourcesForcesY[idxSource] -= dKxy[1]; sourcesForcesZ[idxSource] -= dKxy[2]; - sourcesPotentials[idxSource] = FMath::FMAdd(mutual_coeff * Kxy[0],tv,sourcesPotentials[idxSource]); + sourcesPotentials[idxSource] += (mutual_coeff * Kxy[0])*tv; } - targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx); - targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy); - targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz); - targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo); + targetsForcesX[idxTarget] += tfx.horizontalSum(); + targetsForcesY[idxTarget] += tfy.horizontalSum(); + targetsForcesZ[idxTarget] += tfz.horizontalSum(); + targetsPotentials[idxTarget] += tpo.horizontalSum(); } } @@ -295,14 +295,14 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C const ComputeClass*const sourcesZ = (const ComputeClass*)inNeighbors[idxNeighbors]->getPositions()[2]; for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){ - const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]); - const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]); - const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]); - const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]); - ComputeClass tfx = FMath::Zero<ComputeClass>(); - ComputeClass tfy = FMath::Zero<ComputeClass>(); - ComputeClass tfz = FMath::Zero<ComputeClass>(); - ComputeClass tpo = FMath::Zero<ComputeClass>(); + const ComputeClass tx = ComputeClass(&targetsX[idxTarget]); + const ComputeClass ty = ComputeClass(&targetsY[idxTarget]); + const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]); + const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]); + ComputeClass tfx = ComputeClass(0.); + ComputeClass tfy = ComputeClass(0.); + ComputeClass tfz = ComputeClass(0.); + ComputeClass tpo = ComputeClass(0.); for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){ ComputeClass Kxy[1]; @@ -322,10 +322,10 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C tpo += Kxy[0] * sourcesPhysicalValues[idxSource]; } - targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx); - targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy); - targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz); - targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo); + targetsForcesX[idxTarget] += tfx.horizontalSum(); + targetsForcesY[idxTarget] += tfy.horizontalSum(); + targetsForcesZ[idxTarget] += tfz.horizontalSum(); + targetsPotentials[idxTarget] += tpo.horizontalSum(); } } } @@ -337,107 +337,26 @@ template <class FReal> struct FP2PT{ }; -#if defined(SCALFMM_USE_AVX) -template <> -struct FP2PT<double>{ - template <class ContainerClass, class MatrixKernelClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } - - - template <class ContainerClass, class MatrixKernelClass> - static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } -}; - -template <> -struct FP2PT<float>{ - template <class ContainerClass, class MatrixKernelClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, MatrixKernel); - } +#include "InastempCompileConfig.h" - template <class ContainerClass, class MatrixKernelClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } -}; -#elif defined(SCALFMM_USE_AVX2) template <> struct FP2PT<double>{ template <class ContainerClass, class MatrixKernelClass> static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } -}; - -template <> -struct FP2PT<float>{ - template <class ContainerClass, class MatrixKernelClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); + FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); } template <class ContainerClass, class MatrixKernelClass> static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, MatrixKernel); + FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, MatrixKernel); } template <class ContainerClass, class MatrixKernelClass> static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } -}; -#elif defined(SCALFMM_USE_SSE) -template <> -struct FP2PT<double>{ - template <class ContainerClass, class MatrixKernelClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } - - - template <class ContainerClass, class MatrixKernelClass> - static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); + FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); } }; @@ -446,62 +365,21 @@ struct FP2PT<float>{ template <class ContainerClass, class MatrixKernelClass> static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void Inner(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } -}; -#else -template <> -struct FP2PT<double>{ - template <class ContainerClass, class MatrixKernelClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); + FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); } template <class ContainerClass, class MatrixKernelClass> static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, MatrixKernel); + FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, MatrixKernel); } template <class ContainerClass, class MatrixKernelClass> static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); + FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); } }; -template <> -struct FP2PT<float>{ - template <class ContainerClass, class MatrixKernelClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, MatrixKernel); - } - - template <class ContainerClass, class MatrixKernelClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){ - FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel); - } -}; -#endif #include "FP2PTensorialKij.hpp" diff --git a/Src/Kernels/P2P/FP2PR.hpp b/Src/Kernels/P2P/FP2PR.hpp index f9e83c4f1bf0f99e4f4a0969423430016ca0fc70..dbdc49fc70a94ef1ae52a17d5c4d85b28b7a0674 100644 --- a/Src/Kernels/P2P/FP2PR.hpp +++ b/Src/Kernels/P2P/FP2PR.hpp @@ -81,7 +81,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain FReal*const targetsForcesZ = inTargets->getForcesZ(); FReal*const targetsPotentials = inTargets->getPotentials(); - const ComputeClass mOne = FMath::One<ComputeClass>(); + const ComputeClass mOne = ComputeClass(1); for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){ if( inNeighbors[idxNeighbors] ){ @@ -96,14 +96,14 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain ComputeClass*const sourcesPotentials = (ComputeClass*)inNeighbors[idxNeighbors]->getPotentials(); for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){ - const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]); - const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]); - const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]); - const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]); - ComputeClass tfx = FMath::Zero<ComputeClass>(); - ComputeClass tfy = FMath::Zero<ComputeClass>(); - ComputeClass tfz = FMath::Zero<ComputeClass>(); - ComputeClass tpo = FMath::Zero<ComputeClass>(); + const ComputeClass tx = ComputeClass(&targetsX[idxTarget]); + const ComputeClass ty = ComputeClass(&targetsY[idxTarget]); + const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]); + const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]); + ComputeClass tfx = ComputeClass(0.); + ComputeClass tfy = ComputeClass(0.); + ComputeClass tfz = ComputeClass(0.); + ComputeClass tpo = ComputeClass(0.); for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){ ComputeClass dx = tx - sourcesX[idxSource]; @@ -111,7 +111,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain ComputeClass dz = tz - sourcesZ[idxSource]; ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz); - const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance); + const ComputeClass inv_distance = inv_square_distance.sqrt(); inv_square_distance *= inv_distance; inv_square_distance *= tv * sourcesPhysicalValues[idxSource]; @@ -131,10 +131,10 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain sourcesPotentials[idxSource] += inv_distance * tv; } - targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx); - targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy); - targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz); - targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo); + targetsForcesX[idxTarget] += tfx.horizontalSum(); + targetsForcesY[idxTarget] += tfy.horizontalSum(); + targetsForcesZ[idxTarget] += tfz.horizontalSum(); + targetsPotentials[idxTarget] += tpo.horizontalSum(); } } } @@ -153,7 +153,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){ FReal*const targetsForcesZ = inTargets->getForcesZ(); FReal*const targetsPotentials = inTargets->getPotentials(); - const ComputeClass mOne = FMath::One<ComputeClass>(); + const ComputeClass mOne = ComputeClass(1); {//In this part, we compute (vectorially) the interaction //within the target leaf. @@ -169,14 +169,14 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){ ComputeClass*const sourcesPotentials = (ComputeClass*)targetsPotentials; for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){ - const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]); - const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]); - const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]); - const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]); - ComputeClass tfx = FMath::Zero<ComputeClass>(); - ComputeClass tfy = FMath::Zero<ComputeClass>(); - ComputeClass tfz = FMath::Zero<ComputeClass>(); - ComputeClass tpo = FMath::Zero<ComputeClass>(); + const ComputeClass tx = ComputeClass(&targetsX[idxTarget]); + const ComputeClass ty = ComputeClass(&targetsY[idxTarget]); + const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]); + const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]); + ComputeClass tfx = ComputeClass(0.); + ComputeClass tfy = ComputeClass(0.); + ComputeClass tfz = ComputeClass(0.); + ComputeClass tpo = ComputeClass(0.); for(FSize idxSource = (idxTarget+NbFRealInComputeClass)/NbFRealInComputeClass ; idxSource < nbParticlesSources ; ++idxSource){ @@ -184,7 +184,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){ ComputeClass dy = ty - sourcesY[idxSource]; ComputeClass dz = tz - sourcesZ[idxSource]; ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz); - const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance); + const ComputeClass inv_distance = inv_square_distance.sqrt(); inv_square_distance *= inv_distance; inv_square_distance *= tv * sourcesPhysicalValues[idxSource]; @@ -204,10 +204,10 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){ sourcesPotentials[idxSource] += inv_distance * tv; } - targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx); - targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy); - targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz); - targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo); + targetsForcesX[idxTarget] += tfx.horizontalSum(); + targetsForcesY[idxTarget] += tfy.horizontalSum(); + targetsForcesZ[idxTarget] += tfz.horizontalSum(); + targetsPotentials[idxTarget] += tpo.horizontalSum(); } } @@ -255,7 +255,7 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C FReal*const targetsForcesZ = inTargets->getForcesZ(); FReal*const targetsPotentials = inTargets->getPotentials(); - const ComputeClass mOne = FMath::One<ComputeClass>(); + const ComputeClass mOne = ComputeClass(1); for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){ if( inNeighbors[idxNeighbors] ){ @@ -266,14 +266,14 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C const ComputeClass*const sourcesZ = (const ComputeClass*)inNeighbors[idxNeighbors]->getPositions()[2]; for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){ - const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]); - const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]); - const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]); - const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]); - ComputeClass tfx = FMath::Zero<ComputeClass>(); - ComputeClass tfy = FMath::Zero<ComputeClass>(); - ComputeClass tfz = FMath::Zero<ComputeClass>(); - ComputeClass tpo = FMath::Zero<ComputeClass>(); + const ComputeClass tx = ComputeClass(&targetsX[idxTarget]); + const ComputeClass ty = ComputeClass(&targetsY[idxTarget]); + const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]); + const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]); + ComputeClass tfx = ComputeClass(0.); + ComputeClass tfy = ComputeClass(0.); + ComputeClass tfz = ComputeClass(0.); + ComputeClass tpo = ComputeClass(0.); for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){ ComputeClass dx = tx - sourcesX[idxSource]; @@ -281,7 +281,7 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C ComputeClass dz = tz - sourcesZ[idxSource]; ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz); - const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance); + const ComputeClass inv_distance = inv_square_distance.sqrt(); inv_square_distance *= inv_distance; inv_square_distance *= tv * sourcesPhysicalValues[idxSource]; @@ -296,10 +296,10 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C tpo += inv_distance * sourcesPhysicalValues[idxSource]; } - targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx); - targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy); - targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz); - targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo); + targetsForcesX[idxTarget] += tfx.horizontalSum(); + targetsForcesY[idxTarget] += tfy.horizontalSum(); + targetsForcesZ[idxTarget] += tfz.horizontalSum(); + targetsPotentials[idxTarget] += tpo.horizontalSum(); } } } @@ -311,25 +311,25 @@ template <class FReal> struct FP2PRT{ }; -#if defined(SCALFMM_USE_AVX) +#include "InastempCompileConfig.h" template <> struct FP2PRT<double>{ template <class ContainerClass> static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], const int limiteNeighbors){ - FP2PR::GenericFullMutual<double, ContainerClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors); + FP2PR::GenericFullMutual<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors); } template <class ContainerClass> static void Inner(ContainerClass* const FRestrict inTargets){ - FP2PR::GenericInner<double, ContainerClass, __m256d, 4>(inTargets); + FP2PR::GenericInner<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets); } template <class ContainerClass> static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], const int limiteNeighbors){ - FP2PR::GenericFullRemote<double, ContainerClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors); + FP2PR::GenericFullRemote<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors); } }; @@ -338,143 +338,21 @@ struct FP2PRT<float>{ template <class ContainerClass> static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], const int limiteNeighbors){ - FP2PR::GenericFullMutual<float, ContainerClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors); + FP2PR::GenericFullMutual<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors); } template <class ContainerClass> static void Inner(ContainerClass* const FRestrict inTargets){ - FP2PR::GenericFullMutual<float, ContainerClass, __m256, 8>(inTargets); + FP2PR::GenericFullMutual<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets); } template <class ContainerClass> static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], const int limiteNeighbors){ - FP2PR::GenericFullRemote<float, ContainerClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors); - } -}; -#elif defined(SCALFMM_USE_AVX2) -template <> -struct FP2PRT<double>{ - template <class ContainerClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullMutual<double, ContainerClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors); - } - - template <class ContainerClass> - static void Inner(ContainerClass* const FRestrict inTargets){ - FP2PR::GenericInner<double, ContainerClass, __m512d, 8>(inTargets); - } - - template <class ContainerClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullRemote<double, ContainerClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors); - } -}; - -template <> -struct FP2PRT<float>{ - template <class ContainerClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullMutual<float, ContainerClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors); - } - - template <class ContainerClass> - static void Inner(ContainerClass* const FRestrict inTargets){ - FP2PR::GenericFullMutual<float, ContainerClass, __m512, 16>(inTargets); - } - - template <class ContainerClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullRemote<float, ContainerClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors); - } -}; - -#elif defined(SCALFMM_USE_SSE) -template <> -struct FP2PRT<double>{ - template <class ContainerClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullMutual<double, ContainerClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors); - } - - template <class ContainerClass> - static void Inner(ContainerClass* const FRestrict inTargets){ - FP2PR::GenericInner<double, ContainerClass, __m128d, 2>(inTargets); - } - - template <class ContainerClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullRemote<double, ContainerClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors); + FP2PR::GenericFullRemote<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors); } }; -template <> -struct FP2PRT<float>{ - template <class ContainerClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullMutual<float, ContainerClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors); - } - - template <class ContainerClass> - static void Inner(ContainerClass* const FRestrict inTargets){ - FP2PR::GenericInner<float, ContainerClass, __m128, 4>(inTargets); - } - - template <class ContainerClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullRemote<float, ContainerClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors); - } -}; - -#else -template <> -struct FP2PRT<double>{ - template <class ContainerClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullMutual<double, ContainerClass, double, 1>(inTargets, inNeighbors, limiteNeighbors); - } - - template <class ContainerClass> - static void Inner(ContainerClass* const FRestrict inTargets){ - FP2PR::GenericInner<double, ContainerClass, double, 1>(inTargets); - } - - template <class ContainerClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullRemote<double, ContainerClass, double, 1>(inTargets, inNeighbors, limiteNeighbors); - } -}; - -template <> -struct FP2PRT<float>{ - template <class ContainerClass> - static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullMutual<float, ContainerClass, float, 1>(inTargets, inNeighbors, limiteNeighbors); - } - - template <class ContainerClass> - static void Inner(ContainerClass* const FRestrict inTargets){ - FP2PR::GenericInner<float, ContainerClass, float, 1>(inTargets); - } - - template <class ContainerClass> - static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[], - const int limiteNeighbors){ - FP2PR::GenericFullRemote<float, ContainerClass, float, 1>(inTargets, inNeighbors, limiteNeighbors); - } -}; -#endif diff --git a/Src/ScalFmmConfig.h.cmake b/Src/ScalFmmConfig.h.cmake index ef3f6204e4c9c27c89cb394deb1e33d2b53810b3..71198ba5e448488e220e0a710d1a7dd97c013293 100644 --- a/Src/ScalFmmConfig.h.cmake +++ b/Src/ScalFmmConfig.h.cmake @@ -28,7 +28,6 @@ #cmakedefine SCALFMM_BLAS_ADD_ #cmakedefine SCALFMM_BLAS_UPCASE #cmakedefine SCALFMM_BLAS_NOCHANGE - //////////////////////////////////////////////////////// // FFT /////////////////////////////////////////////////////// @@ -68,20 +67,6 @@ #cmakedefine SCALFMM_USE_STARPU #cmakedefine SCALFMM_DISABLE_NATIVE_OMP4 -/////////////////////////////////////////////////////// -// SSE -/////////////////////////////////////////////////////// - -#cmakedefine SCALFMM_USE_SSE -#cmakedefine __AVXPE_INTEL_COMPILER - -/////////////////////////////////////////////////////// -// AVX -/////////////////////////////////////////////////////// - -#cmakedefine SCALFMM_USE_AVX -#cmakedefine __SSEPE_INTEL_COMPILER - /////////////////////////////////////////////////////// // EZTRACE /////////////////////////////////////////////////////// diff --git a/Src/Utils/FAvx.hpp b/Src/Utils/FAvx.hpp deleted file mode 100644 index 717ad2f8fadca99425aa419122a21bb98ff199ac..0000000000000000000000000000000000000000 --- a/Src/Utils/FAvx.hpp +++ /dev/null @@ -1,85 +0,0 @@ -// See LICENCE file at project root -#ifndef FAVX_HPP -#define FAVX_HPP - -#include "FGlobal.hpp" -#ifndef SCALFMM_USE_AVX -#error The AVX header is included while SCALFMM_USE_AVX is turned OFF -#else - - -#include <immintrin.h> - -#ifdef __AVXPE_INTEL_COMPILER - -//Side effect operators DOUBLE -inline __m256d& operator+=(__m256d & a, const __m256d & b){ - return (a = _mm256_add_pd (a,b)); -} - -inline __m256d& operator-=(__m256d& a, const __m256d& b){ - return (a = _mm256_sub_pd (a,b)); -} - -inline __m256d& operator*=(__m256d& a, const __m256d& b){ - return (a = _mm256_mul_pd (a,b)); -} - -inline __m256d& operator/=(__m256d& a, const __m256d& b){ - return (a = _mm256_div_pd (a,b)); -} - -//No side effect operators DOUBLE -inline __m256d operator+(const __m256d& a,const __m256d& b){ - return _mm256_add_pd (a,b); -} - -inline __m256d operator-(const __m256d& a, const __m256d& b){ - return _mm256_sub_pd (a,b); -} - -inline __m256d operator*(const __m256d& v1, const __m256d& v2){ - return _mm256_mul_pd(v1, v2); -} - -inline __m256d operator/(const __m256d& v1, const __m256d& v2){ - return _mm256_div_pd(v1, v2); -} - -//Side effect operators SINGLE -inline __m256& operator+=(__m256 & a, const __m256 & b){ - return (a = _mm256_add_ps (a,b)); -} - -inline __m256& operator-=(__m256& a, const __m256& b){ - return (a = _mm256_sub_ps (a,b)); -} - -inline __m256& operator*=(__m256& a, const __m256& b){ - return (a = _mm256_mul_ps (a,b)); -} - -inline __m256& operator/=(__m256& a, const __m256& b){ - return (a = _mm256_div_ps (a,b)); -} - -//No side effect operators SINGLE -inline __m256 operator+(const __m256& a,const __m256& b){ - return _mm256_add_ps (a,b); -} - -inline __m256 operator-(const __m256& a, const __m256& b){ - return _mm256_sub_ps (a,b); -} - -inline __m256 operator*(const __m256& v1, const __m256& v2){ - return _mm256_mul_ps(v1, v2); -} - -inline __m256 operator/(const __m256& v1, const __m256& v2){ - return _mm256_div_ps(v1, v2); -} - -#endif -#endif -#endif diff --git a/Src/Utils/FAvx2.hpp b/Src/Utils/FAvx2.hpp deleted file mode 100644 index 08226a205de9a9e9aae6e2c5725e7457c4aebb12..0000000000000000000000000000000000000000 --- a/Src/Utils/FAvx2.hpp +++ /dev/null @@ -1,85 +0,0 @@ -// See LICENCE file at project root -#ifndef FAVX2_HPP -#define FAVX2_HPP - -#include "FGlobal.hpp" -#ifndef SCALFMM_USE_AVX2 -#error The AVX header is included while SCALFMM_USE_AVX is turned OFF -#endif - -#include <immintrin.h> - -#ifdef __MIC__ - -//Side effect operators DOUBLE -inline __m512d& operator+=(__m512d & a, const __m512d & b){ - return (a = _mm512_add_pd (a,b)); -} - -inline __m512d& operator-=(__m512d& a, const __m512d& b){ - return (a = _mm512_sub_pd (a,b)); -} - -inline __m512d& operator*=(__m512d& a, const __m512d& b){ - return (a = _mm512_mul_pd (a,b)); -} - -inline __m512d& operator/=(__m512d& a, const __m512d& b){ - return (a = _mm512_div_pd (a,b)); -} - -//No side effect operators DOUBLE -inline __m512d operator+(const __m512d& a,const __m512d& b){ - return _mm512_add_pd (a,b); -} - -inline __m512d operator-(const __m512d& a, const __m512d& b){ - return _mm512_sub_pd (a,b); -} - -inline __m512d operator*(const __m512d& v1, const __m512d& v2){ - return _mm512_mul_pd(v1, v2); -} - -inline __m512d operator/(const __m512d& v1, const __m512d& v2){ - return _mm512_div_pd(v1, v2); -} - -//Side effect operators SINGLE -inline __m512& operator+=(__m512 & a, const __m512 & b){ - return (a = _mm512_add_ps (a,b)); -} - -inline __m512& operator-=(__m512& a, const __m512& b){ - return (a = _mm512_sub_ps (a,b)); -} - -inline __m512& operator*=(__m512& a, const __m512& b){ - return (a = _mm512_mul_ps (a,b)); -} - -inline __m512& operator/=(__m512& a, const __m512& b){ - return (a = _mm512_div_ps (a,b)); -} - -//No side effect operators SINGLE -inline __m512 operator+(const __m512& a,const __m512& b){ - return _mm512_add_ps (a,b); -} - -inline __m512 operator-(const __m512& a, const __m512& b){ - return _mm512_sub_ps (a,b); -} - -inline __m512 operator*(const __m512& v1, const __m512& v2){ - return _mm512_mul_ps(v1, v2); -} - -inline __m512 operator/(const __m512& v1, const __m512& v2){ - return _mm512_div_ps(v1, v2); -} - -#endif - -#endif - diff --git a/Src/Utils/FMath.hpp b/Src/Utils/FMath.hpp index d753ebcfcb3ea2664da3e0dd0add5dc03d5bacea..c2c8a0dcce37161f67f2f6b02515895035d5eb0c 100644 --- a/Src/Utils/FMath.hpp +++ b/Src/Utils/FMath.hpp @@ -8,14 +8,6 @@ #include "FGlobal.hpp" -#ifdef SCALFMM_USE_SSE -#include "FSse.hpp" -#endif - -#ifdef SCALFMM_USE_AVX -#include "FAvx.hpp" -#endif - /** * @author Berenger Bramas (berenger.bramas@inria.fr) @@ -40,36 +32,6 @@ struct FMath{ return (inV < 0 ? -inV : inV); } -#ifdef SCALFMM_USE_SSE - static __m128 Abs(const __m128 inV){ - return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), inV), inV); - } - - static __m128d Abs(const __m128d inV){ - return _mm_max_pd(_mm_sub_pd(_mm_setzero_pd(), inV), inV); - } -#endif -#ifdef SCALFMM_USE_AVX - static __m256 Abs(const __m256 inV){ - return _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), inV), inV); - } - - static __m256d Abs(const __m256d inV){ - return _mm256_max_pd(_mm256_sub_pd(_mm256_setzero_pd(), inV), inV); - } -#endif -#ifdef SCALFMM_USE_AVX2 -#ifdef __MIC__ - static __m512 Abs(const __m512 inV){ - return _mm512_max_ps(_mm512_sub_ps(_mm512_setzero_ps(), inV), inV); - } - - static __m512d Abs(const __m512d inV){ - return _mm512_max_pd(_mm512_sub_pd(_mm512_setzero_pd(), inV), inV); - } -#endif -#endif - /** To get max between 2 values */ template <class NumType> static NumType Max(const NumType inV1, const NumType inV2){ @@ -82,53 +44,6 @@ struct FMath{ return (inV1 < inV2 ? inV1 : inV2); } -#ifdef SCALFMM_USE_SSE - static __m128 Max(const __m128 inV1, const __m128 inV2){ - return _mm_max_ps(inV1, inV2); - } - static __m128 Min(const __m128 inV1, const __m128 inV2){ - return _mm_min_ps(inV1, inV2); - } - - static __m128d Max(const __m128d inV1, const __m128d inV2){ - return _mm_max_pd(inV1, inV2); - } - static __m128d Min(const __m128d inV1, const __m128d inV2){ - return _mm_min_pd(inV1, inV2); - } -#endif -#ifdef SCALFMM_USE_AVX - static __m256 Max(const __m256 inV1, const __m256 inV2){ - return _mm256_max_ps(inV1, inV2); - } - static __m256 Min(const __m256 inV1, const __m256 inV2){ - return _mm256_min_ps(inV1, inV2); - } - - static __m256d Max(const __m256d inV1, const __m256d inV2){ - return _mm256_max_pd(inV1, inV2); - } - static __m256d Min(const __m256d inV1, const __m256d inV2){ - return _mm256_min_pd(inV1, inV2); - } -#endif -#ifdef SCALFMM_USE_AVX2 -#ifdef __MIC__ - static __m512 Max(const __m512 inV1, const __m512 inV2){ - return _mm512_max_ps(inV1, inV2); - } - static __m512 Min(const __m512 inV1, const __m512 inV2){ - return _mm512_min_ps(inV1, inV2); - } - - static __m512d Max(const __m512d inV1, const __m512d inV2){ - return _mm512_max_pd(inV1, inV2); - } - static __m512d Min(const __m512d inV1, const __m512d inV2){ - return _mm512_min_pd(inV1, inV2); - } -#endif -#endif /** To know if 2 values seems to be equal */ template <class NumType> static bool LookEqual(const NumType inV1, const NumType inV2){ @@ -160,59 +75,6 @@ struct FMath{ return ceil(inValue); } -#if defined(SCALFMM_USE_SSE ) && defined(__SSSE4_1__) - static __m128 dfloor(const __m128 inV){ - return _mm_floor_ps(inV); - } - - static __m128d dfloor(const __m128d inV){ - return _mm_floor_pd(inV); - } - - static __m128 Ceil(const __m128 inV){ - return _mm_ceil_ps(inV); - } - - static __m128d Ceil(const __m128d inV){ - return _mm_ceil_pd(inV); - } -#endif -#ifdef SCALFMM_USE_AVX - static __m256 dfloor(const __m256 inV){ - return _mm256_floor_ps(inV); - } - - static __m256d dfloor(const __m256d inV){ - return _mm256_floor_pd(inV); - } - - static __m256 Ceil(const __m256 inV){ - return _mm256_ceil_ps(inV); - } - - static __m256d Ceil(const __m256d inV){ - return _mm256_ceil_pd(inV); - } -#endif -#ifdef SCALFMM_USE_AVX2 -#ifdef __MIC__ - static __m512 dfloor(const __m512 inV){ - return _mm512_floor_ps(inV); - } - - static __m512d dfloor(const __m512d inV){ - return _mm512_floor_pd(inV); - } - - static __m512 Ceil(const __m512 inV){ - return _mm512_ceil_ps(inV); - } - - static __m512d Ceil(const __m512d inV){ - return _mm512_ceil_pd(inV); - } -#endif -#endif /** To get pow */ static double pow(double x, double y){ return ::pow(x,y); @@ -263,39 +125,6 @@ struct FMath{ } -#if defined(SCALFMM_USE_SSE ) && defined(__SSSE4_1__) - static __m128 FMAdd(const __m128 inV1, const __m128 inV2, const __m128 inV3){ - return _mm_add_ps( _mm_mul_ps(inV1,inV2), inV3); - } - - static __m128d FMAdd(const __m128d inV1, const __m128d inV2, const __m128d inV3){ - return _mm_add_pd( _mm_mul_pd(inV1,inV2), inV3); - } - -#endif -#ifdef SCALFMM_USE_AVX - static __m256 FMAdd(const __m256 inV1, const __m256 inV2, const __m256 inV3){ - return _mm256_add_ps( _mm256_mul_ps(inV1,inV2), inV3); - } - - static __m256d FMAdd(const __m256d inV1, const __m256d inV2, const __m256d inV3){ - return _mm256_add_pd( _mm256_mul_pd(inV1,inV2), inV3); - } - -#endif -#ifdef SCALFMM_USE_AVX2 -#ifdef __MIC__ - static __m512 FMAdd(const __m512 inV1, const __m512 inV2, const __m512 inV3){ - //return _mm512_add_ps( _mm512_mul_ps(inV1,inV2), inV3); - return _mm512_fmadd_ps(inV1, inV2, inV3); - } - - static __m512d FMAdd(const __m512d inV1, const __m512d inV2, const __m512d inV3){ - //return _mm512_add_pd( _mm512_mul_pd(inV1,inV2), inV3); - return _mm512_fmadd_pd(inV1, inV2, inV3); - } -#endif -#endif /** To get sqrt of a FReal */ static float Sqrt(const float inValue){ return sqrtf(inValue); @@ -303,123 +132,25 @@ struct FMath{ static double Sqrt(const double inValue){ return sqrt(inValue); } - static float Rsqrt(const float inValue){ - return float(1.0)/sqrtf(inValue); - } - static double Rsqrt(const double inValue){ - return 1.0/sqrt(inValue); - } -#ifdef SCALFMM_USE_SSE - static __m128 Exp(const __m128 inV){ - float ptr[4]; - _mm_storeu_ps(ptr, inV); - for(int idx = 0 ; idx < 4 ; ++idx){ - ptr[idx] = std::exp(ptr[idx]); - } - return _mm_loadu_ps(ptr); - } - static __m128d Exp(const __m128d inV){ - double ptr[2]; - _mm_storeu_pd(ptr, inV); - for(int idx = 0 ; idx < 2 ; ++idx){ - ptr[idx] = std::exp(ptr[idx]); - } - return _mm_loadu_pd(ptr); - } - - static __m128 Sqrt(const __m128 inV){ - return _mm_sqrt_ps(inV); + template <class InaClass> + static InaClass Sqrt(const InaClass inValue){ + return inValue.sqrt(); } - static __m128d Sqrt(const __m128d inV){ - return _mm_sqrt_pd(inV); + template <class InaClass> + static InaClass Exp(const InaClass inValue){ + return inValue.exp(); } - static __m128 Rsqrt(const __m128 inV){ - return _mm_rsqrt_ps(inV); - } - - static __m128d Rsqrt(const __m128d inV){ - return _mm_set_pd1(1.0) / _mm_sqrt_pd(inV); - } -#endif -#ifdef SCALFMM_USE_AVX - static __m256 Exp(const __m256 inV){ - float ptr[8]; - _mm256_storeu_ps(ptr, inV); - for(int idx = 0 ; idx < 8 ; ++idx){ - ptr[idx] = std::exp(ptr[idx]); - } - return _mm256_loadu_ps(ptr); - } - - static __m256d Exp(const __m256d inV){ - double ptr[4]; - _mm256_storeu_pd(ptr, inV); - for(int idx = 0 ; idx < 4 ; ++idx){ - ptr[idx] = std::exp(ptr[idx]); - } - return _mm256_loadu_pd(ptr); - } - - static __m256 Sqrt(const __m256 inV){ - return _mm256_sqrt_ps(inV); - } - static __m256d Sqrt(const __m256d inV){ - return _mm256_sqrt_pd(inV); - } - - static __m256 Rsqrt(const __m256 inV){ - return _mm256_rsqrt_ps(inV); - } - - static __m256d Rsqrt(const __m256d inV){ - return _mm256_set1_pd(1.0) / _mm256_sqrt_pd(inV); - } -#endif -#ifdef SCALFMM_USE_AVX2 -#ifdef __MIC__ - static __m512 Exp(const __m512 inV){ - float ptr[16]; - _mm512_storeu_ps(ptr, inV); - for(int idx = 0 ; idx < 16 ; ++idx){ - ptr[idx] = std::exp(ptr[idx]); - } - return _mm512_loadu_ps(ptr); - } - - static __m512d Exp(const __m512d inV){ - double ptr[8]; - _mm512_storeu_pd(ptr, inV); - for(int idx = 0 ; idx < 8 ; ++idx){ - ptr[idx] = std::exp(ptr[idx]); - } - return _mm512_loadu_pd(ptr); - } - - static __m512d Exp(const __m512d inV){ - return _mm512_sqrt_pd(inV); - } - - static __m512 Sqrt(const __m512 inV){ - return _mm512_sqrt_ps(inV); - } - - static __m512d Sqrt(const __m512d inV){ - return _mm512_sqrt_pd(inV); + static float Rsqrt(const float inValue){ + return float(1.0)/sqrtf(inValue); } - - static __m512 Rsqrt(const __m512 inV){ - return _mm512_rsqrt_ps(inV); + static double Rsqrt(const double inValue){ + return 1.0/sqrt(inValue); } - static __m512d Rsqrt(const __m512d inV){ - return _mm512_set1_pd(1.0) / _mm512_sqrt_pd(inV); - } -#endif -#endif /** To get Log of a FReal */ static float Log(const float inValue){ return logf(inValue); @@ -503,15 +234,6 @@ struct FMath{ return std::isfinite(value); } - template <class NumType> - static NumType Zero(); - - template <class NumType> - static NumType One(); - - template <class DestType, class SrcType> - static DestType ConvertTo(const SrcType val); - /** A class to compute accuracy */ template <class FReal, class IndexType = FSize> @@ -531,10 +253,10 @@ struct FMath{ /** Add value to the current list */ void add(const FReal inGood, const FReal inBad){ - l2Diff += (inBad - inGood) * (inBad - inGood); - l2Dot += inGood * inGood; - max = Max(max , Abs(inGood)); - maxDiff = Max(maxDiff, Abs(inGood-inBad)); + l2Diff += (inBad - inGood) * (inBad - inGood); + l2Dot += inGood * inGood; + max = Max(max , Abs(inGood)); + maxDiff = Max(maxDiff, Abs(inGood-inBad)); nbElements += 1 ; } /** Add array of values */ @@ -609,213 +331,5 @@ struct FMath{ }; }; -template <> -inline float FMath::Zero<float>(){ - return float(0.0); -} - -template <> -inline double FMath::Zero<double>(){ - return double(0.0); -} - -template <> -inline float FMath::One<float>(){ - return float(1.0); -} - -template <> -inline double FMath::One<double>(){ - return double(1.0); -} - -template <> -inline float FMath::ConvertTo<float,float>(const float val){ - return val; -} - -template <> -inline double FMath::ConvertTo<double,double>(const double val){ - return val; -} - -template <> -inline float FMath::ConvertTo<float,const float*>(const float* val){ - return *val; -} - -template <> -inline double FMath::ConvertTo<double,const double*>(const double* val){ - return *val; -} - -#ifdef SCALFMM_USE_SSE -template <> -inline __m128 FMath::One<__m128>(){ - return _mm_set_ps1(1.0); -} - -template <> -inline __m128d FMath::One<__m128d>(){ - return _mm_set_pd1(1.0); -} - -template <> -inline __m128 FMath::Zero<__m128>(){ - return _mm_setzero_ps(); -} - -template <> -inline __m128d FMath::Zero<__m128d>(){ - return _mm_setzero_pd(); -} - -template <> -inline __m128 FMath::ConvertTo<__m128,float>(const float val){ - return _mm_set_ps1(val); -} - -template <> -inline __m128d FMath::ConvertTo<__m128d,double>(const double val){ - return _mm_set_pd1(val); -} - -template <> -inline __m128 FMath::ConvertTo<__m128,const float*>(const float* val){ - return _mm_load1_ps(val); -} - -template <> -inline __m128d FMath::ConvertTo<__m128d,const double*>(const double* val){ - return _mm_load1_pd(val); -} - -template <> -inline float FMath::ConvertTo<float,__m128>(const __m128 val){ - __attribute__((aligned(16))) float buffer[4]; - _mm_store_ps(buffer, val); - return buffer[0] + buffer[1] + buffer[2] + buffer[3]; -} - -template <> -inline double FMath::ConvertTo<double,__m128d>(const __m128d val){ - __attribute__((aligned(16))) double buffer[2]; - _mm_store_pd(buffer, val); - return buffer[0] + buffer[1]; -} -#endif - -#ifdef SCALFMM_USE_AVX -template <> -inline __m256 FMath::One<__m256>(){ - return _mm256_set1_ps(1.0); -} - -template <> -inline __m256d FMath::One<__m256d>(){ - return _mm256_set1_pd(1.0); -} - -template <> -inline __m256 FMath::Zero<__m256>(){ - return _mm256_setzero_ps(); -} - -template <> -inline __m256d FMath::Zero<__m256d>(){ - return _mm256_setzero_pd(); -} - -template <> -inline __m256 FMath::ConvertTo<__m256,float>(const float val){ - return _mm256_set1_ps(val); -} - -template <> -inline __m256d FMath::ConvertTo<__m256d,double>(const double val){ - return _mm256_set1_pd(val); -} - -template <> -inline __m256 FMath::ConvertTo<__m256,const float*>(const float* val){ - return _mm256_broadcast_ss(val); -} - -template <> -inline __m256d FMath::ConvertTo<__m256d,const double*>(const double* val){ - return _mm256_broadcast_sd(val); -} - -template <> -inline float FMath::ConvertTo<float,__m256>(const __m256 val){ - __attribute__((aligned(32))) float buffer[8]; - _mm256_store_ps(buffer, val); - return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7]; -} - -template <> -inline double FMath::ConvertTo<double,__m256d>(const __m256d val){ - __attribute__((aligned(32))) double buffer[4]; - _mm256_store_pd(buffer, val); - return buffer[0] + buffer[1] + buffer[2] + buffer[3]; -} -#endif -#ifdef SCALFMM_USE_AVX2 -#ifdef __MIC__ -template <> -inline __m512 FMath::One<__m512>(){ - return _mm512_set1_ps(1.0); -} - -template <> -inline __m512d FMath::One<__m512d>(){ - return _mm512_set1_pd(1.0); -} - -template <> -inline __m512 FMath::Zero<__m512>(){ - return _mm512_setzero_ps(); -} - -template <> -inline __m512d FMath::Zero<__m512d>(){ - return _mm512_setzero_pd(); -} - -template <> -inline __m512 FMath::ConvertTo<__m512,__attribute__((aligned(64))) float>(const float val){ - return _mm512_set1_ps(val); -} - -template <> -inline __m512d FMath::ConvertTo<__m512d,__attribute__((aligned(64))) double>(const double val){ - return _mm512_set1_pd(val); -} - -template <> -inline __m512 FMath::ConvertTo<__m512,const __attribute__((aligned(64))) float*>(const float* val){ - return _mm512_set1_ps(val[0]); -} - -template <> -inline __m512d FMath::ConvertTo<__m512d,const __attribute__((aligned(64))) double*>(const double* val){ - return _mm512_set1_pd(val[0]); -} - -template <> -inline float FMath::ConvertTo<float,__m512>(const __m512 val){ - __attribute__((aligned(64))) float buffer[16]; - _mm512_store_ps(buffer, val); - return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7] + buffer[8] + buffer[9] + buffer[10] + buffer[11] + buffer[12] + buffer[13] + buffer[14] + buffer[15]; -} - -template <> -inline double FMath::ConvertTo<double,__m512d>(const __m512d val){ - __attribute__((aligned(64))) double buffer[8]; - _mm512_store_pd(buffer, val); - return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7]; -} -#endif -#endif #endif //FMATH_HPP diff --git a/Src/Utils/FSse.hpp b/Src/Utils/FSse.hpp deleted file mode 100644 index ad818a515ca6aa95ef17451b018ca6fbcce6c00b..0000000000000000000000000000000000000000 --- a/Src/Utils/FSse.hpp +++ /dev/null @@ -1,96 +0,0 @@ -// See LICENCE file at project root -#ifndef FSSE_HPP -#define FSSE_HPP - -#include "FGlobal.hpp" -#ifndef SCALFMM_USE_SSE -#error The SSE header is included while SCALFMM_USE_SSE is turned OFF -#endif - -#include <xmmintrin.h> // SSE -#include <emmintrin.h> //SSE2 -#include <pmmintrin.h> //SSE3 -#ifdef __SSSE3__ -#include <tmmintrin.h> //SSSE3 -#endif -#ifdef __SSSE4_1__ -#include <smmintrin.h> // SSE4 -#endif - - -#ifndef _mm_set_pd1 -// Looks like clang's emmintrin.h doesn't have this alternate name. -// But _mm_set1_pd is an equivalent to _mm_set_pd1. -#define _mm_set_pd1 _mm_set1_pd -#endif - - -#ifdef __SSEPE_INTEL_COMPILER - -inline __m128d& operator+=(__m128d& v1, const __m128d& v2){ - return (v1 = _mm_add_pd(v1, v2)); -} - -inline __m128d& operator-=(__m128d& v1, const __m128d& v2){ - return (v1 = _mm_sub_pd(v1, v2)); -} - -inline __m128d& operator*=(__m128d& v1, const __m128d& v2){ - return (v1 = _mm_mul_pd(v1, v2)); -} - -inline __m128d& operator/=(__m128d& v1, const __m128d& v2){ - return (v1 = _mm_div_pd(v1, v2)); -} - -inline __m128d operator+(const __m128d& v1, const __m128d& v2){ - return _mm_add_pd(v1, v2); -} - -inline __m128d operator-(const __m128d& v1, const __m128d& v2){ - return _mm_sub_pd(v1, v2); -} - -inline __m128d operator*(const __m128d& v1, const __m128d& v2){ - return _mm_mul_pd(v1, v2); -} - -inline __m128d operator/(const __m128d& v1, const __m128d& v2){ - return _mm_div_pd(v1, v2); -} - -inline __m128& operator+=(__m128& v1, const __m128& v2){ - return (v1 = _mm_add_ps(v1, v2)); -} - -inline __m128& operator-=(__m128& v1, const __m128& v2){ - return (v1 = _mm_sub_ps(v1, v2)); -} - -inline __m128& operator*=(__m128& v1, const __m128& v2){ - return (v1 = _mm_mul_ps(v1, v2)); -} - -inline __m128& operator/=(__m128& v1, const __m128& v2){ - return (v1 = _mm_div_ps(v1, v2)); -} - -inline __m128 operator+(const __m128& v1, const __m128& v2){ - return _mm_add_ps(v1, v2); -} - -inline __m128 operator-(const __m128& v1, const __m128& v2){ - return _mm_sub_ps(v1, v2); -} - -inline __m128 operator*(const __m128& v1, const __m128& v2){ - return _mm_mul_ps(v1, v2); -} - -inline __m128 operator/(const __m128& v1, const __m128& v2){ - return _mm_div_ps(v1, v2); -} - -#endif - -#endif // FSSE_HPP diff --git a/inastemp b/inastemp new file mode 160000 index 0000000000000000000000000000000000000000..5f1c2dd418c99c2eaa9299633929bbc1ac75001e --- /dev/null +++ b/inastemp @@ -0,0 +1 @@ +Subproject commit 5f1c2dd418c99c2eaa9299633929bbc1ac75001e