Use inastemp

f0b0460d · Berenger Bramas · 2c5546a4 · f0b0460d · f0b0460d · 2c5546a4
Commit f0b0460d authored 7 years ago by Berenger Bramas
--- a/.gitmodules
+++ b/.gitmodules
 [submodule "CMakeModules/morse_cmake"]
 	path = CMakeModules/morse_cmake
 	url = https://gitlab.inria.fr/solverstack/morse_cmake.git
+[submodule "inastemp"]
+	path = inastemp
+	url = https://gitlab.mpcdf.mpg.de/bbramas/inastemp.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,10 +22,6 @@ SET(SCALFMM_CMAKE_MODULE_PATH  ${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/)
 #
 # Adds the CMAKE_DEPENDENT_OPTION command
 INCLUDE(CMakeDependentOption)
-#  Add to check CPU info
-include(GetCpuInfos)
-GetCpuInfos()
-#
 #===========================================================================
 # Version Number
 #===========================================================================
@@ -89,18 +85,8 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_
  option( SCALFMM_USE_EZTRACE          "Set to ON to compile with eztrace framwork" OFF )
  option( SCALFMM_USE_STARPU           "Set to ON to build SCALFMM with StarPU"     OFF )
  option( SCALFMM_BUILD_UTILS          "Set to ON to build utils Tests"             OFF )
-  #
-  #  VECTORISATION
-  #
-  if( APPLE ) # to fix problem with  GCC and avx
-    CMAKE_DEPENDENT_OPTION( SCALFMM_USE_SSE              "Set to ON to compile with SSE support (and use intrinsec SSE P2P)" ON "CPUOPTION_SSE3;NOT CPUOPTION_AVX2" OFF  )
-    CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX              "Set to ON to compile with AVX support (and use intrinsec AVX P2P)" OFF "CPUOPTION_AVX; NOT CPUOPTION_AVX2" OFF  )
-  else(APPLE)
-    CMAKE_DEPENDENT_OPTION( SCALFMM_USE_SSE              "Set to ON to compile with SSE support (and use intrinsec SSE P2P)" ON "CPUOPTION_SSE3;NOT CPUOPTION_AVX;NOT CPUOPTION_AVX2" OFF  )
-    CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX              "Set to ON to compile with AVX support (and use intrinsec AVX P2P)" ON "CPUOPTION_AVX; NOT CPUOPTION_AVX2" OFF  )
-  endif(APPLE)
-  CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX2             "Set to ON to compile with AVX support (and use intrinsec AVX2 P2P)" ON "CPUOPTION_AVX2" OFF )
-  
+
+
  if( SCALFMM_ONLY_DEVEL )
    if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
      option( SCALFMM_USE_OMP4 "Set to ON to disable the gcc/intel omp4"    OFF )
@@ -160,63 +146,21 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_
    set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -m64")
  endif()
  ##############################################################################
-  #                           Compile options                                  #
+  #                           Inastemp                                         #
  ##############################################################################
-  #  -xHost -mfpmath=sse
-  # -Wall Wnosign-conversion
-  #

-  # Set a fixed template depth
-  # Compilers don't use the same default for template-depth, we can enforce the same one everywhere.
-  # The magic number comes from GCC's default: https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html#C_002b_002b-Dialect-Options
-  set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -ftemplate-depth=900")
-  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-    # INTEL
-    IF (APPLE)
-      IF( CPUOPTION_SSE42 )
-        set(SSE_FLAGS  "-msse4  -mfpmath=sse")   # -mtune=native -march=native
-      ELSEIF (CPUOPTION_SSE3)
-        set(SSE_FLAGS  "-msse3  -mfpmath=sse")   # -mtune=native -march=native
-      ENDIF (CPUOPTION_SSE42)
-    else(APPLE)
-      set(AVX_FLAGS  "-fp-model source -march=native -axCORE-AVX2,CORE-AVX-I,AVX") #-mavx
-      set(AVX2_FLAGS "-march=native  -axCORE-AVX2,CORE-AVX-I,AVX") #-march=core-avx2
-      set(SSE_FLAGS  "-axSSE4.2  -march=native")
-    endif(APPLE)
-    set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -fma -align -finline-functions")
-    #-Wshadow -Wpointer-arith -Wcast-qual -Wconversion  -Wall -Wnosign-conversion ")
-  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL")
-    set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -mcpu=power8 -mtune=power8")
-  else() #if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    # NOT      INTEL
-    if(NOT SCALFMM_USE_MPI)
-      include(CheckCCompilerFlag)
-      check_c_compiler_flag(-Wzero-as-null-pointer-constant HAS_WZERO_NULL_PTR_FLAG)
-      if(HAS_WZERO_NULL_PTR_FLAG)
-        set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -Wzero-as-null-pointer-constant")
-      endif()
-    else()
-      include(CheckCCompilerFlag)
-      check_c_compiler_flag(-Wno-literal-suffix HAS_NO_LITERAL_SUFFIX_FLAG)
-      if(HAS_NO_LITERAL_SUFFIX_FLAG)
-        set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -Wno-literal-suffix")
-      endif()
-    endif()
-    IF (APPLE)
-      #      set(SSE_FLAGS  "-msse4  -mfpmath=sse")   # -mtune=native -march=native
-      IF( CPUOPTION_SSE42 )
-        set(SSE_FLAGS  "-msse4  -mfpmath=sse")   # -mtune=native -march=native
-      ELSEIF (CPUOPTION_SSE3)
-        set(SSE_FLAGS  "-msse3  -mfpmath=sse")   # -mtune=native -march=native
-      ENDIF (CPUOPTION_SSE42)
-      set(AVX_FLAGS "-mtune=native -march=avx")
-      set(AVX2_FLAGS "-mtune=native -march=native -mmic")
-    else(APPLE)
-      set(SSE_FLAGS  "-mtune=native -march=native")
-      set(AVX_FLAGS "-mtune=native -march=native")
-      set(AVX2_FLAGS "-mtune=native -march=native -mmic")
-    endif(APPLE)
-  endif()
+  # Vectorization support comes from the inastemp git submodule (see .gitmodules).
+  # Fail early with an actionable message when the submodule has not been
+  # fetched, e.g. after a non-recursive clone.
+  if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/inastemp/CMakeLists.txt")
+    message(FATAL_ERROR "inastemp submodule is missing (${CMAKE_CURRENT_SOURCE_DIR}/inastemp/CMakeLists.txt not found); run: git submodule update --init --recursive")
+  endif()
+  # Read by inastemp's own CMakeLists; presumably limits it to the library
+  # part only (no tests/examples) -- TODO confirm against inastemp.
+  set(INASTEMP_JUST_LIB TRUE)
+  # Configure inastemp as part of this build
+  add_subdirectory(inastemp)
+  # Expose the inastemp headers using the variables filled by inastemp
+  include_directories(
+           ${INASTEMP_BINARY_DIR}/Src
+           ${INASTEMP_SOURCE_DIR}/Src
+           ${INASTEMP_INCLUDE_DIR}
+           ${CMAKE_CURRENT_BINARY_DIR}/inastemp/Src
+      )
+  # Append the compiler flags reported by inastemp so that code using it compiles
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${INASTEMP_CXX_FLAGS}")

  ##############################################################################
  #                           FUSE list                                        #
@@ -637,129 +581,13 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_
  list(APPEND FUSE_LIST "STARPU")

  ##################################################################
-  #                         Use SSE                                #
+  #                         FUSE                                   #
  ##################################################################

-  message( STATUS "SCALFMM_USE_SSE              = ${SCALFMM_USE_SSE}" )
-  if( SCALFMM_USE_SSE )
-    if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp)
-      message(FATAL_ERROR "The CompileTestSseFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp)" )
-    endif()
-    message( STATUS "SSE_FLAGS ${SSE_FLAGS}  -- ${CMAKE_CXX_FLAGS}  ")
-    try_compile(COMPILE_SSE  ${CMAKE_CURRENT_BINARY_DIR}
-      ${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp
-      COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${SSE_FLAGS}"
-      OUTPUT_VARIABLE COMPILE_SSE_OUTPUT)
-
-    if(${COMPILE_SSE})
-      set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} ${SSE_FLAGS}")
-
-      try_compile(COMPILE_RESULT_VAR ${CMAKE_CURRENT_BINARY_DIR}
-        ${SCALFMM_CMAKE_MODULE_PATH}/checkSSEpe.cpp
-        COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${SSE_FLAGS}")
-      if( NOT ${COMPILE_RESULT_VAR})
-        set(__SSEPE_INTEL_COMPILER ON)
-      endif()
-      #set(SCALFMM_USE_AVX OFF)
-    else(${COMPILE_SSE})
-      message(FATAL_ERROR "SSE NOT SUPPORTED ; Set SCALFMM_USE_SSE  to OFF \n Output from test is : ${COMPILE_SSE_OUTPUT}")
-    endif(${COMPILE_SSE})
-  endif()
  list(APPEND FUSE_LIST "SSE")
-
-  ##################################################################
-  #                           Use AVX                              #
-  ##################################################################
-
-  message(STATUS "SCALFMM_USE_AVX               = ${SCALFMM_USE_AVX}")
-  if(SCALFMM_USE_AVX)
-    if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)
-
-      message(WARNING "SCALFMM_CMAKE_MODULE_PATH ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp" )
-      message(FATAL_ERROR "The CompileTestAvxFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)" )
-    endif()
-
-    try_compile(COMPILE_AVX ${CMAKE_CURRENT_BINARY_DIR}
-      ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp
-      COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}"
-      OUTPUT_VARIABLE COMPILE_AVX_OUTPUT)
-    if(${COMPILE_AVX})
-      message(STATUS "%%%%%%%%%%%% COMPILE_AVX               = ${COMPILE_AVX}  %%%%<    ${AVX_FLAGS}")
-
-      set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS}   ${AVX_FLAGS}")
-      message(STATUS "%%%%%%%%%%%% SCALFMM_CXX_FLAGS               = ${SCALFMM_CXX_FLAGS}")
-      #set( SCALFMM_USE_SSE   OFF   FORCE) # ne marche pas
-      try_compile(COMPILE_RESULT_AVSPE ${CMAKE_CURRENT_BINARY_DIR}
-        ${SCALFMM_CMAKE_MODULE_PATH}/checkAVXpe.cpp
-        COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}")
-      if( NOT ${COMPILE_RESULT_AVSPE})
-
-
-        set(__AVXPE_INTEL_COMPILER ON)
-      endif()
-
-      message(STATUS ${CMAKE_CXX_FLAGS} )
-    else(${COMPILE_AVX})
-      message(FATAL_ERROR "AVX NOT SUPPORTED ; Set SCALFMM_USE_AVX  to OFF \n Output from test is : ${COMPILE_AVX_OUTPUT} ")
-    endif(${COMPILE_AVX})
-  endif(SCALFMM_USE_AVX)
  list(APPEND FUSE_LIST "AVX")
-  #
-  # Error if both SCALFMM_USE_AVX AND SCALFMM_USE_SSE are set
-  #
-  if( SCALFMM_USE_AVX AND SCALFMM_USE_SSE)
-    message(FATAL_ERROR "Check SCALFMM_USE_SSE or SCALFMM_USE_AVX BUT NOT BOTH. ")
-  endif(SCALFMM_USE_AVX AND SCALFMM_USE_SSE)
-  ##################################################################
-  #                           Use AVX2                             #
-  ##################################################################
+  list(APPEND FUSE_LIST "MIC")

-  message(STATUS "SCALFMM_USE_AVX2               = ${SCALFMM_USE_AVX2}")
-  if(SCALFMM_USE_AVX2)
-    if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx2.cpp)
-      message(FATAL_ERROR "The CompileTestSseFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)" )
-    endif()
-
-    try_compile(COMPILE_AVX2 ${CMAKE_CURRENT_BINARY_DIR}
-      ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx2.cpp
-      COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX2_FLAGS}"
-      OUTPUT_VARIABLE COMPILE_AVX2_OUTPUT)
-    if(${COMPILE_AVX2})
-      set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS}   ${AVX2_FLAGS}")
-      #set( SCALFMM_USE_SSE   OFF   FORCE) # ne marche pas
-      try_compile(COMPILE_RESULT_AVSPE ${CMAKE_CURRENT_BINARY_DIR}
-        ${SCALFMM_CMAKE_MODULE_PATH}/checkAVX2pe.cpp
-        COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX2_FLAGS}")
-      if( NOT ${COMPILE_RESULT_AVSPE})
-        set(__AVX2PE_INTEL_COMPILER ON)
-      endif()
-
-      message(STATUS ${CMAKE_CXX_FLAGS} )
-    else(${COMPILE_AVX2})
-      message(FATAL_ERROR "AVX2 NOT SUPPORTED ; Set SCALFMM_USE_AVX2  to OFF \n Output from test is : ${COMPILE_AVX_OUTPUT} ")
-    endif(${COMPILE_AVX2})
-  endif(SCALFMM_USE_AVX2)
-  list(APPEND FUSE_LIST "AVX2")
-  #
-  # Error if both SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE are set
-  #
-  if( SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE)
-    message(FATAL_ERROR "Check SCALFMM_USE_SSE or SCALFMM_USE_AVX2 BUT NOT BOTH. ")
-  endif(SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE)
-  ##################################################################
-  #                     Use  native MIC compilation                #
-  ##################################################################
-#  If( SCALFMM_USE_MIC_NATIVE )
-#    include(CheckCCompilerFlag)
-#    check_c_compiler_flag(-mmic HAS_MMIC_FLAG)
-#    if(NOT HAS_MMIC_FLAG)
-#      message(FATAL_ERROR "MIC NOT SUPPORTED ; Set SCALFMM_USE_MIC_NATIVE to OFF")
-#    endif()
-#    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmic")
-#  else()
-#    #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xhost")
-#  endif()
-#  list(APPEND FUSE_LIST "MIC")
  ##################################################################
  #
  #   Set EZTRACE

--- a/CMakeModules/GetCpuInfos.cmake
+++ b/CMakeModules/GetCpuInfos.cmake
-###########################################################################################
-# Berenger Bramas Inria
-# This goes with the getCpuInfos.cpp
-# This will create one CMAKE value per output option from the cpp file.
-# For example the output of the CPP file can be:
-# SSE3=TRUE;AVX=FALSE
-# Then it will create:
-# CPUOPTION_SSE3 = TRUE
-# CPUOPTION_AVX = FALSE
-#
-# The binary should return 0 on success.
-###########################################################################################
-macro(GetCpuInfos)
-# The original CPP file
-set(GetCpuInfosFile "${PROJECT_SOURCE_DIR}/CMakeModules/getCpuInfos.cpp")
-
-# Fatal error if the file does not exist
-if(NOT EXISTS ${GetCpuInfosFile})
-	message(FATAL_ERROR "The GetCpuInfosFile does not exist (${GetCpuInfosFile})")
-endif()
-
-# Compile and execute the file
-try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR
-          ${CMAKE_BINARY_DIR} ${GetCpuInfosFile}  # [CMAKE_FLAGS <Flags>] [COMPILE_DEFINITIONS <flags>]
-          COMPILE_OUTPUT_VARIABLE comp
-          RUN_OUTPUT_VARIABLE run)
-
-# If it has successfuly compiled an run
-if(COMPILE_RESULT_VAR AND (RUN_RESULT_VAR EQUAL 0) )
-	set( CPU_OPTIONS ${run} )
-	# For each value
-	foreach(optionNode ${run})
-		# Get name and value
-		string(REPLACE "=" ";" optionNameAndValue ${optionNode})
-		list(LENGTH optionNameAndValue optionLength)
-		# If we get both
-		if(optionLength EQUAL 2)
-			list(GET optionNameAndValue 0 optionName)
-			list(GET optionNameAndValue 1 optionValue)
-			# create cmake variable
-			set(CPUOPTION_${optionName} ${optionValue})
-		else()
-			message(WARNING "GetCpuInfosFile wrong format for ${optionNode}.")
-		endif()
-	endforeach()
-	# output the sentence from the binrary
-	message(STATUS "CPUOPTION : ${CPU_OPTIONS}")
-else()
-	message(WARNING "GetCpuInfosFile did not return correctly.")
-endif()
-
-endmacro(GetCpuInfos)
--- a/CMakeModules/checkAVX2pe.cpp
+++ b/CMakeModules/checkAVX2pe.cpp
-
-#include "immintrin.h"
-
-
-int main() {
-#ifdef __MIC__
-	__m512 tx, ty ;
-	tx += ty ;
-#endif
-  return 0;
-}
-
--- a/CMakeModules/checkAVXpe.cpp
+++ b/CMakeModules/checkAVXpe.cpp
-
-#include "immintrin.h"
-
-
-int main() {
-	__m256d tx, ty ;
-	tx += ty ;
-  return 0;
-}
--- a/CMakeModules/checkSSEpe.cpp
+++ b/CMakeModules/checkSSEpe.cpp
-#include <xmmintrin.h>  // SSE
-#include <emmintrin.h>  //SSE2
-#include <pmmintrin.h> //SSE3
-#ifdef __SSSE3__
-#include <tmmintrin.h>  //SSSE3
-#endif
-#ifdef __SSSE4_1__
-#include <smmintrin.h> // SSE4
-#endif
-
-int main() {
-	__m128d tx, ty ;
-	tx += ty ;
-  return 0;
-}
--- a/CMakeModules/compileTestAvx.cpp
+++ b/CMakeModules/compileTestAvx.cpp
-
-#include <x86intrin.h>
-#include <xmmintrin.h> // SSE
-#include <emmintrin.h> // SSE2
-#include <pmmintrin.h> // SSE3
-#include <tmmintrin.h> // SSSE3
-#include <smmintrin.h> // SSE4
-
-#include <immintrin.h> // AVX
-
-int main(){
-	{
-		__m256d res0d, res1d;
-		res0d = _mm256_hadd_pd(res0d, res1d);
-
-		__m256 res0, res1;
-		res0 = _mm256_hadd_ps(res0, res1);
-	}
-	{
-		__m128d res0d, res1d;
-		res0d = _mm_hadd_pd(res0d, res1d);
-
-		__m128 res0, res1;
-		res0 = _mm_hadd_ps(res0, res1);
-	}
-	return 0;
-}
--- a/CMakeModules/compileTestAvx2.cpp
+++ b/CMakeModules/compileTestAvx2.cpp
-
-#include <x86intrin.h>
-#include <xmmintrin.h> // SSE
-#include <emmintrin.h> // SSE2
-#include <pmmintrin.h> // SSE3
-#include <tmmintrin.h> // SSSE3
-#include <smmintrin.h> // SSE4
-
-#include <immintrin.h> // AVX
-
-int main(){
-	{
-	#ifdef __MIC__
-		__m512d res0d, res1d;
-		res0d = _mm512_hadd_pd(res0d, res1d);
-
-		__m512 res0, res1;
-		res0 = _mm512_hadd_ps(res0, res1);
-	#endif
-	}
-	{
-		__m256d res0d, res1d;
-		res0d = _mm256_hadd_pd(res0d, res1d);
-
-		__m256 res0, res1;
-		res0 = _mm256_hadd_ps(res0, res1);
-	}
-	{
-		__m128d res0d, res1d;
-		res0d = _mm_hadd_pd(res0d, res1d);
-
-		__m128 res0, res1;
-		res0 = _mm_hadd_ps(res0, res1);
-	}
-	return 0;
-}
-
--- a/CMakeModules/compileTestIntel.cpp
+++ b/CMakeModules/compileTestIntel.cpp
-int main(){
-
-    int i ;
-#ifdef __INTEL_COMPILER
-
-     i = 0;
-
-#else
-
-#error 'Not Intel Compiler "
-
-#endif
-}
--- a/CMakeModules/compileTestSse.cpp
+++ b/CMakeModules/compileTestSse.cpp
-
-#include <x86intrin.h>
-#include <xmmintrin.h> // SSE
-#include <emmintrin.h> // SSE2
-#include <pmmintrin.h> // SSE3
-#ifdef __SSSE3__
-#include <tmmintrin.h>  //SSSE3
-#endif
-#ifdef __SSSE4_1__
-#include <smmintrin.h> // SSE4
-#endif
-int main(){
-	__m128d res0d, res1d;
-	res0d = _mm_hadd_pd(res0d, res1d);
-
-	__m128 res0, res1;
-	res0 = _mm_hadd_ps(res0, res1);
-
-	return 0;
-}
--- a/CMakeModules/getCpuInfos.cpp
+++ b/CMakeModules/getCpuInfos.cpp
-///////////////////////////////////////////////////////////////////////////
-// Berenger Bramas INRIA - 2014
-// Code provided under GNU Lesser General Public License
-//
-//
-// This file ask the cpuid to get access to CPU properties.
-// The file contains 3 mains parts:
-// × First part is a wrapper in case we are on Windows or Linux
-// × Second part is several call to the function and fill of an list
-// × Third is out of scope, it prints the state and the properties
-//   in a strict format in order to post process
-///////////////////////////////////////////////////////////////////////////
-
-
-
-///////////////////////////////////////////////////////////////////////////
-// Part 1:
-// Defines cpuid:
-// × A Wrapper if we are on windows
-// × A call to assembly else
-///////////////////////////////////////////////////////////////////////////
-
-enum RegistersNum {
-    EaxRegister = 0,
-    EbxRegister,
-    EcxRegister,
-    EdxRegister
-};
-
-#ifdef _WIN32
-
-// On windows __cpuid exists: http://msdn.microsoft.com/en-us/library/hskdteyh(v=vs.90).aspx
-// void __cpuid(int CPUInfo[4],int InfoType);
-// we would like to have the same name for not windows
-#define cpuid    __cpuid
-
-#elif  _ARCH_PPC
-#error("PPC") 
-#else
-
-// Else we have to ask the CPU directly by executin cpuid.
-// eax should contains the information querry argument.
-// Then we have to take the results from the different registers.
-//
-//    From : http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
-//
-//    asm ( assembler template
-//        : output operands                  // optional
-//        : input operands                   // optional
-//        : list of clobbered registers      // optional
-//        );
-//
-//    +---+--------------------+
-//    | r |    Register(s)     |
-//    +---+--------------------+
-//    | a |   %eax, %ax, %al   |
-//    | b |   %ebx, %bx, %bl   |
-//    | c |   %ecx, %cx, %cl   |
-//    | d |   %edx, %dx, %dl   |
-//    | S |   %esi, %si        |
-//    | D |   %edi, %di        |
-//    +---+--------------------+
-//
-
-
-//  GCC Inline Assembly but with the same prototype as windows
-void cpuid(int CPUInfo[4],int InfoType){
-    __asm__ __volatile__ (
-        "cpuid":            // Execute this instruction
-        "=a" (CPUInfo[EaxRegister]),  // Store eax in 0
-        "=b" (CPUInfo[EbxRegister]),  // Store ebx in 1
-        "=c" (CPUInfo[EcxRegister]),  // Store ecx in 2
-        "=d" (CPUInfo[EdxRegister]) : // Store edx in 3
-        "a" (InfoType)      // Input InfoType in eax before instruction
-    );
-}
-
-#endif
-
-#ifndef  _ARCH_PPC
-bool CPUInfoGetEAX(const int CPUInfo[4], const int position){
-    return (CPUInfo[EaxRegister] & ((int)1 << position)) != 0;
-}
-
-bool CPUInfoGetEBX(const int CPUInfo[4], const int position){
-    return (CPUInfo[EbxRegister] & ((int)1 << position)) != 0;
-}
-
-bool CPUInfoGetECX(const int CPUInfo[4], const int position){
-    return (CPUInfo[EcxRegister] & ((int)1 << position)) != 0;
-}
-
-bool CPUInfoGetEDX(const int CPUInfo[4], const int position){
-    return (CPUInfo[EdxRegister] & ((int)1 << position)) != 0;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Part 2:
-// Call the cpuid function and ask for particular information.
-// In our case we want to use these information to print it (and later use it
-// in a CMake file).
-// So you can change this file to get more informations and do something else with them.
-//
-//    From 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
-//    Or recommanded : AMD CPUID_Specification.pdf
-//    We know in part CPUID—CPU Identification that a call to the cpuid instruction fill the registers
-//    with the cpu property.
-///////////////////////////////////////////////////////////////////////////
-
-#include <string>
-#include <list>
-
-struct CpuProperty {
-    CpuProperty(const char inName[], const bool IsEnable)
-        : name(inName), enabled(IsEnable){
-    }
-
-    std::string name;
-    bool enabled;
-};
-
-std::list<CpuProperty> getProperties(){
-    std::list<CpuProperty> properties;
-
-    // To store the registers value
-	int info[4];
-
-    // Basic CPUID Information
-    cpuid(info, 0);
-    // The largest CPUID standard-function input value supported by the processor implementation.
-    const int limitStandardFunction = info[EaxRegister];
-
-    // Extended Function CPUID Information
-    cpuid(info, 0x80000000);
-    // The largest CPUID extended-function input value supported by the processor implementation
-    int limitExtendedFunction = info[EaxRegister];
-
-	//  Detect Instruction Set
-    if (limitStandardFunction >= 1){
-        cpuid(info,0x00000001); // Basic CPUID Information
-        /*
-        0x00000001 - EDX :
-            31:29 Reserved.
-            28 HTT: Hyper-Threading Technology. Indicates either that there is more than one thread per CPU core
-              or more than one CPU core per processor. AMD currently does not support more than one thread per
-             CPU core. See “Legacy Method” on page 23.
-            27 Reserved.
-            26 SSE2: SSE2 extensions. See Appendix D “CPUID Feature Sets” in APM3.
-            25 SSE: SSE extensions. See Appendix D “CPUID Feature Sets” in APM3 appendix and “64-Bit Media
-                    Programming” in APM1.
-            24 FXSR: FXSAVE and FXRSTOR instructions. See “FXSAVE” and “FXRSTOR” in APM4.
-            23 MMX: MMXTM instructions. See Appendix D “CPUID Feature Sets” in APM3 and “128-Bit Media
-                      and Scientific Programming” in APM1.
-            22:20 Reserved.
-            19 18 Reserved.
-            17 PSE36: Page-size extensions. The PDE[20:13] supplies physical address [39:32]. See “Page Translation and Protection” in APM2.
-            16 PAT: Page attribute table. PCD, PWT, and PATi are used to alter memory type. See “Page-Attribute Table Mechanism” in APM2.
-            15 CMOV: Conditional move instructions, CMOV, FCMOV. See “CMOV”, “FCMOV” in APM3.
-            14 MCA: Machine check architecture, MCG_CAP. See “Machine Check Mechanism” in APM2.
-            13 PGE: Page global extension, CR4.PGE. See “Page Translation and Protection” in APM2.
-            12 MTRR: Memory-type range registers. MTRRcap supported. See “Page Translation and Protection” in APM2.
-            11 SysEnterSysExit: SYSENTER and SYSEXIT instructions. See “SYSENTER”, “SYSEXIT“ in APM3.
-            10 Reserved
-            9 APIC. Advanced programmable interrupt controller (APIC) exists and is enabled. See “Exceptions
-               and Interrupts” in APM2.
-            8 CMPXCHG8B: CMPXCHG8B instruction. See “CMPXCHG8B” in APM3.
-            7 MCE: Machine check exception, CR4.MCE. See “Machine Check Mechanism” in APM2.
-            6 PAE: Physical-address extensions (PAE), support for physical addresses ≥ 32b. Number of physical
-               address bits above 32b is implementation specific. See “Page Translation and Protection” in APM2.
-            5 MSR: AMD model-specific registers (MSRs), with RDMSR and WRMSR instructions. See “Model
-               Specific Registers” in APM2.
-            4 TSC: Time stamp counter. RDTSC and RDTSCP instruction support. See “Debug and Performance
-               Resources” in APM2.
-            3 PSE: Page-size extensions (4 MB pages). See “Page Translation and Protection” in APM2.
-            2 DE: Debugging extensions, I/O breakpoints, CR4.DE. See “Debug and Performance Resources” in
-                 APM2.
-            1 VME: Virtual-mode enhancements, CR4.VME, CR4.PVI, software interrupt indirection, expansion
-             of the TSS with the software, indirection bitmap, EFLAGS.VIF, EFLAGS.VIP. See “System
-              Resources” in APM2.
-            0 FPU: x87 floating point unit on-chip. See “x87 Floating Point Programming” in APM1
-         */
-
-        properties.push_back(CpuProperty("MMX", CPUInfoGetEDX(info, 23)));
-        properties.push_back(CpuProperty("SSE", CPUInfoGetEDX(info, 25)));
-        properties.push_back(CpuProperty("SSE2", CPUInfoGetEDX(info, 26)));
-
-        /*
-         0x00000001 - ECX :
-            0 SSE3 Streaming SIMD Extensions 3 (SSE3). A value of 1 indicates the processor supports this
-            technology.
-            1 PCLMULQDQ PCLMULQDQ. A value of 1 indicates the processor supports the PCLMULQDQ instruction
-            2 DTES64 64-bit DS Area. A value of 1 indicates the processor supports DS area using 64-bit layout
-            3 MONITOR MONITOR/MWAIT. A value of 1 indicates the processor supports this feature.
-            4 DS-CPL CPL Qualified Debug Store. A value of 1 indicates the processor supports the extensions to the
-            Debug Store feature to allow for branch message storage qualified by CPL.
-            5 VMX Virtual Machine Extensions. A value of 1 indicates that the processor supports this technology
-            6 SMX Safer Mode Extensions. A value of 1 indicates that the processor supports this technology. See
-            Chapter 5, “Safer Mode Extensions Reference”.
-            7 EIST Enhanced Intel SpeedStep® technology. A value of 1 indicates that the processor supports this
-            technology.
-            8 TM2 Thermal Monitor 2. A value of 1 indicates whether the processor supports this technology.
-            9 SSSE3 A value of 1 indicates the presence of the Supplemental Streaming SIMD Extensions 3 (SSSE3). A
-            value of 0 indicates the instruction extensions are not present in the processor
-            10 CNXT-ID L1 Context ID. A value of 1 indicates the L1 data cache mode can be set to either adaptive mode
-            or shared mode. A value of 0 indicates this feature is not supported. See definition of the
-            IA32_MISC_ENABLE MSR Bit 24 (L1 Data Cache Context Mode) for details.
-            11 SDBG A value of 1 indicates the processor supports IA32_DEBUG_INTERFACE MSR for silicon debug.
-            12 FMA A value of 1 indicates the processor supports FMA extensions using YMM state.
-            13 CMPXCHG16B CMPXCHG16B Available. A value of 1 indicates that the feature is available. See the
-            “CMPXCHG8B/CMPXCHG16B—Compare and Exchange Bytes” section in this chapter for a
-            description.
-            14 xTPR Update
-            Control
-            xTPR Update Control. A value of 1 indicates that the processor supports changing
-            IA32_MISC_ENABLE[bit 23].
-            15 PDCM Perfmon and Debug Capability: A value of 1 indicates the processor supports the performance
-            and debug feature indication MSR IA32_PERF_CAPABILITIES.
-            16 Reserved Reserved
-            17 PCID Process-context identifiers. A value of 1 indicates that the processor supports PCIDs and that
-            software may set CR4.PCIDE to 1.
-            18 DCA A value of 1 indicates the processor supports the ability to prefetch data from a memory mapped
-            device.
-            19 SSE4.1 A value of 1 indicates that the processor supports SSE4.1.
-            20 SSE4.2 A value of 1 indicates that the processor supports SSE4.2.
-            21 x2APIC A value of 1 indicates that the processor supports x2APIC feature.
-            22 MOVBE A value of 1 indicates that the processor supports MOVBE instruction.
-            23 POPCNT A value of 1 indicates that the processor supports the POPCNT instruction.
-            24 TSC-Deadline A value of 1 indicates that the processor’s local APIC timer supports one-shot operation using a
-            TSC deadline value.
-            25 AESNI A value of 1 indicates that the processor supports the AESNI instruction extensions.
-            26 XSAVE A value of 1 indicates that the processor supports the XSAVE/XRSTOR processor extended states
-            feature, the XSETBV/XGETBV instructions, and XCR0.
-            27 OSXSAVE A value of 1 indicates that the OS has set CR4.OSXSAVE[bit 18] to enable the XSAVE feature set.
-            28 AVX A value of 1 indicates the processor supports the AVX instruction extensions.
-            29 F16C A value of 1 indicates that processor supports 16-bit floating-point conversion instructions.
-            30 RDRAND A value of 1 indicates that processor supports RDRAND instruction.
-            31 Not Used Always returns 0.
-          */
-        properties.push_back(CpuProperty("SSE3", CPUInfoGetECX(info,  0)));
-        properties.push_back(CpuProperty("SSSE3", CPUInfoGetECX(info,  9)));
-        properties.push_back(CpuProperty("SSE41", CPUInfoGetECX(info, 19)));
-        properties.push_back(CpuProperty("SSE42", CPUInfoGetECX(info, 20)));
-        properties.push_back(CpuProperty("AVX",  CPUInfoGetECX(info, 28)));
-        properties.push_back(CpuProperty("FMA3", CPUInfoGetECX(info, 12)));
-	}
-
-    if (limitExtendedFunction >= 0x80000001){
-        cpuid(info,0x80000001); // Extended Function CPUID Information
-        /*
-        0x80000001 - EDX :
-            31 3DNow: 3DNow!TM instructions. See Appendix D “Instruction Subsets and CPUID Feature Sets” in APM3.
-            30 3DNowExt: AMD extensions to 3DNow! instructions. See Appendix D “Instruction Subsets and
-                CPUID Feature Sets” in APM3.
-            29 LM: Long mode. See “Processor Initialization and Long-Mode Activation” in APM2.
-            28 Reserved.
-            27 RDTSCP: RDTSCP instruction. See “RDTSCP” in APM3.
-            26 Page1GB: 1-GB large page support. See “1-GB Paging Support” in APM2.
-            25 FFXSR: FXSAVE and FXRSTOR instruction optimizations. See “FXSAVE” and “FXRSTOR” in APM4.
-            24 FXSR: FXSAVE and FXRSTOR instructions. Same as CPUID Fn0000_0001_EDX[FXSR].
-            23 MMX: MMXTM instructions. Same as CPUID Fn0000_0001_EDX[MMX].
-            22 MmxExt: AMD extensions to MMX instructions. See Appendix D “Instruction Subsets and CPUID
-                Feature Sets” in APM3 and “128-Bit Media and Scientific Programming” in APM1.
-            21 Reserved.
-            20 NX: No-execute page protection. See “Page Translation and Protection” in APM2.
-            19:18 Reserved.
-            17 PSE36: Page-size extensions. Same as CPUID Fn0000_0001_EDX[PSE36].
-            16 PAT: Page attribute table. Same as CPUID Fn0000_0001_EDX[PAT].
-            15 CMOV: Conditional move instructions. Same as CPUID Fn0000_0001_EDX[CMOV]
-            14 MCA: Machine check architecture. Same as CPUID Fn0000_0001_EDX[MCA].
-            13 PGE: Page global extension. Same as CPUID Fn0000_0001_EDX[PGE].
-            12 MTRR: Memory-type range registers. Same as CPUID Fn0000_0001_EDX[MTRR].
-            11 SysCallSysRet: SYSCALL and SYSRET instructions. See “SYSCALL” and “SYSRET” in APM3.
-            10 Reserved.
-            9 APIC. Advanced programmable interrupt controller. Same as CPUID Fn0000_0001_EDX[APIC].
-            8 CMPXCHG8B: CMPXCHG8B instruction. Same as CPUID Fn0000_0001_EDX[CMPXCHG8B].
-            7 MCE: Machine check exception. Same as CPUID Fn0000_0001_EDX[MCE].
-            6 PAE: Physical-address extensions. Same as CPUID Fn0000_0001_EDX[PAE].
-            5 MSR: AMD model-specific registers. Same as CPUID Fn0000_0001_EDX[MSR].
-            4 TSC: Time stamp counter. Same as CPUID Fn0000_0001_EDX[TSC].
-            3 PSE: Page-size extensions. Same as CPUID Fn0000_0001_EDX[PSE].
-            2 DE: Debugging extensions. Same as CPUID Fn0000_0001_EDX[DE].
-            1 VME: Virtual-mode enhancements. Same as CPUID Fn0000_0001_EDX[VME].
-            0 FPU: x87 floating-point unit on-chip. Same as CPUID Fn0000_0001_EDX[FPU].
-          */
-        properties.push_back(CpuProperty("x64", CPUInfoGetEDX(info, 29)));
-        /*
-        0x80000001 - ECX :
-            31:14 Reserved.
-            13 WDT: Watchdog timer support.
-            12 SKINIT: SKINIT, STGI, and DEV support.
-            11:10 Reserved.
-            9 OSVW: OS visible workaround. Indicates OS-visible workaround support. See “OS Visible Work-
-             around (OSVW) Information” in APM2.
-            8 3DNowPrefetch: PREFETCH and PREFETCHW instruction support. See “PREFETCH” and “PREFETCHW” in APM3.
-            7 MisAlignSse: Misaligned SSE mode. See “Misaligned Access Support Added for SSE Instructions”
-                 in APM1.
-            6 SSE4A: EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support. See “EXTRQ”,
-                 “INSERTQ”, “MOVNTSS”, and “MOVNTSD” in APM4.
-            5 ABM: Advanced bit manipulation. LZCNT instruction support. See “LZCNT” in APM3.
-            4 AltMovCr8: LOCK MOV CR0 means MOV CR8. See “MOV(CRn)” in APM3.
-            3 ExtApicSpace: This bit indicates the presence of extended APIC register space starting at offset
-             400h from the “APIC Base Address Register,” as specified in the BKDG..
-            2 SVM: Secure virtual machine feature. See “Secure Virtual Machine” in APM2.
-            1 CmpLegacy: Core multi-processing legacy mode. See “Legacy Method” on page 23.
-            0 LahfSahf: LAHF and SAHF instruction support in 64-bit mode. See “LAHF” and “SAHF” in   APM3.
-          */
-        properties.push_back(CpuProperty("SSE4a", CPUInfoGetECX(info,  6)));
-        properties.push_back(CpuProperty("FMA4", CPUInfoGetECX(info, 16)));
-        properties.push_back(CpuProperty("XOP", CPUInfoGetECX(info, 11)));
-	}
-
-    return properties;
-}
-#else
-// POWER (IBM)
-#endif
-
-
-///////////////////////////////////////////////////////////////////////////
-// Part 3:
-// Print the information in a format to use it with CMake
-///////////////////////////////////////////////////////////////////////////
-
-#include <iostream>
-
-int main(){
-    const std::list<CpuProperty> properties = getProperties();
-
-    const std::list<CpuProperty>::const_iterator endIterProperties = properties.end();
-    for(std::list<CpuProperty>::const_iterator iterProperties = properties.begin()
-            ; iterProperties != endIterProperties
-            ; ++iterProperties){
-        // Print the status
-        std::cout << (*iterProperties).name << "=" << ((*iterProperties).enabled?"TRUE":"FALSE") << ";";
-    }
-
-    return 0;
-}
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ The following are optional:

 ### Get and Build ScalFMM
 To use last development states of ScalFMM, please clone the develop
-  branch. Note that ScalFMM contains a git submodule `morse_cmake`.
+  branch. Note that ScalFMM contains two git submodules `morse_cmake` and `inastemp`.
  To get sources please use these commands:
 ``` bash
 git clone --recursive git@gitlab.inria.fr:solverstack/ScalFMM.git -b develop

--- a/Src/Adaptive/FAdaptChebKernel.hpp
+++ b/Src/Adaptive/FAdaptChebKernel.hpp
@@ -95,13 +95,13 @@ public:
        // apply P2L
        for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
            for (unsigned int m = 0; m<FBase::nnodes; ++m) {
-                ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX());
-                ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY());
-                ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ());
+                ComputeClass XX = ComputeClass(X[m].getX());
+                ComputeClass XY = ComputeClass(X[m].getY());
+                ComputeClass XZ = ComputeClass(X[m].getZ());

                std::size_t idxPart = 0;
                // Compute using vectorization for all but the last array elements
-                ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>();
+                ComputeClass tmpLocalExp = ComputeClass(0);
                for (;
                     idxPart < ((particles->getNbParticles())
                                / FRealCount);
@@ -114,7 +114,7 @@ public:
                         * physicalValues[idxPart];
                 }

-                local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp);
+                local->get(idxRhs)[m] += FReal(tmpLocalExp);

                 // Compute the last array elements one by one if they exist
                 if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) {
@@ -172,11 +172,11 @@ public:
            for (unsigned int n=0; n<FBase::nnodes; ++n){

                ComputeClass  MultipoleExpansion =
-                    FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]);
+                    ComputeClass(pole->get(idxRhs)[n]);

-                ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX());
-                ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY());
-                ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ());
+                ComputeClass YX = ComputeClass(Y[n].getX());
+                ComputeClass YY = ComputeClass(Y[n].getY());
+                ComputeClass YZ = ComputeClass(Y[n].getZ());

                for(std::size_t idxPart = 0;
                    idxPart < ( (particles->getNbParticles() + FRealCount - 1)

--- a/Src/Adaptive/FAdaptUnifKernel.hpp
+++ b/Src/Adaptive/FAdaptUnifKernel.hpp
@@ -140,11 +140,11 @@ public:
        // apply P2L
        for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
            for (unsigned int m = 0; m < FBase::nnodes; ++m) {
-                ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX());
-                ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY());
-                ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ());
+                ComputeClass XX = ComputeClass(X[m].getX());
+                ComputeClass XY = ComputeClass(X[m].getY());
+                ComputeClass XZ = ComputeClass(X[m].getZ());

-                ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>();
+                ComputeClass tmpLocalExp = ComputeClass(0);
                // Compute using vectorization for all but the last array elements
                std::size_t idxPart = 0;
                for (; idxPart < (particles->getNbParticles() / FRealCount);
@@ -157,7 +157,7 @@ public:
                        * physicalValues[idxPart];
                }

-                local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp);
+                local->get(idxRhs)[m] += FReal(tmpLocalExp);

                // Compute the last array elements one by one if they exist
                if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) {
@@ -221,11 +221,11 @@ public:
            for (unsigned int n=0; n<FBase::nnodes; ++n){

                ComputeClass  MultipoleExpansion =
-                    FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]);
+                    ComputeClass(pole->get(idxRhs)[n]);

-                ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX());
-                ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY());
-                ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ());
+                ComputeClass YX = ComputeClass(Y[n].getX());
+                ComputeClass YY = ComputeClass(Y[n].getY());
+                ComputeClass YZ = ComputeClass(Y[n].getZ());

                for(std::size_t idxPart = 0;
                    idxPart < ( (particles->getNbParticles() + FRealCount - 1)

--- a/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
@@ -89,7 +89,7 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffx = (xt-xs);
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
-        return FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
+        return ValueClass(1) / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
    }

    // evaluate interaction (blockwise)
@@ -110,7 +110,7 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffx = (xt-xs);
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
-        const ValueClass one_over_r = FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
+        const ValueClass one_over_r = ValueClass(1) / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);

        const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;

@@ -176,9 +176,9 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
        const ValueClass diffx = (xt-xs);
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
-        return FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx +
-                                       FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy +
-                                       FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz);
+        return ValueClass(1) / FMath::Sqrt(ValueClass(LX)*diffx*diffx +
+                                       ValueClass(LY)*diffy*diffy +
+                                       ValueClass(LZ)*diffz*diffz);
    }
    void setCoeff(const FReal& a,  const FReal& b, const FReal& c)
    {LX= a*a ; LY = b*b ; LZ = c *c;}
@@ -208,16 +208,16 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
         const ValueClass diffx = (xt-xs);
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
-        const ValueClass one_over_rL = FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx +
-                                                          FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy +
-                                                          FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz);
+        const ValueClass one_over_rL = ValueClass(1) / FMath::Sqrt(ValueClass(LX)*diffx*diffx +
+                                                          ValueClass(LY)*diffy*diffy +
+                                                          ValueClass(LZ)*diffz*diffz);
         const ValueClass one_over_rL3 = one_over_rL*one_over_rL*one_over_rL;

         block[0] = one_over_rL;

-        blockDerivative[0] = FMath::ConvertTo<ValueClass,FReal>(LX) * one_over_rL3 * diffx;
-        blockDerivative[1] = FMath::ConvertTo<ValueClass,FReal>(LY)* one_over_rL3 * diffy;
-        blockDerivative[2] = FMath::ConvertTo<ValueClass,FReal>(LZ)* one_over_rL3 * diffz;
+        blockDerivative[0] = ValueClass(LX) * one_over_rL3 * diffx;
+        blockDerivative[1] = ValueClass(LY)* one_over_rL3 * diffy;
+        blockDerivative[2] = ValueClass(LZ)* one_over_rL3 * diffz;

     }

@@ -283,7 +283,7 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffx = (xt-xs);
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
-        return FMath::One<ValueClass>() / FReal(diffx*diffx+diffy*diffy+diffz*diffz);
+        return ValueClass(1) / (diffx*diffx+diffy*diffy+diffz*diffz);
     }

     // evaluate interaction (blockwise)
@@ -305,12 +305,12 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
        const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
-        const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2);
+        const ValueClass one_over_r2 = ValueClass(1) / (r2);
        const ValueClass one_over_r4 = one_over_r2*one_over_r2;

        block[0] = one_over_r2;

-        const ValueClass coef = FMath::ConvertTo<ValueClass,FReal>(-2.) * one_over_r4;
+        const ValueClass coef = ValueClass(-2.) * one_over_r4;
        blockDerivative[0] = coef * diffx;
        blockDerivative[1] = coef * diffy;
        blockDerivative[2] = coef * diffz;
@@ -382,7 +382,7 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffz = (zt-zs);
        const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz);
        const ValueClass r3 = r*r*r;
-        const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3);
+        const ValueClass one_over_r6 = ValueClass(1) / (r3*r3);
        //return one_over_r6 * one_over_r6;
        //return one_over_r6;
        return one_over_r6 * one_over_r6 - one_over_r6;
@@ -409,12 +409,12 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
         const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz);
         const ValueClass r2 = r*r;
         const ValueClass r3 = r2*r;
-        const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3);
+        const ValueClass one_over_r6 = ValueClass(1) / (r3*r3);
         const ValueClass one_over_r8 = one_over_r6 / (r2);

         block[0] = one_over_r6 * one_over_r6 - one_over_r6;

-        const FReal coef = FMath::ConvertTo<ValueClass,FReal>(12.0)*one_over_r6*one_over_r8 - FMath::ConvertTo<ValueClass,FReal>(6.0)*one_over_r8;
+        const ValueClass coef = ValueClass(12.0)*one_over_r6*one_over_r8 - ValueClass(6.0)*one_over_r8;
         blockDerivative[0]= coef * diffx;
         blockDerivative[1]= coef * diffy;
         blockDerivative[2]= coef * diffz;
@@ -493,7 +493,7 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
        const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
-        return FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth));
+        return ValueClass(1) / (r2 + ValueClass(CoreWidth));
    }

    // evaluate interaction (blockwise)
@@ -515,13 +515,13 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
        const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
-        const ValueClass one_over_a_plus_r2 = FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth));
+        const ValueClass one_over_a_plus_r2 = ValueClass(1) / (r2 + ValueClass(CoreWidth));
        const ValueClass one_over_a_plus_r2_squared = one_over_a_plus_r2*one_over_a_plus_r2;

        block[0] = one_over_a_plus_r2;

        // TODO Fix derivative
-        const ValueClass coef = FMath::ConvertTo<ValueClass,FReal>(-2.) * one_over_a_plus_r2_squared;
+        const ValueClass coef = ValueClass(-2.) * one_over_a_plus_r2_squared;
        blockDerivative[0] = coef * diffx;
        blockDerivative[1] = coef * diffy;
        blockDerivative[2] = coef * diffz;

--- a/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp
@@ -120,13 +120,13 @@ struct FInterpMatrixKernelGauss : FAbstractCorrelationKernel<FReal>
  {
    const ValueClass diff[3] = {(x1-x2),(y1-y2),(z1-z2)};

-    ValueClass dist2 = FMath::Zero<ValueClass>();
+    ValueClass dist2 = ValueClass(0.);
    for(int d=0; d<3; ++d){
-      const ValueClass distX = diff[d] / FMath::ConvertTo<ValueClass,FReal>(lengthScale_);
+      const ValueClass distX = diff[d] / ValueClass(lengthScale_);
      dist2 += distX*distX;
    }

-    return FMath::Exp(FMath::ConvertTo<ValueClass,FReal>(-0.5)*dist2);
+    return FMath::Exp(ValueClass(-0.5)*dist2);

  }

@@ -145,7 +145,7 @@ struct FInterpMatrixKernelGauss : FAbstractCorrelationKernel<FReal>
                                  ValueClass block[1], ValueClass blockDerivative[3]) const
  {
    block[0]=this->evaluate(x1,y1,z1,x2,y2,z2);
-    const ValueClass lengthScaleOpt = FMath::ConvertTo<ValueClass,FReal>(-1/(lengthScale_*lengthScale_));
+    const ValueClass lengthScaleOpt = ValueClass(-1/(lengthScale_*lengthScale_));
    blockDerivative[0] = block[0]*(x1-x2) * lengthScaleOpt;
    blockDerivative[1] = block[0]*(y1-y2) * lengthScaleOpt;
    blockDerivative[2] = block[0]*(z1-z2) * lengthScaleOpt;

--- a/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp
@@ -161,7 +161,7 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
        const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz;
-        const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r = ValueClass(1)/FMath::Sqrt(r2 + ValueClass(_CoreWidth2));
        const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
        ValueClass ri,rj;

@@ -192,7 +192,7 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
        const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz;
-        const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r = ValueClass(1)/FMath::Sqrt(r2 + ValueClass(_CoreWidth2));
        const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;

        const ValueClass r[3] = {diffx,diffy,diffz};
@@ -219,14 +219,14 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
        const ValueClass diffy = (yt-ys);
        const ValueClass diffz = (zt-zs);
        const ValueClass r2[3] = {diffx*diffx,diffy*diffy,diffz*diffz};
-        const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2[0] + r2[1] + r2[2] + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r2 = ValueClass(1) / (r2[0] + r2[1] + r2[2] + ValueClass(_CoreWidth2));
        const ValueClass one_over_r  = FMath::Sqrt(one_over_r2);
        const ValueClass one_over_r3 = one_over_r2*one_over_r;

        const ValueClass r[3] = {diffx,diffy,diffz};

-        const ValueClass Three = FMath::ConvertTo<ValueClass,FReal>(3.);
-        const ValueClass MinusOne = - FMath::One<ValueClass>();
+        const ValueClass Three = ValueClass(3.);
+        const ValueClass MinusOne = - ValueClass(1);

        for(unsigned int d=0;d<NCMP;++d){
            unsigned int i = indexTab[d];

--- a/Src/Kernels/P2P/FP2P.hpp
+++ b/Src/Kernels/P2P/FP2P.hpp
@@ -131,14 +131,14 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets,
            ComputeClass*const sourcesPotentials = (ComputeClass*)inNeighbors[idxNeighbors]->getPotentials();

            for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-                const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-                const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-                const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-                const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-                ComputeClass  tfx = FMath::Zero<ComputeClass>();
-                ComputeClass  tfy = FMath::Zero<ComputeClass>();
-                ComputeClass  tfz = FMath::Zero<ComputeClass>();
-                ComputeClass  tpo = FMath::Zero<ComputeClass>();
+                const ComputeClass tx = ComputeClass(targetsX[idxTarget]);
+                const ComputeClass ty = ComputeClass(targetsY[idxTarget]);
+                const ComputeClass tz = ComputeClass(targetsZ[idxTarget]);
+                const ComputeClass tv = ComputeClass(targetsPhysicalValues[idxTarget]);
+                ComputeClass  tfx = ComputeClass(0.);
+                ComputeClass  tfy = ComputeClass(0.);
+                ComputeClass  tfz = ComputeClass(0.);
+                ComputeClass  tpo = ComputeClass(0.);

                for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                    ComputeClass Kxy[1];
@@ -146,7 +146,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets,
                    MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz,
                                                             sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
                                                             Kxy,dKxy);
-                    const ComputeClass mutual_coeff = FMath::ConvertTo<ComputeClass, FReal>(MatrixKernel->getMutualCoefficient());; // 1 if symmetric; -1 if antisymmetric
+                    const ComputeClass mutual_coeff = ComputeClass(MatrixKernel->getMutualCoefficient());; // 1 if symmetric; -1 if antisymmetric

                    const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]);

@@ -165,10 +165,10 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets,
                    sourcesPotentials[idxSource] += mutual_coeff * Kxy[0] * tv;
                }

-                targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-                targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-                targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-                targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+                targetsForcesX[idxTarget] += tfx.horizontalSum();
+                targetsForcesY[idxTarget] += tfy.horizontalSum();
+                targetsForcesZ[idxTarget] += tfz.horizontalSum();
+                targetsPotentials[idxTarget] += tpo.horizontalSum();
            }
        }
    }
@@ -201,14 +201,14 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
        ComputeClass*const sourcesPotentials = (ComputeClass*)targetsPotentials;

        for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-            const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-            const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-            const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-            const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-            ComputeClass  tfx = FMath::Zero<ComputeClass>();
-            ComputeClass  tfy = FMath::Zero<ComputeClass>();
-            ComputeClass  tfz = FMath::Zero<ComputeClass>();
-            ComputeClass  tpo = FMath::Zero<ComputeClass>();
+            const ComputeClass tx = ComputeClass(targetsX[idxTarget]);
+            const ComputeClass ty = ComputeClass(targetsY[idxTarget]);
+            const ComputeClass tz = ComputeClass(targetsZ[idxTarget]);
+            const ComputeClass tv = ComputeClass(targetsPhysicalValues[idxTarget]);
+            ComputeClass  tfx = ComputeClass(0.);
+            ComputeClass  tfy = ComputeClass(0.);
+            ComputeClass  tfz = ComputeClass(0.);
+            ComputeClass  tpo = ComputeClass(0.);

            for(FSize idxSource = (idxTarget+NbFRealInComputeClass)/NbFRealInComputeClass ; idxSource < nbParticlesSources ; ++idxSource){
                ComputeClass Kxy[1];
@@ -216,7 +216,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
                MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz,
                                                         sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
                                                         Kxy,dKxy);
-                const ComputeClass mutual_coeff = FMath::ConvertTo<ComputeClass, FReal>(MatrixKernel->getMutualCoefficient()); // 1 if symmetric; -1 if antisymmetric
+                const ComputeClass mutual_coeff = ComputeClass(MatrixKernel->getMutualCoefficient()); // 1 if symmetric; -1 if antisymmetric

                const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]);

@@ -227,18 +227,18 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
                tfx += dKxy[0];
                tfy += dKxy[1];
                tfz += dKxy[2];
-        		tpo = FMath::FMAdd(Kxy[0],sourcesPhysicalValues[idxSource],tpo);
+                tpo += Kxy[0]*sourcesPhysicalValues[idxSource];

                sourcesForcesX[idxSource] -= dKxy[0];
                sourcesForcesY[idxSource] -= dKxy[1];
                sourcesForcesZ[idxSource] -= dKxy[2];
-        		sourcesPotentials[idxSource] = FMath::FMAdd(mutual_coeff * Kxy[0],tv,sourcesPotentials[idxSource]);
+                sourcesPotentials[idxSource] += (mutual_coeff * Kxy[0])*tv;
            }

-            targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-            targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-            targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-            targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+            targetsForcesX[idxTarget] += tfx.horizontalSum();
+            targetsForcesY[idxTarget] += tfy.horizontalSum();
+            targetsForcesZ[idxTarget] += tfz.horizontalSum();
+            targetsPotentials[idxTarget] += tpo.horizontalSum();
        }
    }

@@ -295,14 +295,14 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
            const ComputeClass*const sourcesZ = (const ComputeClass*)inNeighbors[idxNeighbors]->getPositions()[2];

            for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-                const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-                const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-                const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-                const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-                ComputeClass  tfx = FMath::Zero<ComputeClass>();
-                ComputeClass  tfy = FMath::Zero<ComputeClass>();
-                ComputeClass  tfz = FMath::Zero<ComputeClass>();
-                ComputeClass  tpo = FMath::Zero<ComputeClass>();
+                const ComputeClass tx = ComputeClass(targetsX[idxTarget]);
+                const ComputeClass ty = ComputeClass(targetsY[idxTarget]);
+                const ComputeClass tz = ComputeClass(targetsZ[idxTarget]);
+                const ComputeClass tv = ComputeClass(targetsPhysicalValues[idxTarget]);
+                ComputeClass  tfx = ComputeClass(0.);
+                ComputeClass  tfy = ComputeClass(0.);
+                ComputeClass  tfz = ComputeClass(0.);
+                ComputeClass  tpo = ComputeClass(0.);

                for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                    ComputeClass Kxy[1];
@@ -322,10 +322,10 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                    tpo += Kxy[0] * sourcesPhysicalValues[idxSource];
                }

-                targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-                targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-                targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-                targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+                targetsForcesX[idxTarget] += tfx.horizontalSum();
+                targetsForcesY[idxTarget] += tfy.horizontalSum();
+                targetsForcesZ[idxTarget] += tfz.horizontalSum();
+                targetsPotentials[idxTarget] += tpo.horizontalSum();
            }
        }
    }
@@ -337,107 +337,26 @@ template <class FReal>
 struct FP2PT{
 };

-#if defined(SCALFMM_USE_AVX)
-template <>
-struct FP2PT<double>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-
-template <>
-struct FP2PT<float>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, MatrixKernel);
-    }
+#include "InastempCompileConfig.h"

-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-#elif defined(SCALFMM_USE_AVX2)
 template <>
 struct FP2PT<double>{
    template <class ContainerClass, class MatrixKernelClass>
    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-
-template <>
-struct FP2PT<float>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
+        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
    }


    template <class ContainerClass, class MatrixKernelClass>
    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, MatrixKernel);
+        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, MatrixKernel);
    }

    template <class ContainerClass, class MatrixKernelClass>
    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-#elif defined(SCALFMM_USE_SSE)
-template <>
-struct FP2PT<double>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
+        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
    }
 };

@@ -446,62 +365,21 @@ struct FP2PT<float>{
    template <class ContainerClass, class MatrixKernelClass>
    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-#else
-template <>
-struct FP2PT<double>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
+        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
    }

    template <class ContainerClass, class MatrixKernelClass>
    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, MatrixKernel);
+        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, MatrixKernel);
    }

    template <class ContainerClass, class MatrixKernelClass>
    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
+        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
    }
 };

-template <>
-struct FP2PT<float>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-#endif

 #include "FP2PTensorialKij.hpp"


--- a/Src/Kernels/P2P/FP2PR.hpp
+++ b/Src/Kernels/P2P/FP2PR.hpp
@@ -81,7 +81,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
    FReal*const targetsForcesZ = inTargets->getForcesZ();
    FReal*const targetsPotentials = inTargets->getPotentials();

-    const ComputeClass mOne = FMath::One<ComputeClass>();
+    const ComputeClass mOne = ComputeClass(1);

    for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
        if( inNeighbors[idxNeighbors] ){
@@ -96,14 +96,14 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
            ComputeClass*const sourcesPotentials = (ComputeClass*)inNeighbors[idxNeighbors]->getPotentials();

            for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-                const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-                const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-                const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-                const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-                ComputeClass  tfx = FMath::Zero<ComputeClass>();
-                ComputeClass  tfy = FMath::Zero<ComputeClass>();
-                ComputeClass  tfz = FMath::Zero<ComputeClass>();
-                ComputeClass  tpo = FMath::Zero<ComputeClass>();
+                const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+                const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+                const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+                const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+                ComputeClass  tfx = ComputeClass(0.);
+                ComputeClass  tfy = ComputeClass(0.);
+                ComputeClass  tfz = ComputeClass(0.);
+                ComputeClass  tpo = ComputeClass(0.);

                for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                    ComputeClass dx = tx - sourcesX[idxSource];
@@ -111,7 +111,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
                    ComputeClass dz = tz - sourcesZ[idxSource];

                    ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
-                    const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
+                    const ComputeClass inv_distance = inv_square_distance.sqrt();

                    inv_square_distance *= inv_distance;
                    inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
@@ -131,10 +131,10 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
                    sourcesPotentials[idxSource] += inv_distance * tv;
                }

-                targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-                targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-                targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-                targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+                targetsForcesX[idxTarget] += tfx.horizontalSum();
+                targetsForcesY[idxTarget] += tfy.horizontalSum();
+                targetsForcesZ[idxTarget] += tfz.horizontalSum();
+                targetsPotentials[idxTarget] += tpo.horizontalSum();
            }
        }
    }
@@ -153,7 +153,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
    FReal*const targetsForcesZ = inTargets->getForcesZ();
    FReal*const targetsPotentials = inTargets->getPotentials();

-    const ComputeClass mOne = FMath::One<ComputeClass>();
+    const ComputeClass mOne = ComputeClass(1);

    {//In this part, we compute (vectorially) the interaction
        //within the target leaf.
@@ -169,14 +169,14 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
        ComputeClass*const sourcesPotentials = (ComputeClass*)targetsPotentials;

        for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-            const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-            const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-            const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-            const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-            ComputeClass  tfx = FMath::Zero<ComputeClass>();
-            ComputeClass  tfy = FMath::Zero<ComputeClass>();
-            ComputeClass  tfz = FMath::Zero<ComputeClass>();
-            ComputeClass  tpo = FMath::Zero<ComputeClass>();
+            const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+            const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+            const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+            const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+            ComputeClass  tfx = ComputeClass(0.);
+            ComputeClass  tfy = ComputeClass(0.);
+            ComputeClass  tfz = ComputeClass(0.);
+            ComputeClass  tpo = ComputeClass(0.);

            for(FSize idxSource = (idxTarget+NbFRealInComputeClass)/NbFRealInComputeClass ; idxSource < nbParticlesSources ; ++idxSource){

@@ -184,7 +184,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
                ComputeClass dy = ty - sourcesY[idxSource];
                ComputeClass dz = tz - sourcesZ[idxSource];
                ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
-                const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
+                const ComputeClass inv_distance = inv_square_distance.sqrt();

                inv_square_distance *= inv_distance;
                inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
@@ -204,10 +204,10 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
                sourcesPotentials[idxSource] += inv_distance * tv;
            }

-            targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-            targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-            targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-            targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+            targetsForcesX[idxTarget] += tfx.horizontalSum();
+            targetsForcesY[idxTarget] += tfy.horizontalSum();
+            targetsForcesZ[idxTarget] += tfz.horizontalSum();
+            targetsPotentials[idxTarget] += tpo.horizontalSum();
        }
    }

@@ -255,7 +255,7 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
    FReal*const targetsForcesZ = inTargets->getForcesZ();
    FReal*const targetsPotentials = inTargets->getPotentials();

-    const ComputeClass mOne = FMath::One<ComputeClass>();
+    const ComputeClass mOne = ComputeClass(1);

    for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
        if( inNeighbors[idxNeighbors] ){
@@ -266,14 +266,14 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
            const ComputeClass*const sourcesZ = (const ComputeClass*)inNeighbors[idxNeighbors]->getPositions()[2];

            for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-                const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-                const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-                const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-                const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-                ComputeClass  tfx = FMath::Zero<ComputeClass>();
-                ComputeClass  tfy = FMath::Zero<ComputeClass>();
-                ComputeClass  tfz = FMath::Zero<ComputeClass>();
-                ComputeClass  tpo = FMath::Zero<ComputeClass>();
+                const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+                const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+                const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+                const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+                ComputeClass  tfx = ComputeClass(0.);
+                ComputeClass  tfy = ComputeClass(0.);
+                ComputeClass  tfz = ComputeClass(0.);
+                ComputeClass  tpo = ComputeClass(0.);

                for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                    ComputeClass dx = tx - sourcesX[idxSource];
@@ -281,7 +281,7 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                    ComputeClass dz = tz - sourcesZ[idxSource];

                    ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
-                    const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
+                    const ComputeClass inv_distance = inv_square_distance.sqrt();

                    inv_square_distance *= inv_distance;
                    inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
@@ -296,10 +296,10 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                    tpo += inv_distance * sourcesPhysicalValues[idxSource];
                }

-                targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-                targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-                targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-                targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+                targetsForcesX[idxTarget] += tfx.horizontalSum();
+                targetsForcesY[idxTarget] += tfy.horizontalSum();
+                targetsForcesZ[idxTarget] += tfz.horizontalSum();
+                targetsPotentials[idxTarget] += tpo.horizontalSum();
            }
        }
    }
@@ -311,25 +311,25 @@ template <class FReal>
 struct FP2PRT{
 };

-#if defined(SCALFMM_USE_AVX)
+#include "InastempCompileConfig.h"

 template <>
 struct FP2PRT<double>{
    template <class ContainerClass>
    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<double, ContainerClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors);
+        FP2PR::GenericFullMutual<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors);
    }

    template <class ContainerClass>
    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<double, ContainerClass, __m256d, 4>(inTargets);
+        FP2PR::GenericInner<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets);
    }

    template <class ContainerClass>
    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<double, ContainerClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors);
+        FP2PR::GenericFullRemote<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors);
    }
 };

@@ -338,143 +338,21 @@ struct FP2PRT<float>{
    template <class ContainerClass>
    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors);
+        FP2PR::GenericFullMutual<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors);
    }

    template <class ContainerClass>
    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m256, 8>(inTargets);
+        FP2PR::GenericInner<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets);
    }

    template <class ContainerClass>
    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<float, ContainerClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-#elif defined(SCALFMM_USE_AVX2)
-template <>
-struct FP2PRT<double>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<double, ContainerClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<double, ContainerClass, __m512d, 8>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<double, ContainerClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-
-template <>
-struct FP2PRT<float>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m512, 16>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<float, ContainerClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-
-#elif defined(SCALFMM_USE_SSE)
-template <>
-struct FP2PRT<double>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<double, ContainerClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<double, ContainerClass, __m128d, 2>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<double, ContainerClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors);
+        FP2PR::GenericFullRemote<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors);
    }
 };

-template <>
-struct FP2PRT<float>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<float, ContainerClass, __m128, 4>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<float, ContainerClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-
-#else
-template <>
-struct FP2PRT<double>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<double, ContainerClass, double, 1>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<double, ContainerClass, double, 1>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<double, ContainerClass, double, 1>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-
-template <>
-struct FP2PRT<float>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<float, ContainerClass, float, 1>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<float, ContainerClass, float, 1>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<float, ContainerClass, float, 1>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-#endif




--- a/Src/ScalFmmConfig.h.cmake
+++ b/Src/ScalFmmConfig.h.cmake
@@ -28,7 +28,6 @@
 #cmakedefine SCALFMM_BLAS_ADD_
 #cmakedefine SCALFMM_BLAS_UPCASE
 #cmakedefine SCALFMM_BLAS_NOCHANGE
-
 ////////////////////////////////////////////////////////
 // FFT
 ///////////////////////////////////////////////////////
@@ -68,20 +67,6 @@
 #cmakedefine SCALFMM_USE_STARPU
 #cmakedefine SCALFMM_DISABLE_NATIVE_OMP4

-///////////////////////////////////////////////////////
-// SSE
-///////////////////////////////////////////////////////
-
-#cmakedefine SCALFMM_USE_SSE
-#cmakedefine __AVXPE_INTEL_COMPILER
-
-///////////////////////////////////////////////////////
-// AVX
-///////////////////////////////////////////////////////
-
-#cmakedefine SCALFMM_USE_AVX
-#cmakedefine __SSEPE_INTEL_COMPILER
-
 ///////////////////////////////////////////////////////
 // EZTRACE
 ///////////////////////////////////////////////////////