diff --git a/.gitmodules b/.gitmodules
index e1e8c690c3b08b1f2a8c52f94e9aae76828eaffb..2b1f354a63acc05660f82038e3c15c1b0170cc94 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "CMakeModules/morse_cmake"]
 	path = CMakeModules/morse_cmake
 	url = https://gitlab.inria.fr/solverstack/morse_cmake.git
+[submodule "inastemp"]
+	path = inastemp
+	url = https://gitlab.mpcdf.mpg.de/bbramas/inastemp.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f0a4397ee7f70be74868b0800e6fe406871318ca..df83536abe2f8487f9a229eb21365269e6c3e1a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,10 +22,6 @@ SET(SCALFMM_CMAKE_MODULE_PATH  ${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/)
 #
 # Adds the CMAKE_DEPENDENT_OPTION command
 INCLUDE(CMakeDependentOption)
-#  Add to check CPU info
-include(GetCpuInfos)
-GetCpuInfos()
-#
 #===========================================================================
 # Version Number
 #===========================================================================
@@ -89,18 +85,8 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_
   option( SCALFMM_USE_EZTRACE          "Set to ON to compile with eztrace framwork" OFF )
   option( SCALFMM_USE_STARPU           "Set to ON to build SCALFMM with StarPU"     OFF )
   option( SCALFMM_BUILD_UTILS          "Set to ON to build utils Tests"             OFF )
-  #
-  #  VECTORISATION
-  #
-  if( APPLE ) # to fix problem with  GCC and avx
-    CMAKE_DEPENDENT_OPTION( SCALFMM_USE_SSE              "Set to ON to compile with SSE support (and use intrinsec SSE P2P)" ON "CPUOPTION_SSE3;NOT CPUOPTION_AVX2" OFF  )
-    CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX              "Set to ON to compile with AVX support (and use intrinsec AVX P2P)" OFF "CPUOPTION_AVX; NOT CPUOPTION_AVX2" OFF  )
-  else(APPLE)
-    CMAKE_DEPENDENT_OPTION( SCALFMM_USE_SSE              "Set to ON to compile with SSE support (and use intrinsec SSE P2P)" ON "CPUOPTION_SSE3;NOT CPUOPTION_AVX;NOT CPUOPTION_AVX2" OFF  )
-    CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX              "Set to ON to compile with AVX support (and use intrinsec AVX P2P)" ON "CPUOPTION_AVX; NOT CPUOPTION_AVX2" OFF  )
-  endif(APPLE)
-  CMAKE_DEPENDENT_OPTION( SCALFMM_USE_AVX2             "Set to ON to compile with AVX support (and use intrinsec AVX2 P2P)" ON "CPUOPTION_AVX2" OFF )
-  
+
+
   if( SCALFMM_ONLY_DEVEL )
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
       option( SCALFMM_USE_OMP4 "Set to ON to disable the gcc/intel omp4"    OFF )
@@ -160,63 +146,21 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_
     set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -m64")
   endif()
   ##############################################################################
-  #                           Compile options                                  #
+  #                           Inastemp                                         #
   ##############################################################################
-  #  -xHost -mfpmath=sse
-  # -Wall Wnosign-conversion
-  #
 
-  # Set a fixed template depth
-  # Compilers don't use the same default for template-depth, we can enforce the same one everywhere.
-  # The magic number comes from GCC's default: https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html#C_002b_002b-Dialect-Options
-  set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -ftemplate-depth=900")
-  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-    # INTEL
-    IF (APPLE)
-      IF( CPUOPTION_SSE42 )
-        set(SSE_FLAGS  "-msse4  -mfpmath=sse")   # -mtune=native -march=native
-      ELSEIF (CPUOPTION_SSE3)
-        set(SSE_FLAGS  "-msse3  -mfpmath=sse")   # -mtune=native -march=native
-      ENDIF (CPUOPTION_SSE42)
-    else(APPLE)
-      set(AVX_FLAGS  "-fp-model source -march=native -axCORE-AVX2,CORE-AVX-I,AVX") #-mavx
-      set(AVX2_FLAGS "-march=native  -axCORE-AVX2,CORE-AVX-I,AVX") #-march=core-avx2
-      set(SSE_FLAGS  "-axSSE4.2  -march=native")
-    endif(APPLE)
-    set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -fma -align -finline-functions")
-    #-Wshadow -Wpointer-arith -Wcast-qual -Wconversion  -Wall -Wnosign-conversion ")
-  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL")
-    set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -mcpu=power8 -mtune=power8")
-  else() #if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    # NOT      INTEL
-    if(NOT SCALFMM_USE_MPI)
-      include(CheckCCompilerFlag)
-      check_c_compiler_flag(-Wzero-as-null-pointer-constant HAS_WZERO_NULL_PTR_FLAG)
-      if(HAS_WZERO_NULL_PTR_FLAG)
-        set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -Wzero-as-null-pointer-constant")
-      endif()
-    else()
-      include(CheckCCompilerFlag)
-      check_c_compiler_flag(-Wno-literal-suffix HAS_NO_LITERAL_SUFFIX_FLAG)
-      if(HAS_NO_LITERAL_SUFFIX_FLAG)
-        set(SCALFMM_CXX_FLAGS  "${SCALFMM_CXX_FLAGS} -Wno-literal-suffix")
-      endif()
-    endif()
-    IF (APPLE)
-      #      set(SSE_FLAGS  "-msse4  -mfpmath=sse")   # -mtune=native -march=native
-      IF( CPUOPTION_SSE42 )
-        set(SSE_FLAGS  "-msse4  -mfpmath=sse")   # -mtune=native -march=native
-      ELSEIF (CPUOPTION_SSE3)
-        set(SSE_FLAGS  "-msse3  -mfpmath=sse")   # -mtune=native -march=native
-      ENDIF (CPUOPTION_SSE42)
-      set(AVX_FLAGS "-mtune=native -march=avx")
-      set(AVX2_FLAGS "-mtune=native -march=native -mmic")
-    else(APPLE)
-      set(SSE_FLAGS  "-mtune=native -march=native")
-      set(AVX_FLAGS "-mtune=native -march=native")
-      set(AVX2_FLAGS "-mtune=native -march=native -mmic")
-    endif(APPLE)
-  endif()
+  set(INASTEMP_JUST_LIB TRUE)
+  # add the cmakelist directory
+  add_subdirectory(inastemp)
+  # use the filled variables from inastemp
+  INCLUDE_DIRECTORIES(
+           ${INASTEMP_BINARY_DIR}/Src    
+           ${INASTEMP_SOURCE_DIR}/Src 
+           ${INASTEMP_INCLUDE_DIR}
+           ${CMAKE_CURRENT_BINARY_DIR}/inastemp/Src
+      )
+  # propagate the flags to be able to compile 
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${INASTEMP_CXX_FLAGS}")
 
   ##############################################################################
   #                           FUSE list                                        #
@@ -637,129 +581,13 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse_
   list(APPEND FUSE_LIST "STARPU")
 
   ##################################################################
-  #                         Use SSE                                #
+  #                         FUSE                                   #
   ##################################################################
 
-  message( STATUS "SCALFMM_USE_SSE              = ${SCALFMM_USE_SSE}" )
-  if( SCALFMM_USE_SSE )
-    if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp)
-      message(FATAL_ERROR "The CompileTestSseFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp)" )
-    endif()
-    message( STATUS "SSE_FLAGS ${SSE_FLAGS}  -- ${CMAKE_CXX_FLAGS}  ")
-    try_compile(COMPILE_SSE  ${CMAKE_CURRENT_BINARY_DIR}
-      ${SCALFMM_CMAKE_MODULE_PATH}/compileTestSse.cpp
-      COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${SSE_FLAGS}"
-      OUTPUT_VARIABLE COMPILE_SSE_OUTPUT)
-
-    if(${COMPILE_SSE})
-      set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} ${SSE_FLAGS}")
-
-      try_compile(COMPILE_RESULT_VAR ${CMAKE_CURRENT_BINARY_DIR}
-        ${SCALFMM_CMAKE_MODULE_PATH}/checkSSEpe.cpp
-        COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${SSE_FLAGS}")
-      if( NOT ${COMPILE_RESULT_VAR})
-        set(__SSEPE_INTEL_COMPILER ON)
-      endif()
-      #set(SCALFMM_USE_AVX OFF)
-    else(${COMPILE_SSE})
-      message(FATAL_ERROR "SSE NOT SUPPORTED ; Set SCALFMM_USE_SSE  to OFF \n Output from test is : ${COMPILE_SSE_OUTPUT}")
-    endif(${COMPILE_SSE})
-  endif()
   list(APPEND FUSE_LIST "SSE")
-
-  ##################################################################
-  #                           Use AVX                              #
-  ##################################################################
-
-  message(STATUS "SCALFMM_USE_AVX               = ${SCALFMM_USE_AVX}")
-  if(SCALFMM_USE_AVX)
-    if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)
-
-      message(WARNING "SCALFMM_CMAKE_MODULE_PATH ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp" )
-      message(FATAL_ERROR "The CompileTestAvxFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)" )
-    endif()
-
-    try_compile(COMPILE_AVX ${CMAKE_CURRENT_BINARY_DIR}
-      ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp
-      COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}"
-      OUTPUT_VARIABLE COMPILE_AVX_OUTPUT)
-    if(${COMPILE_AVX})
-      message(STATUS "%%%%%%%%%%%% COMPILE_AVX               = ${COMPILE_AVX}  %%%%<    ${AVX_FLAGS}")
-
-      set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS}   ${AVX_FLAGS}")
-      message(STATUS "%%%%%%%%%%%% SCALFMM_CXX_FLAGS               = ${SCALFMM_CXX_FLAGS}")
-      #set( SCALFMM_USE_SSE   OFF   FORCE) # ne marche pas
-      try_compile(COMPILE_RESULT_AVSPE ${CMAKE_CURRENT_BINARY_DIR}
-        ${SCALFMM_CMAKE_MODULE_PATH}/checkAVXpe.cpp
-        COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}")
-      if( NOT ${COMPILE_RESULT_AVSPE})
-
-
-        set(__AVXPE_INTEL_COMPILER ON)
-      endif()
-
-      message(STATUS ${CMAKE_CXX_FLAGS} )
-    else(${COMPILE_AVX})
-      message(FATAL_ERROR "AVX NOT SUPPORTED ; Set SCALFMM_USE_AVX  to OFF \n Output from test is : ${COMPILE_AVX_OUTPUT} ")
-    endif(${COMPILE_AVX})
-  endif(SCALFMM_USE_AVX)
   list(APPEND FUSE_LIST "AVX")
-  #
-  # Error if both SCALFMM_USE_AVX AND SCALFMM_USE_SSE are set
-  #
-  if( SCALFMM_USE_AVX AND SCALFMM_USE_SSE)
-    message(FATAL_ERROR "Check SCALFMM_USE_SSE or SCALFMM_USE_AVX BUT NOT BOTH. ")
-  endif(SCALFMM_USE_AVX AND SCALFMM_USE_SSE)
-  ##################################################################
-  #                           Use AVX2                             #
-  ##################################################################
+  list(APPEND FUSE_LIST "MIC")
 
-  message(STATUS "SCALFMM_USE_AVX2               = ${SCALFMM_USE_AVX2}")
-  if(SCALFMM_USE_AVX2)
-    if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx2.cpp)
-      message(FATAL_ERROR "The CompileTestSseFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)" )
-    endif()
-
-    try_compile(COMPILE_AVX2 ${CMAKE_CURRENT_BINARY_DIR}
-      ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx2.cpp
-      COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX2_FLAGS}"
-      OUTPUT_VARIABLE COMPILE_AVX2_OUTPUT)
-    if(${COMPILE_AVX2})
-      set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS}   ${AVX2_FLAGS}")
-      #set( SCALFMM_USE_SSE   OFF   FORCE) # ne marche pas
-      try_compile(COMPILE_RESULT_AVSPE ${CMAKE_CURRENT_BINARY_DIR}
-        ${SCALFMM_CMAKE_MODULE_PATH}/checkAVX2pe.cpp
-        COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX2_FLAGS}")
-      if( NOT ${COMPILE_RESULT_AVSPE})
-        set(__AVX2PE_INTEL_COMPILER ON)
-      endif()
-
-      message(STATUS ${CMAKE_CXX_FLAGS} )
-    else(${COMPILE_AVX2})
-      message(FATAL_ERROR "AVX2 NOT SUPPORTED ; Set SCALFMM_USE_AVX2  to OFF \n Output from test is : ${COMPILE_AVX_OUTPUT} ")
-    endif(${COMPILE_AVX2})
-  endif(SCALFMM_USE_AVX2)
-  list(APPEND FUSE_LIST "AVX2")
-  #
-  # Error if both SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE are set
-  #
-  if( SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE)
-    message(FATAL_ERROR "Check SCALFMM_USE_SSE or SCALFMM_USE_AVX2 BUT NOT BOTH. ")
-  endif(SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE)
-  ##################################################################
-  #                     Use  native MIC compilation                #
-  ##################################################################
-#  If( SCALFMM_USE_MIC_NATIVE )
-#    include(CheckCCompilerFlag)
-#    check_c_compiler_flag(-mmic HAS_MMIC_FLAG)
-#    if(NOT HAS_MMIC_FLAG)
-#      message(FATAL_ERROR "MIC NOT SUPPORTED ; Set SCALFMM_USE_MIC_NATIVE to OFF")
-#    endif()
-#    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmic")
-#  else()
-#    #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xhost")
-#  endif()
-#  list(APPEND FUSE_LIST "MIC")
   ##################################################################
   #
   #   Set EZTRACE
diff --git a/CMakeModules/GetCpuInfos.cmake b/CMakeModules/GetCpuInfos.cmake
deleted file mode 100644
index e6fd0611e7fb064341681745a0ab64d9a6495702..0000000000000000000000000000000000000000
--- a/CMakeModules/GetCpuInfos.cmake
+++ /dev/null
@@ -1,52 +0,0 @@
-###########################################################################################
-# Berenger Bramas Inria
-# This goes with the getCpuInfos.cpp
-# This will create one CMAKE value per output option from the cpp file.
-# For example the output of the CPP file can be:
-# SSE3=TRUE;AVX=FALSE
-# Then it will create:
-# CPUOPTION_SSE3 = TRUE
-# CPUOPTION_AVX = FALSE
-#
-# The binary should return 0 on success.
-###########################################################################################
-macro(GetCpuInfos)
-# The original CPP file
-set(GetCpuInfosFile "${PROJECT_SOURCE_DIR}/CMakeModules/getCpuInfos.cpp")
-
-# Fatal error if the file does not exist
-if(NOT EXISTS ${GetCpuInfosFile})
-	message(FATAL_ERROR "The GetCpuInfosFile does not exist (${GetCpuInfosFile})")
-endif()
-
-# Compile and execute the file
-try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR
-          ${CMAKE_BINARY_DIR} ${GetCpuInfosFile}  # [CMAKE_FLAGS <Flags>] [COMPILE_DEFINITIONS <flags>]
-          COMPILE_OUTPUT_VARIABLE comp
-          RUN_OUTPUT_VARIABLE run)
-
-# If it has successfuly compiled an run
-if(COMPILE_RESULT_VAR AND (RUN_RESULT_VAR EQUAL 0) )
-	set( CPU_OPTIONS ${run} )
-	# For each value
-	foreach(optionNode ${run})
-		# Get name and value
-		string(REPLACE "=" ";" optionNameAndValue ${optionNode})
-		list(LENGTH optionNameAndValue optionLength)
-		# If we get both
-		if(optionLength EQUAL 2)
-			list(GET optionNameAndValue 0 optionName)
-			list(GET optionNameAndValue 1 optionValue)
-			# create cmake variable
-			set(CPUOPTION_${optionName} ${optionValue})
-		else()
-			message(WARNING "GetCpuInfosFile wrong format for ${optionNode}.")
-		endif()
-	endforeach()
-	# output the sentence from the binrary
-	message(STATUS "CPUOPTION : ${CPU_OPTIONS}")
-else()
-	message(WARNING "GetCpuInfosFile did not return correctly.")
-endif()
-
-endmacro(GetCpuInfos)
diff --git a/CMakeModules/checkAVX2pe.cpp b/CMakeModules/checkAVX2pe.cpp
deleted file mode 100644
index 8cf8b1c7d888b39e92d72f8aa3fea7d6ba77c366..0000000000000000000000000000000000000000
--- a/CMakeModules/checkAVX2pe.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-
-#include "immintrin.h"
-
-
-int main() {
-#ifdef __MIC__
-	__m512 tx, ty ;
-	tx += ty ;
-#endif
-  return 0;
-}
-
diff --git a/CMakeModules/checkAVXpe.cpp b/CMakeModules/checkAVXpe.cpp
deleted file mode 100644
index 8f4838ac740f8b747296379fefb763d71ace2b53..0000000000000000000000000000000000000000
--- a/CMakeModules/checkAVXpe.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-
-#include "immintrin.h"
-
-
-int main() {
-	__m256d tx, ty ;
-	tx += ty ;
-  return 0;
-}
diff --git a/CMakeModules/checkSSEpe.cpp b/CMakeModules/checkSSEpe.cpp
deleted file mode 100644
index fe26f30df6d45ce4baa67bf71fdcf89be797a43e..0000000000000000000000000000000000000000
--- a/CMakeModules/checkSSEpe.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <xmmintrin.h>  // SSE
-#include <emmintrin.h>  //SSE2
-#include <pmmintrin.h> //SSE3
-#ifdef __SSSE3__
-#include <tmmintrin.h>  //SSSE3
-#endif
-#ifdef __SSSE4_1__
-#include <smmintrin.h> // SSE4
-#endif
-
-int main() {
-	__m128d tx, ty ;
-	tx += ty ;
-  return 0;
-}
diff --git a/CMakeModules/compileTestAvx.cpp b/CMakeModules/compileTestAvx.cpp
deleted file mode 100644
index fc9a21b637ed21e64b8e9aa586304bfb5dd29aa1..0000000000000000000000000000000000000000
--- a/CMakeModules/compileTestAvx.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#include <x86intrin.h>
-#include <xmmintrin.h> // SSE
-#include <emmintrin.h> // SSE2
-#include <pmmintrin.h> // SSE3
-#include <tmmintrin.h> // SSSE3
-#include <smmintrin.h> // SSE4
-
-#include <immintrin.h> // AVX
-
-int main(){
-	{
-		__m256d res0d, res1d;
-		res0d = _mm256_hadd_pd(res0d, res1d);
-
-		__m256 res0, res1;
-		res0 = _mm256_hadd_ps(res0, res1);
-	}
-	{
-		__m128d res0d, res1d;
-		res0d = _mm_hadd_pd(res0d, res1d);
-
-		__m128 res0, res1;
-		res0 = _mm_hadd_ps(res0, res1);
-	}
-	return 0;
-}
diff --git a/CMakeModules/compileTestAvx2.cpp b/CMakeModules/compileTestAvx2.cpp
deleted file mode 100644
index cc5138c77dbba62a31952b1e7a8af5fc6bb07b3f..0000000000000000000000000000000000000000
--- a/CMakeModules/compileTestAvx2.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-
-#include <x86intrin.h>
-#include <xmmintrin.h> // SSE
-#include <emmintrin.h> // SSE2
-#include <pmmintrin.h> // SSE3
-#include <tmmintrin.h> // SSSE3
-#include <smmintrin.h> // SSE4
-
-#include <immintrin.h> // AVX
-
-int main(){
-	{
-	#ifdef __MIC__
-		__m512d res0d, res1d;
-		res0d = _mm512_hadd_pd(res0d, res1d);
-
-		__m512 res0, res1;
-		res0 = _mm512_hadd_ps(res0, res1);
-	#endif
-	}
-	{
-		__m256d res0d, res1d;
-		res0d = _mm256_hadd_pd(res0d, res1d);
-
-		__m256 res0, res1;
-		res0 = _mm256_hadd_ps(res0, res1);
-	}
-	{
-		__m128d res0d, res1d;
-		res0d = _mm_hadd_pd(res0d, res1d);
-
-		__m128 res0, res1;
-		res0 = _mm_hadd_ps(res0, res1);
-	}
-	return 0;
-}
-
diff --git a/CMakeModules/compileTestIntel.cpp b/CMakeModules/compileTestIntel.cpp
deleted file mode 100644
index 3db518c378cc0e36a067bea93d7cf0b45a5ff222..0000000000000000000000000000000000000000
--- a/CMakeModules/compileTestIntel.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-int main(){
-
-    int i ;
-#ifdef __INTEL_COMPILER
-
-     i = 0;
-
-#else
-
-#error 'Not Intel Compiler "
-
-#endif
-}
diff --git a/CMakeModules/compileTestSse.cpp b/CMakeModules/compileTestSse.cpp
deleted file mode 100644
index 0ff249bb5bd69662efb5f7956c2a9c980b54e8fc..0000000000000000000000000000000000000000
--- a/CMakeModules/compileTestSse.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-
-#include <x86intrin.h>
-#include <xmmintrin.h> // SSE
-#include <emmintrin.h> // SSE2
-#include <pmmintrin.h> // SSE3
-#ifdef __SSSE3__
-#include <tmmintrin.h>  //SSSE3
-#endif
-#ifdef __SSSE4_1__
-#include <smmintrin.h> // SSE4
-#endif
-int main(){
-	__m128d res0d, res1d;
-	res0d = _mm_hadd_pd(res0d, res1d);
-
-	__m128 res0, res1;
-	res0 = _mm_hadd_ps(res0, res1);
-
-	return 0;
-}
diff --git a/CMakeModules/getCpuInfos.cpp b/CMakeModules/getCpuInfos.cpp
deleted file mode 100644
index 4cbd3143cb373f01739af20ec4d7ea13c8389495..0000000000000000000000000000000000000000
--- a/CMakeModules/getCpuInfos.cpp
+++ /dev/null
@@ -1,338 +0,0 @@
-///////////////////////////////////////////////////////////////////////////
-// Berenger Bramas INRIA - 2014
-// Code provided under GNU Lesser General Public License
-//
-//
-// This file ask the cpuid to get access to CPU properties.
-// The file contains 3 mains parts:
-// × First part is a wrapper in case we are on Windows or Linux
-// × Second part is several call to the function and fill of an list
-// × Third is out of scope, it prints the state and the properties
-//   in a strict format in order to post process
-///////////////////////////////////////////////////////////////////////////
-
-
-
-///////////////////////////////////////////////////////////////////////////
-// Part 1:
-// Defines cpuid:
-// × A Wrapper if we are on windows
-// × A call to assembly else
-///////////////////////////////////////////////////////////////////////////
-
-enum RegistersNum {
-    EaxRegister = 0,
-    EbxRegister,
-    EcxRegister,
-    EdxRegister
-};
-
-#ifdef _WIN32
-
-// On windows __cpuid exists: http://msdn.microsoft.com/en-us/library/hskdteyh(v=vs.90).aspx
-// void __cpuid(int CPUInfo[4],int InfoType);
-// we would like to have the same name for not windows
-#define cpuid    __cpuid
-
-#elif  _ARCH_PPC
-#error("PPC") 
-#else
-
-// Else we have to ask the CPU directly by executin cpuid.
-// eax should contains the information querry argument.
-// Then we have to take the results from the different registers.
-//
-//    From : http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
-//
-//    asm ( assembler template
-//        : output operands                  // optional
-//        : input operands                   // optional
-//        : list of clobbered registers      // optional
-//        );
-//
-//    +---+--------------------+
-//    | r |    Register(s)     |
-//    +---+--------------------+
-//    | a |   %eax, %ax, %al   |
-//    | b |   %ebx, %bx, %bl   |
-//    | c |   %ecx, %cx, %cl   |
-//    | d |   %edx, %dx, %dl   |
-//    | S |   %esi, %si        |
-//    | D |   %edi, %di        |
-//    +---+--------------------+
-//
-
-
-//  GCC Inline Assembly but with the same prototype as windows
-void cpuid(int CPUInfo[4],int InfoType){
-    __asm__ __volatile__ (
-        "cpuid":            // Execute this instruction
-        "=a" (CPUInfo[EaxRegister]),  // Store eax in 0
-        "=b" (CPUInfo[EbxRegister]),  // Store ebx in 1
-        "=c" (CPUInfo[EcxRegister]),  // Store ecx in 2
-        "=d" (CPUInfo[EdxRegister]) : // Store edx in 3
-        "a" (InfoType)      // Input InfoType in eax before instruction
-    );
-}
-
-#endif
-
-#ifndef  _ARCH_PPC
-bool CPUInfoGetEAX(const int CPUInfo[4], const int position){
-    return (CPUInfo[EaxRegister] & ((int)1 << position)) != 0;
-}
-
-bool CPUInfoGetEBX(const int CPUInfo[4], const int position){
-    return (CPUInfo[EbxRegister] & ((int)1 << position)) != 0;
-}
-
-bool CPUInfoGetECX(const int CPUInfo[4], const int position){
-    return (CPUInfo[EcxRegister] & ((int)1 << position)) != 0;
-}
-
-bool CPUInfoGetEDX(const int CPUInfo[4], const int position){
-    return (CPUInfo[EdxRegister] & ((int)1 << position)) != 0;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Part 2:
-// Call the cpuid function and ask for particular information.
-// In our case we want to use these information to print it (and later use it
-// in a CMake file).
-// So you can change this file to get more informations and do something else with them.
-//
-//    From 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
-//    Or recommanded : AMD CPUID_Specification.pdf
-//    We know in part CPUID—CPU Identification that a call to the cpuid instruction fill the registers
-//    with the cpu property.
-///////////////////////////////////////////////////////////////////////////
-
-#include <string>
-#include <list>
-
-struct CpuProperty {
-    CpuProperty(const char inName[], const bool IsEnable)
-        : name(inName), enabled(IsEnable){
-    }
-
-    std::string name;
-    bool enabled;
-};
-
-std::list<CpuProperty> getProperties(){
-    std::list<CpuProperty> properties;
-
-    // To store the registers value
-	int info[4];
-
-    // Basic CPUID Information
-    cpuid(info, 0);
-    // The largest CPUID standard-function input value supported by the processor implementation.
-    const int limitStandardFunction = info[EaxRegister];
-
-    // Extended Function CPUID Information
-    cpuid(info, 0x80000000);
-    // The largest CPUID extended-function input value supported by the processor implementation
-    int limitExtendedFunction = info[EaxRegister];
-
-	//  Detect Instruction Set
-    if (limitStandardFunction >= 1){
-        cpuid(info,0x00000001); // Basic CPUID Information
-        /*
-        0x00000001 - EDX :
-            31:29 Reserved.
-            28 HTT: Hyper-Threading Technology. Indicates either that there is more than one thread per CPU core
-              or more than one CPU core per processor. AMD currently does not support more than one thread per
-             CPU core. See “Legacy Method” on page 23.
-            27 Reserved.
-            26 SSE2: SSE2 extensions. See Appendix D “CPUID Feature Sets” in APM3.
-            25 SSE: SSE extensions. See Appendix D “CPUID Feature Sets” in APM3 appendix and “64-Bit Media
-                    Programming” in APM1.
-            24 FXSR: FXSAVE and FXRSTOR instructions. See “FXSAVE” and “FXRSTOR” in APM4.
-            23 MMX: MMXTM instructions. See Appendix D “CPUID Feature Sets” in APM3 and “128-Bit Media
-                      and Scientific Programming” in APM1.
-            22:20 Reserved.
-            19 18 Reserved.
-            17 PSE36: Page-size extensions. The PDE[20:13] supplies physical address [39:32]. See “Page Translation and Protection” in APM2.
-            16 PAT: Page attribute table. PCD, PWT, and PATi are used to alter memory type. See “Page-Attribute Table Mechanism” in APM2.
-            15 CMOV: Conditional move instructions, CMOV, FCMOV. See “CMOV”, “FCMOV” in APM3.
-            14 MCA: Machine check architecture, MCG_CAP. See “Machine Check Mechanism” in APM2.
-            13 PGE: Page global extension, CR4.PGE. See “Page Translation and Protection” in APM2.
-            12 MTRR: Memory-type range registers. MTRRcap supported. See “Page Translation and Protection” in APM2.
-            11 SysEnterSysExit: SYSENTER and SYSEXIT instructions. See “SYSENTER”, “SYSEXIT“ in APM3.
-            10 Reserved
-            9 APIC. Advanced programmable interrupt controller (APIC) exists and is enabled. See “Exceptions
-               and Interrupts” in APM2.
-            8 CMPXCHG8B: CMPXCHG8B instruction. See “CMPXCHG8B” in APM3.
-            7 MCE: Machine check exception, CR4.MCE. See “Machine Check Mechanism” in APM2.
-            6 PAE: Physical-address extensions (PAE), support for physical addresses ≥ 32b. Number of physical
-               address bits above 32b is implementation specific. See “Page Translation and Protection” in APM2.
-            5 MSR: AMD model-specific registers (MSRs), with RDMSR and WRMSR instructions. See “Model
-               Specific Registers” in APM2.
-            4 TSC: Time stamp counter. RDTSC and RDTSCP instruction support. See “Debug and Performance
-               Resources” in APM2.
-            3 PSE: Page-size extensions (4 MB pages). See “Page Translation and Protection” in APM2.
-            2 DE: Debugging extensions, I/O breakpoints, CR4.DE. See “Debug and Performance Resources” in
-                 APM2.
-            1 VME: Virtual-mode enhancements, CR4.VME, CR4.PVI, software interrupt indirection, expansion
-             of the TSS with the software, indirection bitmap, EFLAGS.VIF, EFLAGS.VIP. See “System
-              Resources” in APM2.
-            0 FPU: x87 floating point unit on-chip. See “x87 Floating Point Programming” in APM1
-         */
-
-        properties.push_back(CpuProperty("MMX", CPUInfoGetEDX(info, 23)));
-        properties.push_back(CpuProperty("SSE", CPUInfoGetEDX(info, 25)));
-        properties.push_back(CpuProperty("SSE2", CPUInfoGetEDX(info, 26)));
-
-        /*
-         0x00000001 - ECX :
-            0 SSE3 Streaming SIMD Extensions 3 (SSE3). A value of 1 indicates the processor supports this
-            technology.
-            1 PCLMULQDQ PCLMULQDQ. A value of 1 indicates the processor supports the PCLMULQDQ instruction
-            2 DTES64 64-bit DS Area. A value of 1 indicates the processor supports DS area using 64-bit layout
-            3 MONITOR MONITOR/MWAIT. A value of 1 indicates the processor supports this feature.
-            4 DS-CPL CPL Qualified Debug Store. A value of 1 indicates the processor supports the extensions to the
-            Debug Store feature to allow for branch message storage qualified by CPL.
-            5 VMX Virtual Machine Extensions. A value of 1 indicates that the processor supports this technology
-            6 SMX Safer Mode Extensions. A value of 1 indicates that the processor supports this technology. See
-            Chapter 5, “Safer Mode Extensions Reference”.
-            7 EIST Enhanced Intel SpeedStep® technology. A value of 1 indicates that the processor supports this
-            technology.
-            8 TM2 Thermal Monitor 2. A value of 1 indicates whether the processor supports this technology.
-            9 SSSE3 A value of 1 indicates the presence of the Supplemental Streaming SIMD Extensions 3 (SSSE3). A
-            value of 0 indicates the instruction extensions are not present in the processor
-            10 CNXT-ID L1 Context ID. A value of 1 indicates the L1 data cache mode can be set to either adaptive mode
-            or shared mode. A value of 0 indicates this feature is not supported. See definition of the
-            IA32_MISC_ENABLE MSR Bit 24 (L1 Data Cache Context Mode) for details.
-            11 SDBG A value of 1 indicates the processor supports IA32_DEBUG_INTERFACE MSR for silicon debug.
-            12 FMA A value of 1 indicates the processor supports FMA extensions using YMM state.
-            13 CMPXCHG16B CMPXCHG16B Available. A value of 1 indicates that the feature is available. See the
-            “CMPXCHG8B/CMPXCHG16B—Compare and Exchange Bytes” section in this chapter for a
-            description.
-            14 xTPR Update
-            Control
-            xTPR Update Control. A value of 1 indicates that the processor supports changing
-            IA32_MISC_ENABLE[bit 23].
-            15 PDCM Perfmon and Debug Capability: A value of 1 indicates the processor supports the performance
-            and debug feature indication MSR IA32_PERF_CAPABILITIES.
-            16 Reserved Reserved
-            17 PCID Process-context identifiers. A value of 1 indicates that the processor supports PCIDs and that
-            software may set CR4.PCIDE to 1.
-            18 DCA A value of 1 indicates the processor supports the ability to prefetch data from a memory mapped
-            device.
-            19 SSE4.1 A value of 1 indicates that the processor supports SSE4.1.
-            20 SSE4.2 A value of 1 indicates that the processor supports SSE4.2.
-            21 x2APIC A value of 1 indicates that the processor supports x2APIC feature.
-            22 MOVBE A value of 1 indicates that the processor supports MOVBE instruction.
-            23 POPCNT A value of 1 indicates that the processor supports the POPCNT instruction.
-            24 TSC-Deadline A value of 1 indicates that the processor’s local APIC timer supports one-shot operation using a
-            TSC deadline value.
-            25 AESNI A value of 1 indicates that the processor supports the AESNI instruction extensions.
-            26 XSAVE A value of 1 indicates that the processor supports the XSAVE/XRSTOR processor extended states
-            feature, the XSETBV/XGETBV instructions, and XCR0.
-            27 OSXSAVE A value of 1 indicates that the OS has set CR4.OSXSAVE[bit 18] to enable the XSAVE feature set.
-            28 AVX A value of 1 indicates the processor supports the AVX instruction extensions.
-            29 F16C A value of 1 indicates that processor supports 16-bit floating-point conversion instructions.
-            30 RDRAND A value of 1 indicates that processor supports RDRAND instruction.
-            31 Not Used Always returns 0.
-          */
-        properties.push_back(CpuProperty("SSE3", CPUInfoGetECX(info,  0)));
-        properties.push_back(CpuProperty("SSSE3", CPUInfoGetECX(info,  9)));
-        properties.push_back(CpuProperty("SSE41", CPUInfoGetECX(info, 19)));
-        properties.push_back(CpuProperty("SSE42", CPUInfoGetECX(info, 20)));
-        properties.push_back(CpuProperty("AVX",  CPUInfoGetECX(info, 28)));
-        properties.push_back(CpuProperty("FMA3", CPUInfoGetECX(info, 12)));
-	}
-
-    if (limitExtendedFunction >= 0x80000001){
-        cpuid(info,0x80000001); // Extended Function CPUID Information
-        /*
-        0x80000001 - EDX :
-            31 3DNow: 3DNow!TM instructions. See Appendix D “Instruction Subsets and CPUID Feature Sets” in APM3.
-            30 3DNowExt: AMD extensions to 3DNow! instructions. See Appendix D “Instruction Subsets and
-                CPUID Feature Sets” in APM3.
-            29 LM: Long mode. See “Processor Initialization and Long-Mode Activation” in APM2.
-            28 Reserved.
-            27 RDTSCP: RDTSCP instruction. See “RDTSCP” in APM3.
-            26 Page1GB: 1-GB large page support. See “1-GB Paging Support” in APM2.
-            25 FFXSR: FXSAVE and FXRSTOR instruction optimizations. See “FXSAVE” and “FXRSTOR” in APM4.
-            24 FXSR: FXSAVE and FXRSTOR instructions. Same as CPUID Fn0000_0001_EDX[FXSR].
-            23 MMX: MMXTM instructions. Same as CPUID Fn0000_0001_EDX[MMX].
-            22 MmxExt: AMD extensions to MMX instructions. See Appendix D “Instruction Subsets and CPUID
-                Feature Sets” in APM3 and “128-Bit Media and Scientific Programming” in APM1.
-            21 Reserved.
-            20 NX: No-execute page protection. See “Page Translation and Protection” in APM2.
-            19:18 Reserved.
-            17 PSE36: Page-size extensions. Same as CPUID Fn0000_0001_EDX[PSE36].
-            16 PAT: Page attribute table. Same as CPUID Fn0000_0001_EDX[PAT].
-            15 CMOV: Conditional move instructions. Same as CPUID Fn0000_0001_EDX[CMOV]
-            14 MCA: Machine check architecture. Same as CPUID Fn0000_0001_EDX[MCA].
-            13 PGE: Page global extension. Same as CPUID Fn0000_0001_EDX[PGE].
-            12 MTRR: Memory-type range registers. Same as CPUID Fn0000_0001_EDX[MTRR].
-            11 SysCallSysRet: SYSCALL and SYSRET instructions. See “SYSCALL” and “SYSRET” in APM3.
-            10 Reserved.
-            9 APIC. Advanced programmable interrupt controller. Same as CPUID Fn0000_0001_EDX[APIC].
-            8 CMPXCHG8B: CMPXCHG8B instruction. Same as CPUID Fn0000_0001_EDX[CMPXCHG8B].
-            7 MCE: Machine check exception. Same as CPUID Fn0000_0001_EDX[MCE].
-            6 PAE: Physical-address extensions. Same as CPUID Fn0000_0001_EDX[PAE].
-            5 MSR: AMD model-specific registers. Same as CPUID Fn0000_0001_EDX[MSR].
-            4 TSC: Time stamp counter. Same as CPUID Fn0000_0001_EDX[TSC].
-            3 PSE: Page-size extensions. Same as CPUID Fn0000_0001_EDX[PSE].
-            2 DE: Debugging extensions. Same as CPUID Fn0000_0001_EDX[DE].
-            1 VME: Virtual-mode enhancements. Same as CPUID Fn0000_0001_EDX[VME].
-            0 FPU: x87 floating-point unit on-chip. Same as CPUID Fn0000_0001_EDX[FPU].
-          */
-        properties.push_back(CpuProperty("x64", CPUInfoGetEDX(info, 29)));
-        /*
-        0x80000001 - ECX :
-            31:14 Reserved.
-            13 WDT: Watchdog timer support.
-            12 SKINIT: SKINIT, STGI, and DEV support.
-            11:10 Reserved.
-            9 OSVW: OS visible workaround. Indicates OS-visible workaround support. See “OS Visible Work-
-             around (OSVW) Information” in APM2.
-            8 3DNowPrefetch: PREFETCH and PREFETCHW instruction support. See “PREFETCH” and “PREFETCHW” in APM3.
-            7 MisAlignSse: Misaligned SSE mode. See “Misaligned Access Support Added for SSE Instructions”
-                 in APM1.
-            6 SSE4A: EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support. See “EXTRQ”,
-                 “INSERTQ”, “MOVNTSS”, and “MOVNTSD” in APM4.
-            5 ABM: Advanced bit manipulation. LZCNT instruction support. See “LZCNT” in APM3.
-            4 AltMovCr8: LOCK MOV CR0 means MOV CR8. See “MOV(CRn)” in APM3.
-            3 ExtApicSpace: This bit indicates the presence of extended APIC register space starting at offset
-             400h from the “APIC Base Address Register,” as specified in the BKDG..
-            2 SVM: Secure virtual machine feature. See “Secure Virtual Machine” in APM2.
-            1 CmpLegacy: Core multi-processing legacy mode. See “Legacy Method” on page 23.
-            0 LahfSahf: LAHF and SAHF instruction support in 64-bit mode. See “LAHF” and “SAHF” in   APM3.
-          */
-        properties.push_back(CpuProperty("SSE4a", CPUInfoGetECX(info,  6)));
-        properties.push_back(CpuProperty("FMA4", CPUInfoGetECX(info, 16)));
-        properties.push_back(CpuProperty("XOP", CPUInfoGetECX(info, 11)));
-	}
-
-    return properties;
-}
-#else
-// POWER (IBM)
-#endif
-
-
-///////////////////////////////////////////////////////////////////////////
-// Part 3:
-// Print the information in a format to use it with CMake
-///////////////////////////////////////////////////////////////////////////
-
-#include <iostream>
-
-int main(){
-    const std::list<CpuProperty> properties = getProperties();
-
-    const std::list<CpuProperty>::const_iterator endIterProperties = properties.end();
-    for(std::list<CpuProperty>::const_iterator iterProperties = properties.begin()
-            ; iterProperties != endIterProperties
-            ; ++iterProperties){
-        // Print the status
-        std::cout << (*iterProperties).name << "=" << ((*iterProperties).enabled?"TRUE":"FALSE") << ";";
-    }
-
-    return 0;
-}
diff --git a/README.md b/README.md
index a8a8f525e55f6bbd260402481f6d898075142353..7c6c11fb63784ea5c3d2d2cd926d8fcec501806f 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ The following are optional:
 
 ### Get and Build ScalFMM
 To use last development states of ScalFMM, please clone the develop
-  branch. Note that ScalFMM contains a git submodule `morse_cmake`.
+  branch. Note that ScalFMM contains two git submodules `morse_cmake` and `inastemp`.
   To get sources please use these commands:
 ``` bash
 git clone --recursive git@gitlab.inria.fr:solverstack/ScalFMM.git -b develop
diff --git a/Src/Adaptive/FAdaptChebKernel.hpp b/Src/Adaptive/FAdaptChebKernel.hpp
index 1c3f753436b458b108f27503eb58221686b0b86b..2ed9b00e26e568b280f1e9a8d1e2b00b3fac9649 100644
--- a/Src/Adaptive/FAdaptChebKernel.hpp
+++ b/Src/Adaptive/FAdaptChebKernel.hpp
@@ -95,13 +95,13 @@ public:
         // apply P2L
         for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
             for (unsigned int m = 0; m<FBase::nnodes; ++m) {
-                ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX());
-                ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY());
-                ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ());
+                ComputeClass XX = ComputeClass(X[m].getX());
+                ComputeClass XY = ComputeClass(X[m].getY());
+                ComputeClass XZ = ComputeClass(X[m].getZ());
 
                 std::size_t idxPart = 0;
                 // Compute using vectorization for all but the last array elements
-                ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>();
+                ComputeClass tmpLocalExp = ComputeClass(0);
                 for (;
                      idxPart < ((particles->getNbParticles())
                                 / FRealCount);
@@ -114,7 +114,7 @@ public:
                         * physicalValues[idxPart];
                 }
 
-                local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp);
+                local->get(idxRhs)[m] += ComputeClass(tmpLocalExp);
 
                 // Compute the last array elements one by one if they exist
                 if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) {
@@ -172,11 +172,11 @@ public:
             for (unsigned int n=0; n<FBase::nnodes; ++n){
 
                 ComputeClass  MultipoleExpansion =
-                    FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]);
+                    ComputeClass(pole->get(idxRhs)[n]);
 
-                ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX());
-                ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY());
-                ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ());
+                ComputeClass YX = ComputeClass(Y[n].getX());
+                ComputeClass YY = ComputeClass(Y[n].getY());
+                ComputeClass YZ = ComputeClass(Y[n].getZ());
 
                 for(std::size_t idxPart = 0;
                     idxPart < ( (particles->getNbParticles() + FRealCount - 1)
diff --git a/Src/Adaptive/FAdaptUnifKernel.hpp b/Src/Adaptive/FAdaptUnifKernel.hpp
index 16f7254e1212c8bd49c91937bb26b1af3cf3ccfd..75063ba960d135598deba4b5faefd9f8999953e5 100644
--- a/Src/Adaptive/FAdaptUnifKernel.hpp
+++ b/Src/Adaptive/FAdaptUnifKernel.hpp
@@ -140,11 +140,11 @@ public:
         // apply P2L
         for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
             for (unsigned int m = 0; m < FBase::nnodes; ++m) {
-                ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX());
-                ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY());
-                ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ());
+                ComputeClass XX = ComputeClass(X[m].getX());
+                ComputeClass XY = ComputeClass(X[m].getY());
+                ComputeClass XZ = ComputeClass(X[m].getZ());
 
-                ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>();
+                ComputeClass tmpLocalExp = ComputeClass(0);
                 // Compute using vectorization for all but the last array elements
                 std::size_t idxPart = 0;
                 for (; idxPart < (particles->getNbParticles() / FRealCount);
@@ -157,7 +157,7 @@ public:
                         * physicalValues[idxPart];
                 }
 
-                local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp);
+                local->get(idxRhs)[m] += FReal(tmpLocalExp);
 
                 // Compute the last array elements one by one if they exist
                 if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) {
@@ -221,11 +221,11 @@ public:
             for (unsigned int n=0; n<FBase::nnodes; ++n){
 
                 ComputeClass  MultipoleExpansion =
-                    FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]);
+                    ComputeClass(pole->get(idxRhs)[n]);
 
-                ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX());
-                ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY());
-                ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ());
+                ComputeClass YX = ComputeClass(Y[n].getX());
+                ComputeClass YY = ComputeClass(Y[n].getY());
+                ComputeClass YZ = ComputeClass(Y[n].getZ());
 
                 for(std::size_t idxPart = 0;
                     idxPart < ( (particles->getNbParticles() + FRealCount - 1)
diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
index eee694ac1aadfcd7b912fd9cd6a7913952489d8a..3f0b1f9f298c6089c903a6e9dd224339071b3067 100644
--- a/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel.hpp
@@ -89,7 +89,7 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffx = (xt-xs);
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
-        return FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
+        return ValueClass(1) / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
     }
 
     // evaluate interaction (blockwise)
@@ -110,7 +110,7 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffx = (xt-xs);
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
-        const ValueClass one_over_r = FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
+        const ValueClass one_over_r = ValueClass(1) / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
 
         const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
 
@@ -176,9 +176,9 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
         const ValueClass diffx = (xt-xs);
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
-        return FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx +
-                                       FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy +
-                                       FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz);
+        return ValueClass(1) / FMath::Sqrt(ValueClass(LX)*diffx*diffx +
+                                       ValueClass(LY)*diffy*diffy +
+                                       ValueClass(LZ)*diffz*diffz);
     }
     void setCoeff(const FReal& a,  const FReal& b, const FReal& c)
     {LX= a*a ; LY = b*b ; LZ = c *c;}
@@ -208,16 +208,16 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
         const ValueClass diffx = (xt-xs);
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
-        const ValueClass one_over_rL = FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx +
-                                                          FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy +
-                                                          FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz);
+        const ValueClass one_over_rL = ValueClass(1) / (ValueClass(LX)*diffx*diffx +
+                                                          ValueClass(LY)*diffy*diffy +
+                                                          ValueClass(LZ)*diffz*diffz);
         const ValueClass one_over_rL3 = one_over_rL*one_over_rL*one_over_rL;
 
         block[0] = one_over_rL;
 
-        blockDerivative[0] = FMath::ConvertTo<ValueClass,FReal>(LX) * one_over_rL3 * diffx;
-        blockDerivative[1] = FMath::ConvertTo<ValueClass,FReal>(LY)* one_over_rL3 * diffy;
-        blockDerivative[2] = FMath::ConvertTo<ValueClass,FReal>(LZ)* one_over_rL3 * diffz;
+        blockDerivative[0] = ValueClass(LX) * one_over_rL3 * diffx;
+        blockDerivative[1] = ValueClass(LY)* one_over_rL3 * diffy;
+        blockDerivative[2] = ValueClass(LZ)* one_over_rL3 * diffz;
 
     }
 
@@ -283,7 +283,7 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffx = (xt-xs);
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
-        return FMath::One<ValueClass>() / FReal(diffx*diffx+diffy*diffy+diffz*diffz);
+        return ValueClass(1) / FReal(diffx*diffx+diffy*diffy+diffz*diffz);
     }
 
     // evaluate interaction (blockwise)
@@ -305,12 +305,12 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
         const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
-        const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2);
+        const ValueClass one_over_r2 = ValueClass(1) / (r2);
         const ValueClass one_over_r4 = one_over_r2*one_over_r2;
 
         block[0] = one_over_r2;
 
-        const ValueClass coef = FMath::ConvertTo<ValueClass,FReal>(-2.) * one_over_r4;
+        const ValueClass coef = ValueClass(-2.) * one_over_r4;
         blockDerivative[0] = coef * diffx;
         blockDerivative[1] = coef * diffy;
         blockDerivative[2] = coef * diffz;
@@ -382,7 +382,7 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffz = (zt-zs);
         const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz);
         const ValueClass r3 = r*r*r;
-        const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3);
+        const ValueClass one_over_r6 = ValueClass(1) / (r3*r3);
         //return one_over_r6 * one_over_r6;
         //return one_over_r6;
         return one_over_r6 * one_over_r6 - one_over_r6;
@@ -409,12 +409,12 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
         const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz);
         const ValueClass r2 = r*r;
         const ValueClass r3 = r2*r;
-        const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3);
+        const ValueClass one_over_r6 = ValueClass(1) / (r3*r3);
         const ValueClass one_over_r8 = one_over_r6 / (r2);
 
         block[0] = one_over_r6 * one_over_r6 - one_over_r6;
 
-        const FReal coef = FMath::ConvertTo<ValueClass,FReal>(12.0)*one_over_r6*one_over_r8 - FMath::ConvertTo<ValueClass,FReal>(6.0)*one_over_r8;
+        const FReal coef = ValueClass(12.0)*one_over_r6*one_over_r8 - ValueClass(6.0)*one_over_r8;
         blockDerivative[0]= coef * diffx;
         blockDerivative[1]= coef * diffy;
         blockDerivative[2]= coef * diffz;
@@ -493,7 +493,7 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
         const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
-        return FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth));
+        return ValueClass(1) / (r2 + ValueClass(CoreWidth));
     }
 
     // evaluate interaction (blockwise)
@@ -515,13 +515,13 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
         const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
-        const ValueClass one_over_a_plus_r2 = FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth));
+        const ValueClass one_over_a_plus_r2 = ValueClass(1) / (r2 + ValueClass(CoreWidth));
         const ValueClass one_over_a_plus_r2_squared = one_over_a_plus_r2*one_over_a_plus_r2;
 
         block[0] = one_over_a_plus_r2;
 
         // TODO Fix derivative
-        const ValueClass coef = FMath::ConvertTo<ValueClass,FReal>(-2.) * one_over_a_plus_r2_squared;
+        const ValueClass coef = ValueClass(-2.) * one_over_a_plus_r2_squared;
         blockDerivative[0] = coef * diffx;
         blockDerivative[1] = coef * diffy;
         blockDerivative[2] = coef * diffz;
diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp
index 97894794ca2900d89fcce38bd5865174614a2cd3..38f7104099a17615ddfbc0e035ad5ff85ab692e8 100644
--- a/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel_Covariance.hpp
@@ -120,13 +120,13 @@ struct FInterpMatrixKernelGauss : FAbstractCorrelationKernel<FReal>
   {
     const ValueClass diff[3] = {(x1-x2),(y1-y2),(z1-z2)};
 
-    ValueClass dist2 = FMath::Zero<ValueClass>();
+    ValueClass dist2 = ValueClass(0.);
     for(int d=0; d<3; ++d){
-      const ValueClass distX = diff[d] / FMath::ConvertTo<ValueClass,FReal>(lengthScale_);
+      const ValueClass distX = diff[d] / ValueClass(lengthScale_);
       dist2 += distX*distX;
     }
 
-    return FMath::Exp(FMath::ConvertTo<ValueClass,FReal>(-0.5)*dist2);
+    return FMath::Exp(ValueClass(-0.5)*dist2);
 
   }
 
@@ -145,7 +145,7 @@ struct FInterpMatrixKernelGauss : FAbstractCorrelationKernel<FReal>
                                   ValueClass block[1], ValueClass blockDerivative[3]) const
   {
     block[0]=this->evaluate(x1,y1,z1,x2,y2,z2);
-    const ValueClass lengthScaleOpt = FMath::ConvertTo<ValueClass,FReal>(-1/(lengthScale_*lengthScale_));
+    const ValueClass lengthScaleOpt = ValueClass(-1/(lengthScale_*lengthScale_));
     blockDerivative[0] = block[0]*(x1-x2) * lengthScaleOpt;
     blockDerivative[1] = block[0]*(y1-y2) * lengthScaleOpt;
     blockDerivative[2] = block[0]*(z1-z2) * lengthScaleOpt;
diff --git a/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp b/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp
index 7365e70805926ff5381751934beb4d77048430bd..3b86102f213b485bc75139d1f6249bde0ba94a33 100644
--- a/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp
+++ b/Src/Kernels/Interpolation/FInterpMatrixKernel_TensorialInteractions.hpp
@@ -161,7 +161,7 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
         const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz;
-        const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r = ValueClass(1)/FMath::Sqrt(r2 + ValueClass(_CoreWidth2));
         const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
         ValueClass ri,rj;
 
@@ -192,7 +192,7 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
         const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz;
-        const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r = ValueClass(1)/FMath::Sqrt(r2 + ValueClass(_CoreWidth2));
         const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
 
         const ValueClass r[3] = {diffx,diffy,diffz};
@@ -219,14 +219,14 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
         const ValueClass diffy = (yt-ys);
         const ValueClass diffz = (zt-zs);
         const ValueClass r2[3] = {diffx*diffx,diffy*diffy,diffz*diffz};
-        const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2[0] + r2[1] + r2[2] + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
+        const ValueClass one_over_r2 = ValueClass(1) / (r2[0] + r2[1] + r2[2] + ValueClass(_CoreWidth2));
         const ValueClass one_over_r  = FMath::Sqrt(one_over_r2);
         const ValueClass one_over_r3 = one_over_r2*one_over_r;
 
         const ValueClass r[3] = {diffx,diffy,diffz};
 
-        const ValueClass Three = FMath::ConvertTo<ValueClass,FReal>(3.);
-        const ValueClass MinusOne = - FMath::One<ValueClass>();
+        const ValueClass Three = ValueClass(3.);
+        const ValueClass MinusOne = - ValueClass(1);
 
         for(unsigned int d=0;d<NCMP;++d){
             unsigned int i = indexTab[d];
diff --git a/Src/Kernels/P2P/FP2P.hpp b/Src/Kernels/P2P/FP2P.hpp
index 93c7a8ab964780d601c4607b48379c9537672d1f..ffd98abe0e9114bb62f3927dec225d94f27438aa 100644
--- a/Src/Kernels/P2P/FP2P.hpp
+++ b/Src/Kernels/P2P/FP2P.hpp
@@ -131,14 +131,14 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets,
             ComputeClass*const sourcesPotentials = (ComputeClass*)inNeighbors[idxNeighbors]->getPotentials();
 
             for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-                const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-                const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-                const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-                const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-                ComputeClass  tfx = FMath::Zero<ComputeClass>();
-                ComputeClass  tfy = FMath::Zero<ComputeClass>();
-                ComputeClass  tfz = FMath::Zero<ComputeClass>();
-                ComputeClass  tpo = FMath::Zero<ComputeClass>();
+                const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+                const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+                const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+                const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+                ComputeClass  tfx = ComputeClass(0.);
+                ComputeClass  tfy = ComputeClass(0.);
+                ComputeClass  tfz = ComputeClass(0.);
+                ComputeClass  tpo = ComputeClass(0.);
 
                 for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                     ComputeClass Kxy[1];
@@ -146,7 +146,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets,
                     MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz,
                                                              sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
                                                              Kxy,dKxy);
-                    const ComputeClass mutual_coeff = FMath::ConvertTo<ComputeClass, FReal>(MatrixKernel->getMutualCoefficient());; // 1 if symmetric; -1 if antisymmetric
+                    const ComputeClass mutual_coeff = ComputeClass(MatrixKernel->getMutualCoefficient());; // 1 if symmetric; -1 if antisymmetric
 
                     const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]);
 
@@ -165,10 +165,10 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets,
                     sourcesPotentials[idxSource] += mutual_coeff * Kxy[0] * tv;
                 }
 
-                targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-                targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-                targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-                targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+                targetsForcesX[idxTarget] += tfx.horizontalSum();
+                targetsForcesY[idxTarget] += tfy.horizontalSum();
+                targetsForcesZ[idxTarget] += tfz.horizontalSum();
+                targetsPotentials[idxTarget] += tpo.horizontalSum();
             }
         }
     }
@@ -201,14 +201,14 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
         ComputeClass*const sourcesPotentials = (ComputeClass*)targetsPotentials;
 
         for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-            const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-            const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-            const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-            const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-            ComputeClass  tfx = FMath::Zero<ComputeClass>();
-            ComputeClass  tfy = FMath::Zero<ComputeClass>();
-            ComputeClass  tfz = FMath::Zero<ComputeClass>();
-            ComputeClass  tpo = FMath::Zero<ComputeClass>();
+            const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+            const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+            const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+            const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+            ComputeClass  tfx = ComputeClass(0.);
+            ComputeClass  tfy = ComputeClass(0.);
+            ComputeClass  tfz = ComputeClass(0.);
+            ComputeClass  tpo = ComputeClass(0.);
 
             for(FSize idxSource = (idxTarget+NbFRealInComputeClass)/NbFRealInComputeClass ; idxSource < nbParticlesSources ; ++idxSource){
                 ComputeClass Kxy[1];
@@ -216,7 +216,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
                 MatrixKernel->evaluateBlockAndDerivative(tx,ty,tz,
                                                          sourcesX[idxSource],sourcesY[idxSource],sourcesZ[idxSource],
                                                          Kxy,dKxy);
-                const ComputeClass mutual_coeff = FMath::ConvertTo<ComputeClass, FReal>(MatrixKernel->getMutualCoefficient()); // 1 if symmetric; -1 if antisymmetric
+                const ComputeClass mutual_coeff = ComputeClass(MatrixKernel->getMutualCoefficient()); // 1 if symmetric; -1 if antisymmetric
 
                 const ComputeClass coef = (tv * sourcesPhysicalValues[idxSource]);
 
@@ -227,18 +227,18 @@ static void GenericInner(ContainerClass* const FRestrict inTargets, const Matrix
                 tfx += dKxy[0];
                 tfy += dKxy[1];
                 tfz += dKxy[2];
-        		tpo = FMath::FMAdd(Kxy[0],sourcesPhysicalValues[idxSource],tpo);
+                tpo += Kxy[0]*sourcesPhysicalValues[idxSource];
 
                 sourcesForcesX[idxSource] -= dKxy[0];
                 sourcesForcesY[idxSource] -= dKxy[1];
                 sourcesForcesZ[idxSource] -= dKxy[2];
-        		sourcesPotentials[idxSource] = FMath::FMAdd(mutual_coeff * Kxy[0],tv,sourcesPotentials[idxSource]);
+                sourcesPotentials[idxSource] += (mutual_coeff * Kxy[0])*tv;
             }
 
-            targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-            targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-            targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-            targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+            targetsForcesX[idxTarget] += tfx.horizontalSum();
+            targetsForcesY[idxTarget] += tfy.horizontalSum();
+            targetsForcesZ[idxTarget] += tfz.horizontalSum();
+            targetsPotentials[idxTarget] += tpo.horizontalSum();
         }
     }
 
@@ -295,14 +295,14 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
             const ComputeClass*const sourcesZ = (const ComputeClass*)inNeighbors[idxNeighbors]->getPositions()[2];
 
             for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-                const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-                const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-                const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-                const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-                ComputeClass  tfx = FMath::Zero<ComputeClass>();
-                ComputeClass  tfy = FMath::Zero<ComputeClass>();
-                ComputeClass  tfz = FMath::Zero<ComputeClass>();
-                ComputeClass  tpo = FMath::Zero<ComputeClass>();
+                const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+                const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+                const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+                const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+                ComputeClass  tfx = ComputeClass(0.);
+                ComputeClass  tfy = ComputeClass(0.);
+                ComputeClass  tfz = ComputeClass(0.);
+                ComputeClass  tpo = ComputeClass(0.);
 
                 for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                     ComputeClass Kxy[1];
@@ -322,10 +322,10 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                     tpo += Kxy[0] * sourcesPhysicalValues[idxSource];
                 }
 
-                targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-                targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-                targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-                targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+                targetsForcesX[idxTarget] += tfx.horizontalSum();
+                targetsForcesY[idxTarget] += tfy.horizontalSum();
+                targetsForcesZ[idxTarget] += tfz.horizontalSum();
+                targetsPotentials[idxTarget] += tpo.horizontalSum();
             }
         }
     }
@@ -337,107 +337,26 @@ template <class FReal>
 struct FP2PT{
 };
 
-#if defined(SCALFMM_USE_AVX)
-template <>
-struct FP2PT<double>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-
-template <>
-struct FP2PT<float>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, MatrixKernel);
-    }
+#include "InastempCompileConfig.h"
 
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-#elif defined(SCALFMM_USE_AVX2)
 template <>
 struct FP2PT<double>{
     template <class ContainerClass, class MatrixKernelClass>
     static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
                            const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-
-template <>
-struct FP2PT<float>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
+        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
     }
 
 
     template <class ContainerClass, class MatrixKernelClass>
     static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, MatrixKernel);
+        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, MatrixKernel);
     }
 
     template <class ContainerClass, class MatrixKernelClass>
     static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
                            const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-#elif defined(SCALFMM_USE_SSE)
-template <>
-struct FP2PT<double>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
+        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
     }
 };
 
@@ -446,62 +365,21 @@ struct FP2PT<float>{
     template <class ContainerClass, class MatrixKernelClass>
     static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
                            const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-#else
-template <>
-struct FP2PT<double>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
+        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
     }
 
     template <class ContainerClass, class MatrixKernelClass>
     static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, MatrixKernel);
+        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, MatrixKernel);
     }
 
     template <class ContainerClass, class MatrixKernelClass>
     static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
                            const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, double, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
+        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
     }
 };
 
-template <>
-struct FP2PT<float>{
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void Inner(ContainerClass* const FRestrict inTargets, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericInner<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, MatrixKernel);
-    }
-
-    template <class ContainerClass, class MatrixKernelClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
-        FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, float, 1>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
-    }
-};
-#endif
 
 #include "FP2PTensorialKij.hpp"
 
diff --git a/Src/Kernels/P2P/FP2PR.hpp b/Src/Kernels/P2P/FP2PR.hpp
index f9e83c4f1bf0f99e4f4a0969423430016ca0fc70..dbdc49fc70a94ef1ae52a17d5c4d85b28b7a0674 100644
--- a/Src/Kernels/P2P/FP2PR.hpp
+++ b/Src/Kernels/P2P/FP2PR.hpp
@@ -81,7 +81,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
     FReal*const targetsForcesZ = inTargets->getForcesZ();
     FReal*const targetsPotentials = inTargets->getPotentials();
 
-    const ComputeClass mOne = FMath::One<ComputeClass>();
+    const ComputeClass mOne = ComputeClass(1);
 
     for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
         if( inNeighbors[idxNeighbors] ){
@@ -96,14 +96,14 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
             ComputeClass*const sourcesPotentials = (ComputeClass*)inNeighbors[idxNeighbors]->getPotentials();
 
             for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-                const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-                const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-                const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-                const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-                ComputeClass  tfx = FMath::Zero<ComputeClass>();
-                ComputeClass  tfy = FMath::Zero<ComputeClass>();
-                ComputeClass  tfz = FMath::Zero<ComputeClass>();
-                ComputeClass  tpo = FMath::Zero<ComputeClass>();
+                const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+                const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+                const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+                const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+                ComputeClass  tfx = ComputeClass(0.);
+                ComputeClass  tfy = ComputeClass(0.);
+                ComputeClass  tfz = ComputeClass(0.);
+                ComputeClass  tpo = ComputeClass(0.);
 
                 for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                     ComputeClass dx = tx - sourcesX[idxSource];
@@ -111,7 +111,7 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
                     ComputeClass dz = tz - sourcesZ[idxSource];
 
                     ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
-                    const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
+                    const ComputeClass inv_distance = inv_square_distance.sqrt();
 
                     inv_square_distance *= inv_distance;
                     inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
@@ -131,10 +131,10 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
                     sourcesPotentials[idxSource] += inv_distance * tv;
                 }
 
-                targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-                targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-                targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-                targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+                targetsForcesX[idxTarget] += tfx.horizontalSum();
+                targetsForcesY[idxTarget] += tfy.horizontalSum();
+                targetsForcesZ[idxTarget] += tfz.horizontalSum();
+                targetsPotentials[idxTarget] += tpo.horizontalSum();
             }
         }
     }
@@ -153,7 +153,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
     FReal*const targetsForcesZ = inTargets->getForcesZ();
     FReal*const targetsPotentials = inTargets->getPotentials();
 
-    const ComputeClass mOne = FMath::One<ComputeClass>();
+    const ComputeClass mOne = ComputeClass(1);
 
     {//In this part, we compute (vectorially) the interaction
         //within the target leaf.
@@ -169,14 +169,14 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
         ComputeClass*const sourcesPotentials = (ComputeClass*)targetsPotentials;
 
         for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-            const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-            const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-            const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-            const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-            ComputeClass  tfx = FMath::Zero<ComputeClass>();
-            ComputeClass  tfy = FMath::Zero<ComputeClass>();
-            ComputeClass  tfz = FMath::Zero<ComputeClass>();
-            ComputeClass  tpo = FMath::Zero<ComputeClass>();
+            const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+            const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+            const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+            const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+            ComputeClass  tfx = ComputeClass(0.);
+            ComputeClass  tfy = ComputeClass(0.);
+            ComputeClass  tfz = ComputeClass(0.);
+            ComputeClass  tpo = ComputeClass(0.);
 
             for(FSize idxSource = (idxTarget+NbFRealInComputeClass)/NbFRealInComputeClass ; idxSource < nbParticlesSources ; ++idxSource){
 
@@ -184,7 +184,7 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
                 ComputeClass dy = ty - sourcesY[idxSource];
                 ComputeClass dz = tz - sourcesZ[idxSource];
                 ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
-                const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
+                const ComputeClass inv_distance = inv_square_distance.sqrt();
 
                 inv_square_distance *= inv_distance;
                 inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
@@ -204,10 +204,10 @@ static void GenericInner(ContainerClass* const FRestrict inTargets){
                 sourcesPotentials[idxSource] += inv_distance * tv;
             }
 
-            targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-            targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-            targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-            targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+            targetsForcesX[idxTarget] += tfx.horizontalSum();
+            targetsForcesY[idxTarget] += tfy.horizontalSum();
+            targetsForcesZ[idxTarget] += tfz.horizontalSum();
+            targetsPotentials[idxTarget] += tpo.horizontalSum();
         }
     }
 
@@ -255,7 +255,7 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
     FReal*const targetsForcesZ = inTargets->getForcesZ();
     FReal*const targetsPotentials = inTargets->getPotentials();
 
-    const ComputeClass mOne = FMath::One<ComputeClass>();
+    const ComputeClass mOne = ComputeClass(1);
 
     for(FSize idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
         if( inNeighbors[idxNeighbors] ){
@@ -266,14 +266,14 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
             const ComputeClass*const sourcesZ = (const ComputeClass*)inNeighbors[idxNeighbors]->getPositions()[2];
 
             for(FSize idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
-                const ComputeClass tx = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsX[idxTarget]);
-                const ComputeClass ty = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsY[idxTarget]);
-                const ComputeClass tz = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsZ[idxTarget]);
-                const ComputeClass tv = FMath::ConvertTo<ComputeClass, const FReal*>(&targetsPhysicalValues[idxTarget]);
-                ComputeClass  tfx = FMath::Zero<ComputeClass>();
-                ComputeClass  tfy = FMath::Zero<ComputeClass>();
-                ComputeClass  tfz = FMath::Zero<ComputeClass>();
-                ComputeClass  tpo = FMath::Zero<ComputeClass>();
+                const ComputeClass tx = ComputeClass(&targetsX[idxTarget]);
+                const ComputeClass ty = ComputeClass(&targetsY[idxTarget]);
+                const ComputeClass tz = ComputeClass(&targetsZ[idxTarget]);
+                const ComputeClass tv = ComputeClass(&targetsPhysicalValues[idxTarget]);
+                ComputeClass  tfx = ComputeClass(0.);
+                ComputeClass  tfy = ComputeClass(0.);
+                ComputeClass  tfz = ComputeClass(0.);
+                ComputeClass  tpo = ComputeClass(0.);
 
                 for(FSize idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
                     ComputeClass dx = tx - sourcesX[idxSource];
@@ -281,7 +281,7 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                     ComputeClass dz = tz - sourcesZ[idxSource];
 
                     ComputeClass inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
-                    const ComputeClass inv_distance = FMath::Sqrt(inv_square_distance);
+                    const ComputeClass inv_distance = inv_square_distance.sqrt();
 
                     inv_square_distance *= inv_distance;
                     inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
@@ -296,10 +296,10 @@ static void GenericFullRemote(ContainerClass* const FRestrict inTargets, const C
                     tpo += inv_distance * sourcesPhysicalValues[idxSource];
                 }
 
-                targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
-                targetsForcesY[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfy);
-                targetsForcesZ[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfz);
-                targetsPotentials[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tpo);
+                targetsForcesX[idxTarget] += tfx.horizontalSum();
+                targetsForcesY[idxTarget] += tfy.horizontalSum();
+                targetsForcesZ[idxTarget] += tfz.horizontalSum();
+                targetsPotentials[idxTarget] += tpo.horizontalSum();
             }
         }
     }
@@ -311,25 +311,25 @@ template <class FReal>
 struct FP2PRT{
 };
 
-#if defined(SCALFMM_USE_AVX)
+#include "InastempCompileConfig.h"
 
 template <>
 struct FP2PRT<double>{
     template <class ContainerClass>
     static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
                            const int limiteNeighbors){
-        FP2PR::GenericFullMutual<double, ContainerClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors);
+        FP2PR::GenericFullMutual<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors);
     }
 
     template <class ContainerClass>
     static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<double, ContainerClass, __m256d, 4>(inTargets);
+        FP2PR::GenericInner<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets);
     }
 
     template <class ContainerClass>
     static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
                const int limiteNeighbors){
-        FP2PR::GenericFullRemote<double, ContainerClass, __m256d, 4>(inTargets, inNeighbors, limiteNeighbors);
+        FP2PR::GenericFullRemote<double, ContainerClass, InaVecBestTypeDouble, InaVecBestTypeDouble::VecLength>(inTargets, inNeighbors, limiteNeighbors);
     }
 };
 
@@ -338,143 +338,21 @@ struct FP2PRT<float>{
     template <class ContainerClass>
     static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
                            const int limiteNeighbors){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors);
+        FP2PR::GenericFullMutual<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors);
     }
 
     template <class ContainerClass>
     static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m256, 8>(inTargets);
+        FP2PR::GenericFullMutual<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets);
     }
 
     template <class ContainerClass>
     static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
                const int limiteNeighbors){
-        FP2PR::GenericFullRemote<float, ContainerClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-#elif defined(SCALFMM_USE_AVX2)
-template <>
-struct FP2PRT<double>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<double, ContainerClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<double, ContainerClass, __m512d, 8>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<double, ContainerClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-
-template <>
-struct FP2PRT<float>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m512, 16>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<float, ContainerClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-
-#elif defined(SCALFMM_USE_SSE)
-template <>
-struct FP2PRT<double>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<double, ContainerClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<double, ContainerClass, __m128d, 2>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<double, ContainerClass, __m128d, 2>(inTargets, inNeighbors, limiteNeighbors);
+        FP2PR::GenericFullRemote<float, ContainerClass, InaVecBestTypeFloat, InaVecBestTypeFloat::VecLength>(inTargets, inNeighbors, limiteNeighbors);
     }
 };
 
-template <>
-struct FP2PRT<float>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<float, ContainerClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<float, ContainerClass, __m128, 4>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<float, ContainerClass, __m128, 4>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-
-#else
-template <>
-struct FP2PRT<double>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<double, ContainerClass, double, 1>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<double, ContainerClass, double, 1>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<double, ContainerClass, double, 1>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-
-template <>
-struct FP2PRT<float>{
-    template <class ContainerClass>
-    static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
-                           const int limiteNeighbors){
-        FP2PR::GenericFullMutual<float, ContainerClass, float, 1>(inTargets, inNeighbors, limiteNeighbors);
-    }
-
-    template <class ContainerClass>
-    static void Inner(ContainerClass* const FRestrict inTargets){
-        FP2PR::GenericInner<float, ContainerClass, float, 1>(inTargets);
-    }
-
-    template <class ContainerClass>
-    static void FullRemote(ContainerClass* const FRestrict inTargets, const ContainerClass* const inNeighbors[],
-               const int limiteNeighbors){
-        FP2PR::GenericFullRemote<float, ContainerClass, float, 1>(inTargets, inNeighbors, limiteNeighbors);
-    }
-};
-#endif
 
 
 
diff --git a/Src/ScalFmmConfig.h.cmake b/Src/ScalFmmConfig.h.cmake
index ef3f6204e4c9c27c89cb394deb1e33d2b53810b3..71198ba5e448488e220e0a710d1a7dd97c013293 100644
--- a/Src/ScalFmmConfig.h.cmake
+++ b/Src/ScalFmmConfig.h.cmake
@@ -28,7 +28,6 @@
 #cmakedefine SCALFMM_BLAS_ADD_
 #cmakedefine SCALFMM_BLAS_UPCASE
 #cmakedefine SCALFMM_BLAS_NOCHANGE
-
 ////////////////////////////////////////////////////////
 // FFT
 ///////////////////////////////////////////////////////
@@ -68,20 +67,6 @@
 #cmakedefine SCALFMM_USE_STARPU
 #cmakedefine SCALFMM_DISABLE_NATIVE_OMP4
 
-///////////////////////////////////////////////////////
-// SSE
-///////////////////////////////////////////////////////
-
-#cmakedefine SCALFMM_USE_SSE
-#cmakedefine __AVXPE_INTEL_COMPILER
-
-///////////////////////////////////////////////////////
-// AVX
-///////////////////////////////////////////////////////
-
-#cmakedefine SCALFMM_USE_AVX
-#cmakedefine __SSEPE_INTEL_COMPILER
-
 ///////////////////////////////////////////////////////
 // EZTRACE
 ///////////////////////////////////////////////////////
diff --git a/Src/Utils/FAvx.hpp b/Src/Utils/FAvx.hpp
deleted file mode 100644
index 717ad2f8fadca99425aa419122a21bb98ff199ac..0000000000000000000000000000000000000000
--- a/Src/Utils/FAvx.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-// See LICENCE file at project root
-#ifndef FAVX_HPP
-#define FAVX_HPP
-
-#include "FGlobal.hpp"
-#ifndef SCALFMM_USE_AVX
-#error The AVX header is included while SCALFMM_USE_AVX is turned OFF
-#else 
-
-
-#include <immintrin.h>
-
-#ifdef __AVXPE_INTEL_COMPILER
-
-//Side effect operators DOUBLE
-inline __m256d& operator+=(__m256d & a, const __m256d & b){
-  return (a = _mm256_add_pd (a,b));
-}
-
-inline __m256d& operator-=(__m256d& a, const __m256d& b){
-  return (a = _mm256_sub_pd (a,b));
-}
-
-inline __m256d& operator*=(__m256d& a, const __m256d& b){
-  return (a = _mm256_mul_pd (a,b));
-}
-
-inline __m256d& operator/=(__m256d& a, const __m256d& b){
-  return (a = _mm256_div_pd (a,b));
-}
-
-//No side effect operators DOUBLE
-inline __m256d operator+(const __m256d& a,const  __m256d& b){
-  return _mm256_add_pd (a,b);
-}
-
-inline __m256d operator-(const __m256d& a, const __m256d& b){
-  return _mm256_sub_pd (a,b);
-}
-
-inline __m256d operator*(const __m256d& v1, const __m256d& v2){
-    return _mm256_mul_pd(v1, v2);
-}
-
-inline __m256d operator/(const __m256d& v1, const __m256d& v2){
-    return _mm256_div_pd(v1, v2);
-}
-
-//Side effect operators SINGLE
-inline __m256& operator+=(__m256 & a, const __m256 & b){
-  return (a = _mm256_add_ps (a,b));
-}
-
-inline __m256& operator-=(__m256& a, const __m256& b){
-  return (a = _mm256_sub_ps (a,b));
-}
-
-inline __m256& operator*=(__m256& a, const __m256& b){
-  return (a = _mm256_mul_ps (a,b));
-}
-
-inline __m256& operator/=(__m256& a, const __m256& b){
-  return (a = _mm256_div_ps (a,b));
-}
-
-//No side effect operators SINGLE
-inline __m256 operator+(const __m256& a,const  __m256& b){
-  return _mm256_add_ps (a,b);
-}
-
-inline __m256 operator-(const __m256& a, const __m256& b){
-  return _mm256_sub_ps (a,b);
-}
-
-inline __m256 operator*(const __m256& v1, const __m256& v2){
-    return _mm256_mul_ps(v1, v2);
-}
-
-inline __m256 operator/(const __m256& v1, const __m256& v2){
-    return _mm256_div_ps(v1, v2);
-}
-
-#endif
-#endif
-#endif
diff --git a/Src/Utils/FAvx2.hpp b/Src/Utils/FAvx2.hpp
deleted file mode 100644
index 08226a205de9a9e9aae6e2c5725e7457c4aebb12..0000000000000000000000000000000000000000
--- a/Src/Utils/FAvx2.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-// See LICENCE file at project root
-#ifndef FAVX2_HPP
-#define FAVX2_HPP
-
-#include "FGlobal.hpp"
-#ifndef SCALFMM_USE_AVX2
-#error The AVX header is included while SCALFMM_USE_AVX is turned OFF
-#endif
-
-#include <immintrin.h>
-
-#ifdef __MIC__
-
-//Side effect operators DOUBLE
-inline __m512d& operator+=(__m512d & a, const __m512d & b){
-  return (a = _mm512_add_pd (a,b));
-}
-
-inline __m512d& operator-=(__m512d& a, const __m512d& b){
-  return (a = _mm512_sub_pd (a,b));
-}
-
-inline __m512d& operator*=(__m512d& a, const __m512d& b){
-  return (a = _mm512_mul_pd (a,b));
-}
-
-inline __m512d& operator/=(__m512d& a, const __m512d& b){
-  return (a = _mm512_div_pd (a,b));
-}
-
-//No side effect operators DOUBLE
-inline __m512d operator+(const __m512d& a,const  __m512d& b){
-  return _mm512_add_pd (a,b);
-}
-
-inline __m512d operator-(const __m512d& a, const __m512d& b){
-  return _mm512_sub_pd (a,b);
-}
-
-inline __m512d operator*(const __m512d& v1, const __m512d& v2){
-    return _mm512_mul_pd(v1, v2);
-}
-
-inline __m512d operator/(const __m512d& v1, const __m512d& v2){
-    return _mm512_div_pd(v1, v2);
-}
-
-//Side effect operators SINGLE
-inline __m512& operator+=(__m512 & a, const __m512 & b){
-  return (a = _mm512_add_ps (a,b));
-}
-
-inline __m512& operator-=(__m512& a, const __m512& b){
-  return (a = _mm512_sub_ps (a,b));
-}
-
-inline __m512& operator*=(__m512& a, const __m512& b){
-  return (a = _mm512_mul_ps (a,b));
-}
-
-inline __m512& operator/=(__m512& a, const __m512& b){
-  return (a = _mm512_div_ps (a,b));
-}
-
-//No side effect operators SINGLE
-inline __m512 operator+(const __m512& a,const  __m512& b){
-  return _mm512_add_ps (a,b);
-}
-
-inline __m512 operator-(const __m512& a, const __m512& b){
-  return _mm512_sub_ps (a,b);
-}
-
-inline __m512 operator*(const __m512& v1, const __m512& v2){
-    return _mm512_mul_ps(v1, v2);
-}
-
-inline __m512 operator/(const __m512& v1, const __m512& v2){
-    return _mm512_div_ps(v1, v2);
-}
-
-#endif
-
-#endif
-
diff --git a/Src/Utils/FMath.hpp b/Src/Utils/FMath.hpp
index d753ebcfcb3ea2664da3e0dd0add5dc03d5bacea..c2c8a0dcce37161f67f2f6b02515895035d5eb0c 100644
--- a/Src/Utils/FMath.hpp
+++ b/Src/Utils/FMath.hpp
@@ -8,14 +8,6 @@
 
 #include "FGlobal.hpp"
 
-#ifdef SCALFMM_USE_SSE
-#include "FSse.hpp"
-#endif
-
-#ifdef SCALFMM_USE_AVX
-#include "FAvx.hpp"
-#endif
-
 
 /**
  * @author Berenger Bramas (berenger.bramas@inria.fr)
@@ -40,36 +32,6 @@ struct FMath{
         return (inV < 0 ? -inV : inV);
     }
 
-#ifdef SCALFMM_USE_SSE
-    static __m128 Abs(const __m128 inV){
-        return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), inV), inV);
-    }
-
-    static __m128d Abs(const __m128d inV){
-        return _mm_max_pd(_mm_sub_pd(_mm_setzero_pd(), inV), inV);
-    }
-#endif
-#ifdef SCALFMM_USE_AVX
-    static __m256 Abs(const __m256 inV){
-        return _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), inV), inV);
-    }
-
-    static __m256d Abs(const __m256d inV){
-        return _mm256_max_pd(_mm256_sub_pd(_mm256_setzero_pd(), inV), inV);
-    }
-#endif
-#ifdef SCALFMM_USE_AVX2
-#ifdef __MIC__
-    static __m512 Abs(const __m512 inV){
-        return _mm512_max_ps(_mm512_sub_ps(_mm512_setzero_ps(), inV), inV);
-    }
-
-    static __m512d Abs(const __m512d inV){
-        return _mm512_max_pd(_mm512_sub_pd(_mm512_setzero_pd(), inV), inV);
-    }
-#endif
-#endif
-
     /** To get max between 2 values */
     template <class NumType>
     static NumType Max(const NumType inV1, const NumType inV2){
@@ -82,53 +44,6 @@ struct FMath{
         return (inV1 < inV2 ? inV1 : inV2);
     }
 
-#ifdef SCALFMM_USE_SSE
-    static __m128 Max(const __m128 inV1, const __m128 inV2){
-        return _mm_max_ps(inV1, inV2);
-    }
-    static __m128 Min(const __m128 inV1, const __m128 inV2){
-        return _mm_min_ps(inV1, inV2);
-    }
-
-    static __m128d Max(const __m128d inV1, const __m128d inV2){
-        return _mm_max_pd(inV1, inV2);
-    }
-    static __m128d Min(const __m128d inV1, const __m128d inV2){
-        return _mm_min_pd(inV1, inV2);
-    }
-#endif
-#ifdef SCALFMM_USE_AVX
-    static __m256 Max(const __m256 inV1, const __m256 inV2){
-        return _mm256_max_ps(inV1, inV2);
-    }
-    static __m256 Min(const __m256 inV1, const __m256 inV2){
-        return _mm256_min_ps(inV1, inV2);
-    }
-
-    static __m256d Max(const __m256d inV1, const __m256d inV2){
-        return _mm256_max_pd(inV1, inV2);
-    }
-    static __m256d Min(const __m256d inV1, const __m256d inV2){
-        return _mm256_min_pd(inV1, inV2);
-    }
-#endif
-#ifdef SCALFMM_USE_AVX2
-#ifdef __MIC__
-    static __m512 Max(const __m512 inV1, const __m512 inV2){
-        return _mm512_max_ps(inV1, inV2);
-    }
-    static __m512 Min(const __m512 inV1, const __m512 inV2){
-        return _mm512_min_ps(inV1, inV2);
-    }
-
-    static __m512d Max(const __m512d inV1, const __m512d inV2){
-        return _mm512_max_pd(inV1, inV2);
-    }
-    static __m512d Min(const __m512d inV1, const __m512d inV2){
-        return _mm512_min_pd(inV1, inV2);
-    }
-#endif
-#endif
     /** To know if 2 values seems to be equal */
     template <class NumType>
     static bool LookEqual(const NumType inV1, const NumType inV2){
@@ -160,59 +75,6 @@ struct FMath{
         return ceil(inValue);
     }
 
-#if  defined(SCALFMM_USE_SSE ) && defined(__SSSE4_1__)
-    static __m128 dfloor(const __m128 inV){
-        return _mm_floor_ps(inV);
-    }
-
-    static __m128d dfloor(const __m128d inV){
-        return _mm_floor_pd(inV);
-    }
-
-    static __m128 Ceil(const __m128 inV){
-        return _mm_ceil_ps(inV);
-    }
-
-    static __m128d Ceil(const __m128d inV){
-        return _mm_ceil_pd(inV);
-    }
-#endif
-#ifdef SCALFMM_USE_AVX
-    static __m256 dfloor(const __m256 inV){
-        return _mm256_floor_ps(inV);
-    }
-
-    static __m256d dfloor(const __m256d inV){
-        return _mm256_floor_pd(inV);
-    }
-
-    static __m256 Ceil(const __m256 inV){
-        return _mm256_ceil_ps(inV);
-    }
-
-    static __m256d Ceil(const __m256d inV){
-        return _mm256_ceil_pd(inV);
-    }
-#endif
-#ifdef SCALFMM_USE_AVX2
-#ifdef __MIC__
-    static __m512 dfloor(const __m512 inV){
-        return _mm512_floor_ps(inV);
-    }
-
-    static __m512d dfloor(const __m512d inV){
-        return _mm512_floor_pd(inV);
-    }
-
-    static __m512 Ceil(const __m512 inV){
-        return _mm512_ceil_ps(inV);
-    }
-
-    static __m512d Ceil(const __m512d inV){
-        return _mm512_ceil_pd(inV);
-    }
-#endif
-#endif
     /** To get pow */
     static double pow(double x, double y){
         return ::pow(x,y);
@@ -263,39 +125,6 @@ struct FMath{
     }
 
 
-#if  defined(SCALFMM_USE_SSE ) && defined(__SSSE4_1__)
-    static __m128 FMAdd(const __m128 inV1, const __m128 inV2, const __m128 inV3){
-        return _mm_add_ps( _mm_mul_ps(inV1,inV2), inV3);
-    }
-
-    static __m128d FMAdd(const __m128d inV1, const __m128d inV2, const __m128d inV3){
-        return _mm_add_pd( _mm_mul_pd(inV1,inV2), inV3);
-    }
-
-#endif
-#ifdef SCALFMM_USE_AVX
-    static __m256 FMAdd(const __m256 inV1, const __m256 inV2, const __m256 inV3){
-        return _mm256_add_ps( _mm256_mul_ps(inV1,inV2), inV3);
-    }
-
-    static __m256d FMAdd(const __m256d inV1, const __m256d inV2, const __m256d inV3){
-        return _mm256_add_pd( _mm256_mul_pd(inV1,inV2), inV3);
-    }
-
-#endif
-#ifdef SCALFMM_USE_AVX2
-#ifdef __MIC__
-    static __m512 FMAdd(const __m512 inV1, const __m512 inV2, const __m512 inV3){
-        //return _mm512_add_ps( _mm512_mul_ps(inV1,inV2), inV3);
-        return _mm512_fmadd_ps(inV1, inV2, inV3);
-    }
-
-    static __m512d FMAdd(const __m512d inV1, const __m512d inV2, const __m512d inV3){
-        //return _mm512_add_pd( _mm512_mul_pd(inV1,inV2), inV3);
-        return _mm512_fmadd_pd(inV1, inV2, inV3);
-    }
-#endif
-#endif
     /** To get sqrt of a FReal */
     static float Sqrt(const float inValue){
         return sqrtf(inValue);
@@ -303,123 +132,25 @@ struct FMath{
     static double Sqrt(const double inValue){
         return sqrt(inValue);
     }
-    static float Rsqrt(const float inValue){
-        return float(1.0)/sqrtf(inValue);
-    }
-    static double Rsqrt(const double inValue){
-        return 1.0/sqrt(inValue);
-    }
-#ifdef SCALFMM_USE_SSE
-    static __m128 Exp(const __m128 inV){
-         float ptr[4];
-        _mm_storeu_ps(ptr, inV);
-        for(int idx = 0 ; idx < 4 ; ++idx){
-            ptr[idx] = std::exp(ptr[idx]);
-        }
-        return _mm_loadu_ps(ptr);
-    }
 
-    static __m128d Exp(const __m128d inV){
-         double ptr[2];
-        _mm_storeu_pd(ptr, inV);
-        for(int idx = 0 ; idx < 2 ; ++idx){
-            ptr[idx] = std::exp(ptr[idx]);
-        }
-        return _mm_loadu_pd(ptr);
-    }
-
-    static __m128 Sqrt(const __m128 inV){
-        return _mm_sqrt_ps(inV);
+    template <class InaClass>
+    static InaClass Sqrt(const InaClass inValue){
+        return inValue.sqrt();
     }
 
-    static __m128d Sqrt(const __m128d inV){
-        return _mm_sqrt_pd(inV);
+    template <class InaClass>
+    static InaClass Exp(const InaClass inValue){
+        return inValue.exp();
     }
 
-    static __m128 Rsqrt(const __m128 inV){
-        return _mm_rsqrt_ps(inV);
-    }
-
-    static __m128d Rsqrt(const __m128d inV){
-        return _mm_set_pd1(1.0) / _mm_sqrt_pd(inV);
-    }
-#endif
-#ifdef SCALFMM_USE_AVX
-    static __m256 Exp(const __m256 inV){
-         float ptr[8];
-        _mm256_storeu_ps(ptr, inV);
-        for(int idx = 0 ; idx < 8 ; ++idx){
-            ptr[idx] = std::exp(ptr[idx]);
-        }
-        return _mm256_loadu_ps(ptr);
-    }
-
-    static __m256d Exp(const __m256d inV){
-         double ptr[4];
-        _mm256_storeu_pd(ptr, inV);
-        for(int idx = 0 ; idx < 4 ; ++idx){
-            ptr[idx] = std::exp(ptr[idx]);
-        }
-        return _mm256_loadu_pd(ptr);
-    }
-
-    static __m256 Sqrt(const __m256 inV){
-        return _mm256_sqrt_ps(inV);
-    }
 
-    static __m256d Sqrt(const __m256d inV){
-        return _mm256_sqrt_pd(inV);
-    }
-
-    static __m256 Rsqrt(const __m256 inV){
-        return _mm256_rsqrt_ps(inV);
-    }
-
-    static __m256d Rsqrt(const __m256d inV){
-        return _mm256_set1_pd(1.0) / _mm256_sqrt_pd(inV);
-    }
-#endif
-#ifdef SCALFMM_USE_AVX2
-#ifdef __MIC__
-    static __m512 Exp(const __m512 inV){
-         float ptr[16];
-        _mm512_storeu_ps(ptr, inV);
-        for(int idx = 0 ; idx < 16 ; ++idx){
-            ptr[idx] = std::exp(ptr[idx]);
-        }
-        return _mm512_loadu_ps(ptr);
-    }
-
-    static __m512d Exp(const __m512d inV){
-         double ptr[8];
-        _mm512_storeu_pd(ptr, inV);
-        for(int idx = 0 ; idx < 8 ; ++idx){
-            ptr[idx] = std::exp(ptr[idx]);
-        }
-        return _mm512_loadu_pd(ptr);
-    }
-
-    static __m512d Exp(const __m512d inV){
-        return _mm512_sqrt_pd(inV);
-    }
-
-    static __m512 Sqrt(const __m512 inV){
-        return _mm512_sqrt_ps(inV);
-    }
-
-    static __m512d Sqrt(const __m512d inV){
-        return _mm512_sqrt_pd(inV);
+    static float Rsqrt(const float inValue){
+        return float(1.0)/sqrtf(inValue);
     }
-
-    static __m512 Rsqrt(const __m512 inV){
-        return _mm512_rsqrt_ps(inV);
+    static double Rsqrt(const double inValue){
+        return 1.0/sqrt(inValue);
     }
 
-    static __m512d Rsqrt(const __m512d inV){
-        return _mm512_set1_pd(1.0) / _mm512_sqrt_pd(inV);
-    }
-#endif
-#endif
     /** To get Log of a FReal */
     static float Log(const float inValue){
         return logf(inValue);
@@ -503,15 +234,6 @@ struct FMath{
         return std::isfinite(value);
     }
 
-    template <class NumType>
-    static NumType Zero();
-
-    template <class NumType>
-    static NumType One();
-
-    template <class DestType, class SrcType>
-    static DestType ConvertTo(const SrcType val);
-
 
     /** A class to compute accuracy */
     template <class FReal, class IndexType = FSize>
@@ -531,10 +253,10 @@ struct FMath{
 
         /** Add value to the current list */
         void add(const FReal inGood, const FReal inBad){
-            l2Diff     += (inBad - inGood) * (inBad - inGood);
-            l2Dot      += inGood * inGood;
-            max         = Max(max , Abs(inGood));
-            maxDiff     = Max(maxDiff, Abs(inGood-inBad));
+            l2Diff          += (inBad - inGood) * (inBad - inGood);
+            l2Dot          += inGood * inGood;
+            max               = Max(max , Abs(inGood));
+            maxDiff         = Max(maxDiff, Abs(inGood-inBad));
             nbElements += 1 ;
         }
         /** Add array of values */
@@ -609,213 +331,5 @@ struct FMath{
     };
 };
 
-template <>
-inline float FMath::Zero<float>(){
-    return float(0.0);
-}
-
-template <>
-inline double FMath::Zero<double>(){
-    return double(0.0);
-}
-
-template <>
-inline float FMath::One<float>(){
-    return float(1.0);
-}
-
-template <>
-inline double FMath::One<double>(){
-    return double(1.0);
-}
-
-template <>
-inline float FMath::ConvertTo<float,float>(const float val){
-    return val;
-}
-
-template <>
-inline double FMath::ConvertTo<double,double>(const double val){
-    return val;
-}
-
-template <>
-inline float FMath::ConvertTo<float,const float*>(const float* val){
-    return *val;
-}
-
-template <>
-inline double FMath::ConvertTo<double,const double*>(const double* val){
-    return *val;
-}
-
-#ifdef SCALFMM_USE_SSE
-template <>
-inline __m128 FMath::One<__m128>(){
-    return _mm_set_ps1(1.0);
-}
-
-template <>
-inline __m128d FMath::One<__m128d>(){
-    return _mm_set_pd1(1.0);
-}
-
-template <>
-inline __m128 FMath::Zero<__m128>(){
-    return _mm_setzero_ps();
-}
-
-template <>
-inline __m128d FMath::Zero<__m128d>(){
-    return _mm_setzero_pd();
-}
-
-template <>
-inline __m128 FMath::ConvertTo<__m128,float>(const float val){
-    return _mm_set_ps1(val);
-}
-
-template <>
-inline __m128d FMath::ConvertTo<__m128d,double>(const double val){
-    return _mm_set_pd1(val);
-}
-
-template <>
-inline __m128 FMath::ConvertTo<__m128,const float*>(const float* val){
-    return _mm_load1_ps(val);
-}
-
-template <>
-inline __m128d FMath::ConvertTo<__m128d,const double*>(const double* val){
-    return _mm_load1_pd(val);
-}
-
-template <>
-inline float FMath::ConvertTo<float,__m128>(const __m128 val){
-    __attribute__((aligned(16))) float buffer[4];
-    _mm_store_ps(buffer, val);
-    return buffer[0] + buffer[1] + buffer[2] + buffer[3];
-}
-
-template <>
-inline double FMath::ConvertTo<double,__m128d>(const __m128d val){
-    __attribute__((aligned(16))) double buffer[2];
-    _mm_store_pd(buffer, val);
-    return buffer[0] + buffer[1];
-}
-#endif
-
-#ifdef SCALFMM_USE_AVX
-template <>
-inline __m256 FMath::One<__m256>(){
-    return _mm256_set1_ps(1.0);
-}
-
-template <>
-inline __m256d FMath::One<__m256d>(){
-    return _mm256_set1_pd(1.0);
-}
-
-template <>
-inline __m256 FMath::Zero<__m256>(){
-    return _mm256_setzero_ps();
-}
-
-template <>
-inline __m256d FMath::Zero<__m256d>(){
-    return _mm256_setzero_pd();
-}
-
-template <>
-inline __m256 FMath::ConvertTo<__m256,float>(const float val){
-    return _mm256_set1_ps(val);
-}
-
-template <>
-inline __m256d FMath::ConvertTo<__m256d,double>(const double val){
-    return _mm256_set1_pd(val);
-}
-
-template <>
-inline __m256 FMath::ConvertTo<__m256,const float*>(const float* val){
-    return _mm256_broadcast_ss(val);
-}
-
-template <>
-inline __m256d FMath::ConvertTo<__m256d,const double*>(const double* val){
-    return _mm256_broadcast_sd(val);
-}
-
-template <>
-inline float FMath::ConvertTo<float,__m256>(const __m256 val){
-    __attribute__((aligned(32))) float buffer[8];
-    _mm256_store_ps(buffer, val);
-    return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
-}
-
-template <>
-inline double FMath::ConvertTo<double,__m256d>(const __m256d val){
-    __attribute__((aligned(32))) double buffer[4];
-    _mm256_store_pd(buffer, val);
-    return buffer[0] + buffer[1] + buffer[2] + buffer[3];
-}
-#endif
-#ifdef SCALFMM_USE_AVX2
-#ifdef __MIC__
-template <>
-inline __m512 FMath::One<__m512>(){
-    return _mm512_set1_ps(1.0);
-}
-
-template <>
-inline __m512d FMath::One<__m512d>(){
-    return _mm512_set1_pd(1.0);
-}
-
-template <>
-inline __m512 FMath::Zero<__m512>(){
-    return _mm512_setzero_ps();
-}
-
-template <>
-inline __m512d FMath::Zero<__m512d>(){
-    return _mm512_setzero_pd();
-}
-
-template <>
-inline __m512 FMath::ConvertTo<__m512,__attribute__((aligned(64))) float>(const float val){
-    return _mm512_set1_ps(val);
-}
-
-template <>
-inline __m512d FMath::ConvertTo<__m512d,__attribute__((aligned(64))) double>(const double val){
-    return _mm512_set1_pd(val);
-}
-
-template <>
-inline __m512 FMath::ConvertTo<__m512,const __attribute__((aligned(64))) float*>(const float* val){
-    return _mm512_set1_ps(val[0]);
-}
-
-template <>
-inline __m512d FMath::ConvertTo<__m512d,const __attribute__((aligned(64))) double*>(const double* val){
-    return _mm512_set1_pd(val[0]);
-}
-
-template <>
-inline float FMath::ConvertTo<float,__m512>(const __m512 val){
-    __attribute__((aligned(64))) float buffer[16];
-    _mm512_store_ps(buffer, val);
-    return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7] + buffer[8] + buffer[9] + buffer[10] + buffer[11] + buffer[12] + buffer[13] + buffer[14] + buffer[15];
-}
-
-template <>
-inline double FMath::ConvertTo<double,__m512d>(const __m512d val){
-    __attribute__((aligned(64))) double buffer[8];
-    _mm512_store_pd(buffer, val);
-    return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
-}
-#endif
-#endif
 
 #endif //FMATH_HPP
diff --git a/Src/Utils/FSse.hpp b/Src/Utils/FSse.hpp
deleted file mode 100644
index ad818a515ca6aa95ef17451b018ca6fbcce6c00b..0000000000000000000000000000000000000000
--- a/Src/Utils/FSse.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-// See LICENCE file at project root
-#ifndef FSSE_HPP
-#define FSSE_HPP
-
-#include "FGlobal.hpp"
-#ifndef SCALFMM_USE_SSE
-#error The SSE header is included while SCALFMM_USE_SSE is turned OFF
-#endif
-
-#include <xmmintrin.h>  // SSE
-#include <emmintrin.h>  //SSE2
-#include <pmmintrin.h> //SSE3
-#ifdef __SSSE3__
-#include <tmmintrin.h>  //SSSE3
-#endif
-#ifdef __SSSE4_1__
-#include <smmintrin.h> // SSE4
-#endif
-
-
-#ifndef _mm_set_pd1
-// Looks like clang's emmintrin.h doesn't have this alternate name.
-// But _mm_set1_pd is an equivalent to _mm_set_pd1.
-#define _mm_set_pd1 _mm_set1_pd
-#endif
-
-
-#ifdef __SSEPE_INTEL_COMPILER
-
-inline __m128d& operator+=(__m128d& v1, const __m128d& v2){
-    return (v1 = _mm_add_pd(v1, v2));
-}
-
-inline __m128d& operator-=(__m128d& v1, const __m128d& v2){
-    return (v1 = _mm_sub_pd(v1, v2));
-}
-
-inline __m128d& operator*=(__m128d& v1, const __m128d& v2){
-    return (v1 = _mm_mul_pd(v1, v2));
-}
-
-inline __m128d& operator/=(__m128d& v1, const __m128d& v2){
-    return (v1 = _mm_div_pd(v1, v2));
-}
-
-inline __m128d operator+(const __m128d& v1, const __m128d& v2){
-    return _mm_add_pd(v1, v2);
-}
-
-inline __m128d operator-(const __m128d& v1, const __m128d& v2){
-    return _mm_sub_pd(v1, v2);
-}
-
-inline __m128d operator*(const __m128d& v1, const __m128d& v2){
-    return _mm_mul_pd(v1, v2);
-}
-
-inline __m128d operator/(const __m128d& v1, const __m128d& v2){
-    return _mm_div_pd(v1, v2);
-}
-
-inline __m128& operator+=(__m128& v1, const __m128& v2){
-    return (v1 = _mm_add_ps(v1, v2));
-}
-
-inline __m128& operator-=(__m128& v1, const __m128& v2){
-    return (v1 = _mm_sub_ps(v1, v2));
-}
-
-inline __m128& operator*=(__m128& v1, const __m128& v2){
-    return (v1 = _mm_mul_ps(v1, v2));
-}
-
-inline __m128& operator/=(__m128& v1, const __m128& v2){
-    return (v1 = _mm_div_ps(v1, v2));
-}
-
-inline __m128 operator+(const __m128& v1, const __m128& v2){
-    return _mm_add_ps(v1, v2);
-}
-
-inline __m128 operator-(const __m128& v1, const __m128& v2){
-    return _mm_sub_ps(v1, v2);
-}
-
-inline __m128 operator*(const __m128& v1, const __m128& v2){
-    return _mm_mul_ps(v1, v2);
-}
-
-inline __m128 operator/(const __m128& v1, const __m128& v2){
-    return _mm_div_ps(v1, v2);
-}
-
-#endif
-
-#endif // FSSE_HPP
diff --git a/inastemp b/inastemp
new file mode 160000
index 0000000000000000000000000000000000000000..5f1c2dd418c99c2eaa9299633929bbc1ac75001e
--- /dev/null
+++ b/inastemp
@@ -0,0 +1 @@
+Subproject commit 5f1c2dd418c99c2eaa9299633929bbc1ac75001e