diff --git a/Addons/CKernelApi/CMakeLists.txt b/Addons/CKernelApi/CMakeLists.txt index 07083e44d024f53e042543b6413adad7bb5a9fa4..9cbc5a189271ebbfdf33b35dcced90828235fd8f 100644 --- a/Addons/CKernelApi/CMakeLists.txt +++ b/Addons/CKernelApi/CMakeLists.txt @@ -29,8 +29,8 @@ if(SCALFMM_ADDON_CKERNELAPI) # Adding the entire project dir as an include dir INCLUDE_DIRECTORIES( - ${CMAKE_BINARY_DIR}/Src - ${CMAKE_SOURCE_DIR}/Src + ${SCALFMM_BINARY_DIR}/Src + ${SCALFMM_SOURCE_DIR}/Src ${SCALFMM_INCLUDES} ) @@ -44,7 +44,7 @@ if(SCALFMM_ADDON_CKERNELAPI) INSTALL( FILES ${hpp_in_dir} DESTINATION include/ScalFmm/CKernelApi ) file( GLOB_RECURSE source_tests_files Tests/*.c ) - INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/Src ) + INCLUDE_DIRECTORIES( ${SCALFMM_BINARY_DIR}/Src ) # Then build test files foreach(exec ${source_tests_files}) diff --git a/Addons/FmmApi/CMakeLists.txt b/Addons/FmmApi/CMakeLists.txt index 2ad1402ef04aa9807297d1839fa43b742453e782..c462e2484bda9b0aad55d7bcb1d97a3d8531a78d 100644 --- a/Addons/FmmApi/CMakeLists.txt +++ b/Addons/FmmApi/CMakeLists.txt @@ -31,8 +31,8 @@ if(SCALFMM_ADDON_FMMAPI) # Adding the entire project dir as an include dir INCLUDE_DIRECTORIES( - ${CMAKE_BINARY_DIR}/Src - ${CMAKE_SOURCE_DIR}/Src + ${SCALFMM_BINARY_DIR}/Src + ${SCALFMM_SOURCE_DIR}/Src ${SCALFMM_INCLUDES} ) @@ -46,7 +46,7 @@ if(SCALFMM_ADDON_FMMAPI) INSTALL( FILES ${hpp_in_dir} DESTINATION include/ScalFmm/FmmApi ) file( GLOB_RECURSE source_tests_files Tests/*.cpp ) - INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/Src ) + INCLUDE_DIRECTORIES( ${SCALFMM_BINARY_DIR}/Src ) # Then build test files foreach(exec ${source_tests_files}) diff --git a/Addons/HMat/CMakeLists.txt b/Addons/HMat/CMakeLists.txt index 3132500f7f3569d30acced136f94e6d4fa46bc55..ad81efbc94091dfd08cdc34f2918dd70f3c9afac 100644 --- a/Addons/HMat/CMakeLists.txt +++ b/Addons/HMat/CMakeLists.txt @@ -43,8 +43,8 @@ if(SCALFMM_ADDON_HMAT) # Adding the entire project dir as an include dir INCLUDE_DIRECTORIES( - 
${CMAKE_BINARY_DIR}/Src - ${CMAKE_SOURCE_DIR}/Src + ${SCALFMM_BINARY_DIR}/Src + ${SCALFMM_SOURCE_DIR}/Src ${SCALFMM_INCLUDES} ) @@ -65,7 +65,7 @@ if(SCALFMM_ADDON_HMAT) install( TARGETS cclusteringlib ARCHIVE DESTINATION lib ) file( GLOB_RECURSE source_tests_files Tests/*.cpp ) - INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/Src ) + INCLUDE_DIRECTORIES( ${SCALFMM_BINARY_DIR}/Src ) # Then build test files SET(hmat_list_execs "") diff --git a/CMakeLists.txt b/CMakeLists.txt index 442cd53b07df6cef293b921c560828d618a736c2..e5a962af52497796a27cd1d74c88be77996f3133 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ # OPENMP 4/5 support option( OPENMP_SUPPORT_COMMUTE "Set to ON to let tasks commute (KSTAR/StarPU compiler only)" OFF ) option( OPENMP_SUPPORT_PRIORITY "Set to ON to enable tasks priority (KSTAR/StarPU compiler only)" OFF ) + option( OPENMP_SUPPORT_TASK_NAME "Set to ON to enable a taskname clause for tasks (KSTAR/StarPU compiler only)" OFF ) option( SCALFMM_DISABLE_NATIVE_OMP4 "Set to ON to disable the gcc/intel omp4" OFF ) option( SCALFMM_TIME_OMPTASKS "Set to ON to time omp4 tasks and generate output file" OFF ) # STARPU options diff --git a/CMakeModules/morse/find/FindFFTW.cmake b/CMakeModules/morse/find/FindFFTW.cmake index 8a992479577a07cbced3ee471947196915484c82..ac98df8b664267d7f9bb224f215cc28a1b6bb6fb 100644 --- a/CMakeModules/morse/find/FindFFTW.cmake +++ b/CMakeModules/morse/find/FindFFTW.cmake @@ -64,10 +64,10 @@ if (NOT FFTW_FOUND) - set(FFTW_DIR "" CACHE PATH "Installation directory of FFTW library given by user") - if (NOT FFTW_FIND_QUIETLY) - message(STATUS "A cache variable, namely FFTW_DIR, has been set to specify the install directory of FFTW") - endif() + set(FFTW_DIR "" CACHE PATH "Installation directory of FFTW library given by user") + if (NOT FFTW_FIND_QUIETLY) + message(STATUS "A cache variable, namely FFTW_DIR, has been set to specify the 
install directory of FFTW") + endif() endif() # Set the version to find @@ -80,223 +80,264 @@ set(FFTW_LOOK_FOR_FFTW_LONG OFF) set(FFTW_LOOK_FOR_FFTW_QUAD OFF) if( FFTW_FIND_COMPONENTS ) - foreach( component ${FFTW_FIND_COMPONENTS} ) - if (${component} STREQUAL "THREADS") - # means we look for the Threads version of FFTW - set(FFTW_LOOK_FOR_THREADS ON) - endif() - if (${component} STREQUAL "OMP") - # means we look for the OpenMP version of FFTW - set(FFTW_LOOK_FOR_OMP ON) - endif() - if (${component} STREQUAL "SIMPLE") - # means we look for FFTW simple precision (fftw3f) - set(FFTW_LOOK_FOR_FFTW_SIMPLE ON) - set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF) - set(FFTW_LOOK_FOR_FFTW_LONG OFF) - set(FFTW_LOOK_FOR_FFTW_QUAD OFF) - endif() - if (${component} STREQUAL "DOUBLE") - # means we look for FFTW double precision (fftw3) - set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF) - set(FFTW_LOOK_FOR_FFTW_DOUBLE ON) - set(FFTW_LOOK_FOR_FFTW_LONG OFF) - set(FFTW_LOOK_FOR_FFTW_QUAD OFF) - endif() - if (${component} STREQUAL "LONG") - # means we look for FFTW long double precision (fftw3l) - set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF) - set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF) - set(FFTW_LOOK_FOR_FFTW_LONG ON) - set(FFTW_LOOK_FOR_FFTW_QUAD OFF) - endif() - if (${component} STREQUAL "QUAD") - # means we look for FFTW quad precision (fftw3q) - set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF) - set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF) - set(FFTW_LOOK_FOR_FFTW_LONG OFF) - set(FFTW_LOOK_FOR_FFTW_QUAD ON) - endif() - if (${component} STREQUAL "MKL") - # means we look for the Intel MKL version of FFTW - set(FFTW_LOOK_FOR_MKL ON) - if (FFTW_LOOK_FOR_FFTW_LONG) - message(WARNING "Looking for FFTW -- long precision functions do not exist in MKL FFTW") - set(FFTW_LOOK_FOR_FFTW_LONG OFF) - endif() - if (FFTW_LOOK_FOR_FFTW_QUAD) - message(WARNING "Looking for FFTW -- quadruple functions do not exist in MKL FFTW") - set(FFTW_LOOK_FOR_FFTW_QUAD OFF) - endif() - endif() - endforeach() + foreach( component ${FFTW_FIND_COMPONENTS} ) + if (${component} 
STREQUAL "THREADS") + # means we look for the Threads version of FFTW + set(FFTW_LOOK_FOR_THREADS ON) + endif() + if (${component} STREQUAL "OMP") + # means we look for the OpenMP version of FFTW + set(FFTW_LOOK_FOR_OMP ON) + endif() + if (${component} STREQUAL "SIMPLE") + # means we look for FFTW simple precision (fftw3f) + set(FFTW_LOOK_FOR_FFTW_SIMPLE ON) + set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF) + set(FFTW_LOOK_FOR_FFTW_LONG OFF) + set(FFTW_LOOK_FOR_FFTW_QUAD OFF) + endif() + if (${component} STREQUAL "DOUBLE") + # means we look for FFTW double precision (fftw3) + set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF) + set(FFTW_LOOK_FOR_FFTW_DOUBLE ON) + set(FFTW_LOOK_FOR_FFTW_LONG OFF) + set(FFTW_LOOK_FOR_FFTW_QUAD OFF) + endif() + if (${component} STREQUAL "LONG") + # means we look for FFTW long double precision (fftw3l) + set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF) + set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF) + set(FFTW_LOOK_FOR_FFTW_LONG ON) + set(FFTW_LOOK_FOR_FFTW_QUAD OFF) + endif() + if (${component} STREQUAL "QUAD") + # means we look for FFTW quad precision (fftw3q) + set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF) + set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF) + set(FFTW_LOOK_FOR_FFTW_LONG OFF) + set(FFTW_LOOK_FOR_FFTW_QUAD ON) + endif() + if (${component} STREQUAL "MKL") + # means we look for the Intel MKL version of FFTW + set(FFTW_LOOK_FOR_MKL ON) + if (FFTW_LOOK_FOR_FFTW_LONG) + message(WARNING "Looking for FFTW -- long precision functions do not exist in MKL FFTW") + set(FFTW_LOOK_FOR_FFTW_LONG OFF) + endif() + if (FFTW_LOOK_FOR_FFTW_QUAD) + message(WARNING "Looking for FFTW -- quadruple functions do not exist in MKL FFTW") + set(FFTW_LOOK_FOR_FFTW_QUAD OFF) + endif() + endif() + endforeach() endif() if (FFTW_LOOK_FOR_THREADS) - if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_THREADS) - find_package(Threads REQUIRED) - else() - find_package(Threads) - endif() + if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_THREADS) + find_package(Threads REQUIRED) + else() + find_package(Threads) + endif() endif() if 
(FFTW_LOOK_FOR_MKL) - if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_MKL) - find_package(Threads REQUIRED) - else() - find_package(Threads) - endif() + if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_MKL) + find_package(Threads REQUIRED) + else() + find_package(Threads) + endif() endif() if (FFTW_LOOK_FOR_OMP) - if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_OMP) - find_package(OpenMP REQUIRED) - else() - find_package(OpenMP) - endif() + if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_OMP) + find_package(OpenMP REQUIRED) + else() + find_package(OpenMP) + endif() endif() -# Looking for include -# ------------------- -# Add system include paths to search include -# ------------------------------------------ -unset(_inc_env) -set(ENV_MKLROOT "$ENV{MKLROOT}") set(ENV_FFTW_DIR "$ENV{FFTW_DIR}") set(ENV_FFTW_INCDIR "$ENV{FFTW_INCDIR}") -if(ENV_FFTW_INCDIR) +set(ENV_FFTW_LIBDIR "$ENV{FFTW_LIBDIR}") +set(FFTW_GIVEN_BY_USER "FALSE") +if ( FFTW_DIR OR ( FFTW_INCDIR AND FFTW_LIBDIR) OR ENV_FFTW_DIR OR (ENV_FFTW_INCDIR AND ENV_FFTW_LIBDIR) ) + set(FFTW_GIVEN_BY_USER "TRUE") +endif() + +# Optionally use pkg-config to detect include/library dirs (if pkg-config is available) +# ------------------------------------------------------------------------------------- +include(FindPkgConfig) +find_package(PkgConfig QUIET) +if( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER ) + + pkg_search_module(FFTW fftw3) + if (NOT FFTW_FIND_QUIETLY) + if (FFTW_FOUND AND FFTW_LIBRARIES) + message(STATUS "Looking for FFTW - found using PkgConfig") + #if(NOT FFTW_INCLUDE_DIRS) + # message("${Magenta}FFTW_INCLUDE_DIRS is empty using PkgConfig. " + # "Perhaps the path to FFTW headers is already present in your " + # "C(PLUS)_INCLUDE_PATH environment variable.${ColourReset}") + #endif() + else() + message("${Magenta}Looking for FFTW - not found using PkgConfig. "
+ "Perhaps you should add the directory containing fftw3.pc to " + "the PKG_CONFIG_PATH environment variable.${ColourReset}") + endif() + endif() + + set(FFTW_INCLUDE_DIRS_DEP "${FFTW_INCLUDE_DIRS}") + set(FFTW_LIBRARY_DIRS_DEP "${FFTW_LIBRARY_DIRS}") + set(FFTW_LIBRARIES_DEP "${FFTW_LIBRARIES}") + set(FFTW_WORKS TRUE) + +endif( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER ) + + +if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT FFTW_FOUND) OR (FFTW_GIVEN_BY_USER) ) + + # Looking for include + # ------------------- + + # Add system include paths to search include + # ------------------------------------------ + unset(_inc_env) + set(ENV_MKLROOT "$ENV{MKLROOT}") + set(ENV_FFTW_DIR "$ENV{FFTW_DIR}") + set(ENV_FFTW_INCDIR "$ENV{FFTW_INCDIR}") + if(ENV_FFTW_INCDIR) list(APPEND _inc_env "${ENV_FFTW_INCDIR}") -elseif(ENV_FFTW_DIR) + elseif(ENV_FFTW_DIR) list(APPEND _inc_env "${ENV_FFTW_DIR}") list(APPEND _inc_env "${ENV_FFTW_DIR}/include") list(APPEND _inc_env "${ENV_FFTW_DIR}/include/fftw") -else() + else() if (ENV_MKLROOT) - list(APPEND _inc_env "${ENV_MKLROOT}/include/fftw") + list(APPEND _inc_env "${ENV_MKLROOT}/include/fftw") endif() # system variables if(WIN32) - string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") - list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") else() - string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") - list(APPEND _inc_env "${_path_env}") - string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") - list(APPEND _inc_env "${_path_env}") - string(REPLACE ":" ";" _path_env "$ENV{CPATH}") - list(APPEND _inc_env "${_path_env}") - string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") - list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env
"$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") endif() -endif() -list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") -list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") -list(REMOVE_DUPLICATES _inc_env) - -# set paths where to look for -set(PATH_TO_LOOK_FOR "${_inc_env}") - -# Try to find the fftw header in the given paths -# ------------------------------------------------- -# call cmake macro to find the header path -if(FFTW_INCDIR) + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + list(REMOVE_DUPLICATES _inc_env) + + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_inc_env}") + + # Try to find the fftw header in the given paths + # ------------------------------------------------- + # call cmake macro to find the header path + if(FFTW_INCDIR) set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND") find_path(FFTW_fftw3.h_DIRS NAMES fftw3.h HINTS ${FFTW_INCDIR}) -else() + else() if(FFTW_DIR) - set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND") - find_path(FFTW_fftw3.h_DIRS - NAMES fftw3.h - HINTS ${FFTW_DIR} - PATH_SUFFIXES "include" "include/fftw") + set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND") + find_path(FFTW_fftw3.h_DIRS + NAMES fftw3.h + HINTS ${FFTW_DIR} + PATH_SUFFIXES "include" "include/fftw") else() - set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND") - find_path(FFTW_fftw3.h_DIRS - NAMES fftw3.h - HINTS ${PATH_TO_LOOK_FOR} - PATH_SUFFIXES "fftw") + set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND") + find_path(FFTW_fftw3.h_DIRS + NAMES fftw3.h + HINTS ${PATH_TO_LOOK_FOR} + PATH_SUFFIXES "fftw") endif() -endif() -mark_as_advanced(FFTW_fftw3.h_DIRS) + endif() + mark_as_advanced(FFTW_fftw3.h_DIRS) -# Add path to cmake variable -# ------------------------------------ -if (FFTW_fftw3.h_DIRS) + # Add path to cmake 
variable + # ------------------------------------ + if (FFTW_fftw3.h_DIRS) set(FFTW_INCLUDE_DIRS "${FFTW_fftw3.h_DIRS}") -else () + else () set(FFTW_INCLUDE_DIRS "FFTW_INCLUDE_DIRS-NOTFOUND") if(NOT FFTW_FIND_QUIETLY) - message(STATUS "Looking for FFTW -- fftw3.h not found") + message(STATUS "Looking for FFTW -- fftw3.h not found") endif() -endif () + endif () -# Looking for lib -# --------------- + # Looking for lib + # --------------- -# Add system library paths to search lib -# -------------------------------------- -unset(_lib_env) -set(ENV_FFTW_LIBDIR "$ENV{FFTW_LIBDIR}") -if(ENV_FFTW_LIBDIR) + # Add system library paths to search lib + # -------------------------------------- + unset(_lib_env) + set(ENV_FFTW_LIBDIR "$ENV{FFTW_LIBDIR}") + if(ENV_FFTW_LIBDIR) list(APPEND _lib_env "${ENV_FFTW_LIBDIR}") -elseif(ENV_FFTW_DIR) + elseif(ENV_FFTW_DIR) list(APPEND _lib_env "${ENV_FFTW_DIR}") list(APPEND _lib_env "${ENV_FFTW_DIR}/lib") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - list(APPEND _lib_env "${ENV_FFTW_DIR}/lib64") - list(APPEND _lib_env "${ENV_FFTW_DIR}/lib/intel64") + list(APPEND _lib_env "${ENV_FFTW_DIR}/lib64") + list(APPEND _lib_env "${ENV_FFTW_DIR}/lib/intel64") else() - list(APPEND _lib_env "${ENV_FFTW_DIR}/lib32") - list(APPEND _lib_env "${ENV_FFTW_DIR}/lib/ia32") + list(APPEND _lib_env "${ENV_FFTW_DIR}/lib32") + list(APPEND _lib_env "${ENV_FFTW_DIR}/lib/ia32") endif() -else() + else() if (ENV_MKLROOT) - list(APPEND _lib_env "${ENV_MKLROOT}/lib") - if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - list(APPEND _lib_env "${ENV_MKLROOT}/lib64") - list(APPEND _lib_env "${ENV_MKLROOT}/lib/intel64") - else() - list(APPEND _lib_env "${ENV_MKLROOT}/lib32") - list(APPEND _lib_env "${ENV_MKLROOT}/lib/ia32") - endif() + list(APPEND _lib_env "${ENV_MKLROOT}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _lib_env "${ENV_MKLROOT}/lib64") + list(APPEND _lib_env "${ENV_MKLROOT}/lib/intel64") + else() + list(APPEND _lib_env "${ENV_MKLROOT}/lib32") + list(APPEND 
_lib_env "${ENV_MKLROOT}/lib/ia32") + endif() endif() if(WIN32) - string(REPLACE ":" ";" _lib_env2 "$ENV{LIB}") + string(REPLACE ":" ";" _lib_env2 "$ENV{LIB}") else() - if(APPLE) - string(REPLACE ":" ";" _lib_env2 "$ENV{DYLD_LIBRARY_PATH}") - else() - string(REPLACE ":" ";" _lib_env2 "$ENV{LD_LIBRARY_PATH}") - endif() - list(APPEND _lib_env "${_lib_env2}") - list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") - list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + if(APPLE) + string(REPLACE ":" ";" _lib_env2 "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env2 "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${_lib_env2}") + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") endif() -endif() -list(REMOVE_DUPLICATES _lib_env) + endif() + list(REMOVE_DUPLICATES _lib_env) -# set paths where to look for -set(PATH_TO_LOOK_FOR "${_lib_env}") + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_lib_env}") -if(FFTW_LOOK_FOR_FFTW_SIMPLE) + if(FFTW_LOOK_FOR_FFTW_SIMPLE) set(FFTW_PREC "f") set(FFTW_PREC_TESTFUNC "s") -elseif(FFTW_LOOK_FOR_FFTW_DOUBLE) + elseif(FFTW_LOOK_FOR_FFTW_DOUBLE) set(FFTW_PREC "") set(FFTW_PREC_TESTFUNC "d") -elseif(FFTW_LOOK_FOR_FFTW_LONG) + elseif(FFTW_LOOK_FOR_FFTW_LONG) set(FFTW_PREC "l") set(FFTW_PREC_TESTFUNC "l") -elseif(FFTW_LOOK_FOR_FFTW_QUAD) + elseif(FFTW_LOOK_FOR_FFTW_QUAD) set(FFTW_PREC "q") set(FFTW_PREC_TESTFUNC "q") -endif() + endif() -if (FFTW_LOOK_FOR_MKL) + if (FFTW_LOOK_FOR_MKL) set(FFTW_libs_to_find "mkl_intel_lp64;mkl_sequential;mkl_core") @@ -305,39 +346,39 @@ if (FFTW_LOOK_FOR_MKL) # call cmake macro to find the lib path if(FFTW_LIBDIR) - foreach(fftw_lib ${FFTW_libs_to_find}) - set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") - find_library(FFTW_${fftw_lib}_LIBRARY - NAMES ${fftw_lib} - HINTS ${FFTW_LIBDIR}) - endforeach() + foreach(fftw_lib ${FFTW_libs_to_find}) + 
set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") + find_library(FFTW_${fftw_lib}_LIBRARY + NAMES ${fftw_lib} + HINTS ${FFTW_LIBDIR}) + endforeach() else() - if(FFTW_DIR) - foreach(fftw_lib ${FFTW_libs_to_find}) - set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") - find_library(FFTW_${fftw_lib}_LIBRARY - NAMES ${fftw_lib} - HINTS ${FFTW_DIR} - PATH_SUFFIXES lib lib32 lib64) - endforeach() - else() - foreach(fftw_lib ${FFTW_libs_to_find}) - set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") - find_library(FFTW_${fftw_lib}_LIBRARY - NAMES ${fftw_lib} - HINTS ${PATH_TO_LOOK_FOR}) - endforeach() - endif() + if(FFTW_DIR) + foreach(fftw_lib ${FFTW_libs_to_find}) + set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") + find_library(FFTW_${fftw_lib}_LIBRARY + NAMES ${fftw_lib} + HINTS ${FFTW_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(fftw_lib ${FFTW_libs_to_find}) + set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") + find_library(FFTW_${fftw_lib}_LIBRARY + NAMES ${fftw_lib} + HINTS ${PATH_TO_LOOK_FOR}) + endforeach() + endif() endif() -else(FFTW_LOOK_FOR_MKL) + else(FFTW_LOOK_FOR_MKL) if (FFTW_LOOK_FOR_THREADS) - set(FFTW_libs_to_find "fftw3${FFTW_PREC}_threads;fftw3${FFTW_PREC};fftw3") + set(FFTW_libs_to_find "fftw3${FFTW_PREC}_threads;fftw3${FFTW_PREC};fftw3") elseif (FFTW_LOOK_FOR_OMP) - set(FFTW_libs_to_find "fftw3${FFTW_PREC}_omp;fftw3${FFTW_PREC};fftw3") + set(FFTW_libs_to_find "fftw3${FFTW_PREC}_omp;fftw3${FFTW_PREC};fftw3") else() - set(FFTW_libs_to_find "fftw3${FFTW_PREC};fftw3") + set(FFTW_libs_to_find "fftw3${FFTW_PREC};fftw3") endif() # Try to find the fftw lib in the given paths @@ -345,59 +386,59 @@ else(FFTW_LOOK_FOR_MKL) # call cmake macro to find the lib path if(FFTW_LIBDIR) - foreach(fftw_lib ${FFTW_libs_to_find}) - set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") - find_library(FFTW_${fftw_lib}_LIBRARY - NAMES ${fftw_lib} - HINTS 
${FFTW_LIBDIR}) - endforeach() + foreach(fftw_lib ${FFTW_libs_to_find}) + set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") + find_library(FFTW_${fftw_lib}_LIBRARY + NAMES ${fftw_lib} + HINTS ${FFTW_LIBDIR}) + endforeach() else() - if(FFTW_DIR) - foreach(fftw_lib ${FFTW_libs_to_find}) - set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") - find_library(FFTW_${fftw_lib}_LIBRARY - NAMES ${fftw_lib} - HINTS ${FFTW_DIR} - PATH_SUFFIXES lib lib32 lib64) - endforeach() - else() - foreach(fftw_lib ${FFTW_libs_to_find}) - set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") - find_library(FFTW_${fftw_lib}_LIBRARY - NAMES ${fftw_lib} - HINTS ${PATH_TO_LOOK_FOR}) - endforeach() - endif() + if(FFTW_DIR) + foreach(fftw_lib ${FFTW_libs_to_find}) + set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") + find_library(FFTW_${fftw_lib}_LIBRARY + NAMES ${fftw_lib} + HINTS ${FFTW_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(fftw_lib ${FFTW_libs_to_find}) + set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND") + find_library(FFTW_${fftw_lib}_LIBRARY + NAMES ${fftw_lib} + HINTS ${PATH_TO_LOOK_FOR}) + endforeach() + endif() endif() -endif(FFTW_LOOK_FOR_MKL) + endif(FFTW_LOOK_FOR_MKL) -# If found, add path to cmake variable -# ------------------------------------ -set(FFTW_LIBRARIES "") -set(FFTW_LIBRARY_DIRS "") -foreach(fftw_lib ${FFTW_libs_to_find}) + # If found, add path to cmake variable + # ------------------------------------ + set(FFTW_LIBRARIES "") + set(FFTW_LIBRARY_DIRS "") + foreach(fftw_lib ${FFTW_libs_to_find}) if (FFTW_${fftw_lib}_LIBRARY) - get_filename_component(${fftw_lib}_lib_path "${FFTW_${fftw_lib}_LIBRARY}" PATH) - # set cmake variables - list(APPEND FFTW_LIBRARIES "${FFTW_${fftw_lib}_LIBRARY}") - list(APPEND FFTW_LIBRARY_DIRS "${${fftw_lib}_lib_path}") + get_filename_component(${fftw_lib}_lib_path "${FFTW_${fftw_lib}_LIBRARY}" PATH) + # set cmake variables + list(APPEND 
FFTW_LIBRARIES "${FFTW_${fftw_lib}_LIBRARY}") + list(APPEND FFTW_LIBRARY_DIRS "${${fftw_lib}_lib_path}") else () - list(APPEND FFTW_LIBRARIES "${FFTW_${fftw_lib}_LIBRARY}") - if (NOT FFTW_FIND_QUIETLY) - message(STATUS "Looking for FFTW -- lib ${fftw_lib} not found") - endif() + list(APPEND FFTW_LIBRARIES "${FFTW_${fftw_lib}_LIBRARY}") + if (NOT FFTW_FIND_QUIETLY) + message(STATUS "Looking for FFTW -- lib ${fftw_lib} not found") + endif() endif () mark_as_advanced(FFTW_${fftw_lib}_LIBRARY) -endforeach() + endforeach() -list(REMOVE_DUPLICATES FFTW_INCLUDE_DIRS) -list(REMOVE_DUPLICATES FFTW_LIBRARY_DIRS) + list(REMOVE_DUPLICATES FFTW_INCLUDE_DIRS) + list(REMOVE_DUPLICATES FFTW_LIBRARY_DIRS) -# check a function to validate the find -if(FFTW_LIBRARIES) + # check a function to validate the find + if(FFTW_LIBRARIES) set(REQUIRED_FLAGS) set(REQUIRED_LDFLAGS) @@ -407,39 +448,39 @@ if(FFTW_LIBRARIES) # FFTW if (FFTW_INCLUDE_DIRS) - set(REQUIRED_INCDIRS "${FFTW_INCLUDE_DIRS}") + set(REQUIRED_INCDIRS "${FFTW_INCLUDE_DIRS}") endif() if (FFTW_LIBRARY_DIRS) - set(REQUIRED_LIBDIRS "${FFTW_LIBRARY_DIRS}") + set(REQUIRED_LIBDIRS "${FFTW_LIBRARY_DIRS}") endif() set(REQUIRED_LIBS "${FFTW_LIBRARIES}") # THREADS if (FFTW_LOOK_FOR_THREADS) - list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") + list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") endif() # OMP if(FFTW_LOOK_FOR_OMP) - if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - # either gomp ... - #set(REQUIRED_FLAGS "-fopenmp") - #list(APPEND REQUIRED_LIBS "-lgomp") - # or iomp5 - list(APPEND REQUIRED_LIBS "-liomp5") - elseif (CMAKE_C_COMPILER_ID STREQUAL "Intel") - list(APPEND REQUIRED_LIBS "-liomp5") - endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + # either gomp ... 
+ #set(REQUIRED_FLAGS "-fopenmp") + #list(APPEND REQUIRED_LIBS "-lgomp") + # or iomp5 + list(APPEND REQUIRED_LIBS "-liomp5") + elseif (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND REQUIRED_LIBS "-liomp5") + endif() endif() # MKL if(FFTW_LOOK_FOR_MKL) - list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") - if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux") - list(APPEND REQUIRED_LDFLAGS "-Wl,--no-as-needed") - endif() + list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") + if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux") + list(APPEND REQUIRED_LDFLAGS "-Wl,--no-as-needed") + endif() endif() # m find_library(M_LIBRARY NAMES m) if(M_LIBRARY) - list(APPEND REQUIRED_LIBS "-lm") + list(APPEND REQUIRED_LIBS "-lm") endif() # set required libraries for link @@ -447,7 +488,7 @@ if(FFTW_LIBRARIES) set(CMAKE_REQUIRED_LIBRARIES) list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}") foreach(lib_dir ${REQUIRED_LIBDIRS}) - list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") endforeach() list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}") @@ -460,46 +501,48 @@ if(FFTW_LIBRARIES) mark_as_advanced(FFTW_WORKS) if(FFTW_WORKS) - # save link with dependencies - set(FFTW_LIBRARIES_DEP "${REQUIRED_LIBS}") - set(FFTW_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}") - set(FFTW_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}") - set(FFTW_C_FLAGS "${REQUIRED_FLAGS}") - set(FFTW_LINKER_FLAGS "${REQUIRED_LDFLAGS}") - list(REMOVE_DUPLICATES FFTW_LIBRARY_DIRS_DEP) - list(REMOVE_DUPLICATES FFTW_INCLUDE_DIRS_DEP) - list(REMOVE_DUPLICATES FFTW_LINKER_FLAGS) + # save link with dependencies + set(FFTW_LIBRARIES_DEP "${REQUIRED_LIBS}") + set(FFTW_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}") + set(FFTW_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}") + set(FFTW_C_FLAGS "${REQUIRED_FLAGS}") + set(FFTW_LINKER_FLAGS "${REQUIRED_LDFLAGS}") + 
list(REMOVE_DUPLICATES FFTW_LIBRARY_DIRS_DEP) + list(REMOVE_DUPLICATES FFTW_INCLUDE_DIRS_DEP) + list(REMOVE_DUPLICATES FFTW_LINKER_FLAGS) else() - if(NOT FFTW_FIND_QUIETLY) - message(STATUS "Looking for FFTW : test of ${FFTW_PREC_TESTFUNC}fftw_execute_ with fftw library fails") - message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") - message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") - message(STATUS "CMAKE_REQUIRED_FLAGS: ${CMAKE_REQUIRED_FLAGS}") - message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") - endif() + if(NOT FFTW_FIND_QUIETLY) + message(STATUS "Looking for FFTW : test of ${FFTW_PREC_TESTFUNC}fftw_execute_ with fftw library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "CMAKE_REQUIRED_FLAGS: ${CMAKE_REQUIRED_FLAGS}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() else() - set(FFTW_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES}) + set(FFTW_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES}) endif() set(CMAKE_REQUIRED_INCLUDES) set(CMAKE_REQUIRED_FLAGS) set(CMAKE_REQUIRED_LIBRARIES) -endif(FFTW_LIBRARIES) + endif(FFTW_LIBRARIES) + +endif( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT FFTW_FOUND) OR (FFTW_GIVEN_BY_USER) ) if (FFTW_LIBRARIES) - list(GET FFTW_LIBRARIES 0 first_lib) - get_filename_component(first_lib_path "${first_lib}" PATH) - if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)") - string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}") - set(FFTW_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of FFTW library" FORCE) - else() - set(FFTW_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of FFTW library" FORCE) - endif() + list(GET FFTW_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" 
PATH) + if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)") + string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}") + set(FFTW_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of FFTW library" FORCE) + else() + set(FFTW_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of FFTW library" FORCE) + endif() endif() # check that FFTW has been found # ------------------------------- include(FindPackageHandleStandardArgs) find_package_handle_standard_args(FFTW DEFAULT_MSG - FFTW_LIBRARIES - FFTW_INCLUDE_DIRS - FFTW_WORKS) + FFTW_LIBRARIES + FFTW_INCLUDE_DIRS + FFTW_WORKS) diff --git a/Examples/CMakeLists.txt b/Examples/CMakeLists.txt index 0add5c5da64a93e2a85e237f83771d8b81022fb0..c01af575183406afbea9f53920d318a925ccf664 100644 --- a/Examples/CMakeLists.txt +++ b/Examples/CMakeLists.txt @@ -17,8 +17,8 @@ file( # Adding the project sources dir as an include dir INCLUDE_DIRECTORIES( - ${CMAKE_BINARY_DIR}/Src - ${CMAKE_SOURCE_DIR}/Src + ${SCALFMM_BINARY_DIR}/Src + ${SCALFMM_SOURCE_DIR}/Src ${SCALFMM_INCLUDES} ) diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt index b71293cd8afe9c0ec1b69fcd4f36dcd4c195c5fc..531c3f2d90f239eaffd4cfd1ff05e96b21f43987 100644 --- a/Src/CMakeLists.txt +++ b/Src/CMakeLists.txt @@ -109,5 +109,5 @@ FOREACH(my_dir ${my_include_dirs}) INSTALL( FILES ${hpp_in_dir} DESTINATION include/${my_dir} ) ENDFOREACH() -INSTALL( FILES "${CMAKE_BINARY_DIR}/Src/ScalFmmConfig.h" DESTINATION include/Utils/${my_dir} ) +INSTALL( FILES "${SCALFMM_BINARY_DIR}/Src/ScalFmmConfig.h" DESTINATION include/Utils/${my_dir} ) diff --git a/Src/Containers/FMpiBufferReader.hpp b/Src/Containers/FMpiBufferReader.hpp index 95d9ac2e8387972b37b2c3b90d457f53dcae5896..98f351b2a1190855c87b060336aad67dc2a4d74d 100644 --- a/Src/Containers/FMpiBufferReader.hpp +++ b/Src/Containers/FMpiBufferReader.hpp @@ -21,34 +21,27 @@ #include "FAbstractBuffer.hpp" #include "../Utils/FAssert.hpp" -/** 
@author Cyrille Piacibello - * This class provide the same features as FBufferWriter using MPI_Pack system +/** @author Cyrille Piacibello, Berenger Bramas + * This class provide the same features as FBufferWriter * * Put some data * then insert back if needed * finally use data pointer as you like */ class FMpiBufferReader : public FAbstractBufferReader { - MPI_Comm comm; //< Communicator needed by MPI_Pack functions FSize arrayCapacity; //< Allocated space std::unique_ptr<char[]> array; //< Allocated Array FSize currentIndex; public : /*Constructor with a default arrayCapacity of 512 bytes */ - explicit FMpiBufferReader(const MPI_Comm inComm = MPI_COMM_WORLD, const FSize inDefaultCapacity = 512): - comm(inComm), + explicit FMpiBufferReader(const FSize inDefaultCapacity = 512): arrayCapacity(inDefaultCapacity), array(new char[inDefaultCapacity]), currentIndex(0){ FAssertLF(array, "Cannot allocate array"); } - /** Change the comm (or to set it later) */ - void setComm(const MPI_Comm inComm){ - comm = inComm; - } - /** To change the capacity (but reset the head to 0) */ void cleanAndResize(const FSize newCapacity){ if(newCapacity != arrayCapacity){ @@ -97,50 +90,34 @@ public : /** Get a value with memory cast */ template <class ClassType> ClassType getValue(){ - FAssertLF(arrayCapacity < std::numeric_limits<int>::max()); - FAssertLF(currentIndex < std::numeric_limits<int>::max()); - int previousIndex = int(currentIndex); + FAssertLF(currentIndex + FSize(sizeof(ClassType)) <= arrayCapacity ); ClassType value; - FMpi::Assert(MPI_Unpack(array.get(),int(arrayCapacity),&previousIndex,&value,FMpi::GetTypeCount(value),FMpi::GetType(value),comm), __LINE__); - seek(FSize(sizeof(value) + currentIndex)); - FAssertLF(previousIndex == currentIndex); + memcpy(&value, &array[currentIndex], sizeof(ClassType)); + currentIndex += sizeof(ClassType); return value; } /** Get a value with memory cast at a specified index */ template <class ClassType> ClassType getValue(const FSize ind){ 
- ClassType value; - FAssertLF(arrayCapacity < std::numeric_limits<int>::max()); - FAssertLF(ind < std::numeric_limits<int>::max()); - int previousIndex = int(ind); - FMpi::Assert(MPI_Unpack(array.get(),int(arrayCapacity),&previousIndex,&value,FMpi::GetTypeCount(value),FMpi::GetType(value),comm), __LINE__); - seek(FSize(sizeof(value)+ind)); - FAssertLF(previousIndex == currentIndex); - return value; + currentIndex = ind; + return getValue<ClassType>(); } /** Fill a value with memory cast */ template <class ClassType> void fillValue(ClassType* const inValue){ - FAssertLF(arrayCapacity < std::numeric_limits<int>::max()); - FAssertLF(currentIndex < std::numeric_limits<int>::max()); - int previousIndex = int(currentIndex); - FMpi::Assert(MPI_Unpack(array.get(),int(arrayCapacity),&previousIndex,inValue,FMpi::GetTypeCount(*inValue),FMpi::GetType(*inValue),comm), __LINE__); - seek(FSize(sizeof(ClassType) + currentIndex)); - FAssertLF(previousIndex == currentIndex); + FAssertLF(currentIndex + FSize(sizeof(ClassType)) <= arrayCapacity ); + memcpy(inValue, &array[currentIndex], sizeof(ClassType)); + currentIndex += sizeof(ClassType); } /** Fill one/many value(s) with memcpy */ template <class ClassType> void fillArray(ClassType* const inArray, const FSize inSize){ - FAssertLF(arrayCapacity < std::numeric_limits<int>::max()); - FAssertLF(currentIndex < std::numeric_limits<int>::max()); - FAssertLF(inSize < std::numeric_limits<int>::max()); - int previousIndex = int(currentIndex); - FMpi::Assert(MPI_Unpack(array.get(),int(arrayCapacity),&previousIndex,inArray,int(inSize)*FMpi::GetTypeCount(*inArray),FMpi::GetType(*inArray),comm), __LINE__); - seek(FSize(sizeof(ClassType) * inSize + currentIndex)); - FAssertLF(previousIndex == currentIndex); + FAssertLF(currentIndex + FSize(sizeof(ClassType))*inSize <= arrayCapacity ); + memcpy(inArray, &array[currentIndex], sizeof(ClassType)*inSize); + currentIndex += sizeof(ClassType)*inSize; } /** Same as fillValue */ diff --git 
a/Src/Containers/FMpiBufferWriter.hpp b/Src/Containers/FMpiBufferWriter.hpp index c2a9942f6532e1821f4d9b62fbefc7d6840d8211..5050d1f3df4f2d1461afc886ac559a37050a1a6d 100644 --- a/Src/Containers/FMpiBufferWriter.hpp +++ b/Src/Containers/FMpiBufferWriter.hpp @@ -21,22 +21,21 @@ #include "FAbstractBuffer.hpp" #include "../Utils/FAssert.hpp" -/** @author Cyrille Piacibello - * This class provide the same features as FBufferWriter using MPI_Pack system +/** @author Cyrille Piacibello, Berenger Bramas + * This class provide the same features as FBufferWriter * * Put some data * then insert back if needed * finally use data pointer as you like */ class FMpiBufferWriter : public FAbstractBufferWriter { - MPI_Comm mpiComm; //< Communicator needed by MPI_Pack functions FSize arrayCapacity; //< Allocated Space std::unique_ptr<char[]> array; //< Allocated Array FSize currentIndex; //< Currently filled space /** Test and exit if not enought space */ - void expandIfNeeded(const size_t requestedSpace) { - if( arrayCapacity < FSize(currentIndex + requestedSpace) ){ + void expandIfNeeded(const FSize requestedSpace) { + if( arrayCapacity < currentIndex + requestedSpace){ arrayCapacity = FSize(double(currentIndex + requestedSpace + 1) * 1.5); char* arrayTmp = new char[arrayCapacity]; memcpy(arrayTmp, array.get(), sizeof(char)*currentIndex); @@ -46,19 +45,13 @@ class FMpiBufferWriter : public FAbstractBufferWriter { public: /** Constructor with a default arrayCapacity of 512 bytes */ - explicit FMpiBufferWriter(const MPI_Comm inComm, const FSize inDefaultCapacity = 1024): - mpiComm(inComm), + explicit FMpiBufferWriter(const FSize inDefaultCapacity = 1024): arrayCapacity(inDefaultCapacity), array(new char[inDefaultCapacity]), currentIndex(0) {} - /** Change the comm (or to set it later) */ - void setComm(const MPI_Comm inComm){ - mpiComm = inComm; - } - /** To change the capacity (but reset the head to 0 if size if lower) */ void resize(const FSize newCapacity){ if(newCapacity != 
arrayCapacity){ @@ -98,10 +91,8 @@ public: template <class ClassType> void write(const ClassType& object){ expandIfNeeded(sizeof(ClassType)); - FAssertLF(currentIndex < std::numeric_limits<int>::max()); - int intCurrentIndex = int(currentIndex); - FMpi::Assert(MPI_Pack(const_cast<ClassType*>(&object), FMpi::GetTypeCount(object), FMpi::GetType(object), array.get(), int(arrayCapacity), &intCurrentIndex, mpiComm), __LINE__); - currentIndex = intCurrentIndex; + memcpy(&array[currentIndex], &object, sizeof(ClassType)); + currentIndex += sizeof(ClassType); } /** @@ -110,20 +101,15 @@ public: template <class ClassType> void write(const ClassType&& object){ expandIfNeeded(sizeof(ClassType)); - FAssertLF(arrayCapacity < std::numeric_limits<int>::max()); - int intCurrentIndex = int(currentIndex); - FMpi::Assert(MPI_Pack(const_cast<ClassType*>(&object), FMpi::GetTypeCount(object), FMpi::GetType(object), array.get(), int(arrayCapacity), &intCurrentIndex, mpiComm), __LINE__); - currentIndex = intCurrentIndex; + memcpy(&array[currentIndex], &object, sizeof(ClassType)); + currentIndex += sizeof(ClassType); } /** Write back, position + sizeof(object) has to be < size */ template <class ClassType> void writeAt(const FSize position, const ClassType& object){ - FAssertLF(FSize(position + sizeof(ClassType)) <= currentIndex); - FAssertLF(arrayCapacity < std::numeric_limits<int>::max()); - FAssertLF(position < std::numeric_limits<int>::max()); - int noConstPosition = int(position); - FMpi::Assert(MPI_Pack(const_cast<ClassType*>(&object), FMpi::GetTypeCount(object), FMpi::GetType(object), array.get(), int(arrayCapacity), &noConstPosition, mpiComm), __LINE__); + FAssertLF(position+FSize(sizeof(ClassType)) <= currentIndex); + memcpy(&array[position], &object, sizeof(ClassType)); } /** Write an array @@ -132,11 +118,8 @@ public: template <class ClassType> void write(const ClassType* const objects, const FSize inSize){ expandIfNeeded(sizeof(ClassType) * inSize); - FAssertLF(arrayCapacity < 
std::numeric_limits<int>::max()); - FAssertLF(inSize < std::numeric_limits<int>::max()); - int intCurrentIndex = int(currentIndex); - FMpi::Assert(MPI_Pack( const_cast<ClassType*>(objects), int(inSize)*FMpi::GetTypeCount(*objects), FMpi::GetType(*objects), array.get(), int(arrayCapacity), &intCurrentIndex, mpiComm), __LINE__); - currentIndex = intCurrentIndex; + memcpy(&array[currentIndex], objects, sizeof(ClassType)*inSize); + currentIndex += sizeof(ClassType)*inSize; } /** Equivalent to write */ diff --git a/Src/Core/FFmmAlgorithmThreadProc.hpp b/Src/Core/FFmmAlgorithmThreadProc.hpp index 14382972d98d0ba9e2910bd0b57be8694f373508..05b4a37bda4415d13bfe136edda8056f1c2874fb 100644 --- a/Src/Core/FFmmAlgorithmThreadProc.hpp +++ b/Src/Core/FFmmAlgorithmThreadProc.hpp @@ -363,6 +363,7 @@ protected: FLOG(computationCounter.tac()); FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = " << counterTime.tacAndElapsed() << " s)\n" ); FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" ); + FLOG( FLog::Controller.flush()); } ///////////////////////////////////////////////////////////////////////////// @@ -400,7 +401,7 @@ protected: MPI_Status statusSize[8]; FSize bufferSize; - FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7 + FMpiBufferWriter sendBuffer(1);// Max = 1 + sizeof(cell)*7 std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]); FSize recvBufferSize[7]; CellClass recvBufferCells[7]; @@ -491,7 +492,7 @@ protected: MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo, FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]); FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max()); - MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, currentProcIdToSendTo, + MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, currentProcIdToSendTo, FMpi::TagFmmM2M + idxLevel, comm.getComm(), 
&requests[iterMpiRequests++]); } } @@ -532,7 +533,7 @@ protected: if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){ recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]); FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max()); - MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_PACKED, + MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_BYTE, idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]); nbProcThatSendToMe += 1; FAssertLF(nbProcThatSendToMe <= 7); @@ -556,7 +557,7 @@ protected: // Retreive data and merge my child and the child from others for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){ - int packageFlags = int(recvBuffer[idxProc].getValue<char>()); + unsigned packageFlags = unsigned(recvBuffer[idxProc].getValue<unsigned char>()); int position = 0; int positionToInsert = 0; @@ -602,6 +603,7 @@ protected: FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" ); FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" ); FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller.flush()); } ///////////////////////////////////////////////////////////////////////////// @@ -754,15 +756,14 @@ protected: FLOG(sendCounter.tic()); // Then they can send and receive (because they know what they will receive) // To send in asynchrone way - MPI_Request*const requests = new MPI_Request[2 * nbProcess * OctreeHeight]; - MPI_Status*const status = new MPI_Status[2 * nbProcess * OctreeHeight]; - int iterRequest = 0; + std::vector<MPI_Request> requests; + requests.reserve(2 * nbProcess * OctreeHeight); for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ 
const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc]; if(toSendAtProcAtLevel != 0){ - sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),int(toSendAtProcAtLevel)); + sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(toSendAtProcAtLevel); sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize())); @@ -776,20 +777,18 @@ protected: FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel); - FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() < std::numeric_limits<int>::max()); - FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(), - int(sendBuffer[idxLevel * nbProcess + idxProc]->getSize()),MPI_PACKED, idxProc, - FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + FMpi::ISendSplit(sendBuffer[idxLevel * nbProcess + idxProc]->data(), + sendBuffer[idxLevel * nbProcess + idxProc]->getSize(), idxProc, + FMpi::TagLast + idxLevel*100, comm, &requests); } const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess]; if(toReceiveFromProcAtLevel){ - recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),int(toReceiveFromProcAtLevel)); + recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(toReceiveFromProcAtLevel); - FAssertLF(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity() < std::numeric_limits<int>::max()); - FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(), - int(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity()), MPI_PACKED,idxProc, - FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + FMpi::IRecvSplit(recvBuffer[idxLevel * nbProcess + idxProc]->data(), + recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), idxProc, + FMpi::TagLast + idxLevel*100, comm, 
&requests); } } } @@ -799,10 +798,7 @@ protected: ////////////////////////////////////////////////////////////////// // Wait to receive every things (and send every things) - FMpi::MpiAssert(MPI_Waitall(iterRequest, requests, status), __LINE__); - - delete[] requests; - delete[] status; + FMpi::MpiAssert(MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE), __LINE__); FLOG(sendCounter.tac()); }//End of Master region @@ -1009,6 +1005,7 @@ protected: FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" ); FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" ); FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller.flush()); } @@ -1039,8 +1036,8 @@ protected: const int heightMinusOne = FAbstractAlgorithm::lowerWorkingLevel - 1; - FMpiBufferWriter sendBuffer(comm.getComm()); - FMpiBufferReader recvBuffer(comm.getComm()); + FMpiBufferWriter sendBuffer; + FMpiBufferReader recvBuffer; int righestProcToSendTo = nbProcess - 1; @@ -1116,7 +1113,7 @@ protected: FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, FMpi::GetType(sendBufferSize), idxProcSend, FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__); FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max()); - FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, idxProcSend, + FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, idxProcSend, FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__); // Inc and check the counter nbMessageSent += 1; @@ -1139,7 +1136,7 @@ protected: if(hasToReceive){ recvBuffer.cleanAndResize(recvBufferSize); FAssertLF(recvBuffer.getCapacity() < std::numeric_limits<int>::max()); - FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_PACKED, idxProcToReceive, + FMpi::MpiAssert( MPI_Irecv( 
recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_BYTE, idxProcToReceive, FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__ ); } @@ -1184,6 +1181,7 @@ protected: FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" ); FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" ); FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller.flush()); } @@ -1213,25 +1211,6 @@ protected: /////////////////////////////////////////////////// FLOG(prepareCounter.tic()); - // To send in asynchrone way - MPI_Request requests[2 * nbProcess]; - MPI_Status status[2 * nbProcess]; - int iterRequest = 0; - int nbMessagesToRecv = 0; - - FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess]; - memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess); - - FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess]; - memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess); - - /* This a nbProcess x nbProcess matrix of integer - * let U and V be id of processes : - * globalReceiveMap[U*nbProcess + V] == size of information needed by V and own by U - */ - FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess]; - memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess); - FBoolArray leafsNeedOther(this->numberOfLeafs); int countNeedOther = 0; @@ -1320,28 +1299,43 @@ protected: #pragma omp master // nowait if(p2pEnabled){ + /* This a nbProcess x nbProcess matrix of integer + * let U and V be id of processes : + * globalReceiveMap[U*nbProcess + V] == size of information needed by V and own by U + */ + FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess]; + memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess); + //Share to all processus globalReceiveMap FLOG(gatherCounter.tic()); FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, FMpi::GetType(*partsToSend), 
globalReceiveMap, nbProcess, FMpi::GetType(*partsToSend), comm.getComm()), __LINE__ ); FLOG(gatherCounter.tac()); + FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess]; + memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess); + + FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess]; + memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess); + + // To send in asynchrone way + std::vector<MPI_Request> requests; + requests.reserve(2 * nbProcess); //Prepare receive for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me. //allocate buffer of right size - recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]); - FAssertLF(recvBuffer[idxProc]->getCapacity() < std::numeric_limits<int>::max()); - FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), int(recvBuffer[idxProc]->getCapacity()), MPI_PACKED, - idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + recvBuffer[idxProc] = new FMpiBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]); + + FMpi::IRecvSplit(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(), + idxProc, FMpi::TagFmmP2P, comm, &requests); } } - nbMessagesToRecv = iterRequest; // Prepare send for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ if(toSend[idxProc].getSize() != 0){ - sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]); + sendBuffer[idxProc] = new FMpiBufferWriter(globalReceiveMap[idProcess*nbProcess+idxProc]); // << is equivalent to write(). 
(*sendBuffer[idxProc]) << toSend[idxProc].getSize(); for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){ @@ -1350,9 +1344,9 @@ protected: } FAssertLF(sendBuffer[idxProc]->getSize() == globalReceiveMap[idProcess*nbProcess+idxProc]); - FAssertLF(sendBuffer[idxProc]->getSize() < std::numeric_limits<int>::max()); - FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), int(sendBuffer[idxProc]->getSize()) , MPI_PACKED , - idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + + FMpi::ISendSplit(sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize(), + idxProc, FMpi::TagFmmP2P, comm, &requests); } } @@ -1364,23 +1358,34 @@ protected: // Waitsend receive ////////////////////////////////////////////////////////// + std::unique_ptr<MPI_Status[]> status(new MPI_Status[requests.size()]); // Wait data FLOG(waitCounter.tic()); - MPI_Waitall(iterRequest, requests, status); + MPI_Waitall(int(requests.size()), requests.data(), status.get()); FLOG(waitCounter.tac()); - for(int idxRcv = 0 ; idxRcv < nbMessagesToRecv ; ++idxRcv){ - const int idxProc = status[idxRcv].MPI_SOURCE; - FSize nbLeaves; - (*recvBuffer[idxProc]) >> nbLeaves; - for(FSize idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){ - MortonIndex leafIndex; - (*recvBuffer[idxProc]) >> leafIndex; - otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore((*recvBuffer[idxProc])); + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ + if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me. 
+ FAssertLF(recvBuffer[idxProc]); + FMpiBufferReader& currentBuffer = (*recvBuffer[idxProc]); + FSize nbLeaves; + currentBuffer >> nbLeaves; + for(FSize idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){ + MortonIndex leafIndex; + currentBuffer >> leafIndex; + otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore(currentBuffer); + } + // Realease memory early + delete recvBuffer[idxProc]; + recvBuffer[idxProc] = nullptr; } + } + + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ + delete sendBuffer[idxProc]; delete recvBuffer[idxProc]; - recvBuffer[idxProc] = nullptr; } + delete[] globalReceiveMap; } /////////////////////////////////////////////////// @@ -1530,11 +1535,6 @@ protected: } } - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - delete sendBuffer[idxProc]; - delete recvBuffer[idxProc]; - } - delete[] globalReceiveMap; delete[] leafsDataArray; FLOG(computation2Counter.tac()); @@ -1546,6 +1546,7 @@ protected: FLOG( FLog::Controller << "\t\t Prepare P2P : " << prepareCounter.elapsed() << " s\n" ); FLOG( FLog::Controller << "\t\t Gather P2P : " << gatherCounter.elapsed() << " s\n" ); FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.elapsed() << " s\n" ); + FLOG( FLog::Controller.flush()); } }; diff --git a/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp b/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp index 947086936ce5902577558b87a8e24b9c2928458b..b39eed9d04cf539e61581a9351397609644d5273 100644 --- a/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp +++ b/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp @@ -404,7 +404,7 @@ protected: MPI_Status statusSize[8]; FSize bufferSize; - FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7 + FMpiBufferWriter sendBuffer(1);// Max = 1 + sizeof(cell)*7 std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]); FSize recvBufferSize[7]; CellClass recvBufferCells[7]; @@ -856,7 +856,7 @@ protected: for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ const long long int 
toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc]; if(toSendAtProcAtLevel != 0){ - sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),int(toSendAtProcAtLevel)); + sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(toSendAtProcAtLevel); sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize())); @@ -878,7 +878,7 @@ protected: const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess]; if(toReceiveFromProcAtLevel){ - recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),int(toReceiveFromProcAtLevel)); + recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(toReceiveFromProcAtLevel); FAssertLF(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity() < std::numeric_limits<int>::max()); FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(), @@ -1126,7 +1126,7 @@ protected: MPI_Status*const statusSize = new MPI_Status[8]; - FMpiBufferWriter sendBuffer(comm.getComm()); - FMpiBufferReader recvBuffer(comm.getComm()); + FMpiBufferWriter sendBuffer; + FMpiBufferReader recvBuffer; int righestProcToSendTo = nbProcess - 1; @@ -1441,7 +1441,7 @@ protected: for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me. 
//allocate buffer of right size - recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]); + recvBuffer[idxProc] = new FMpiBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]); FAssertLF(recvBuffer[idxProc]->getCapacity() < std::numeric_limits<int>::max()); FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), int(recvBuffer[idxProc]->getCapacity()), MPI_PACKED, idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); @@ -1452,7 +1452,7 @@ protected: // Prepare send for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ if(toSend[idxProc].getSize() != 0){ - sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]); + sendBuffer[idxProc] = new FMpiBufferWriter(globalReceiveMap[idProcess*nbProcess+idxProc]); // << is equivalent to write(). (*sendBuffer[idxProc]) << toSend[idxProc].getSize(); for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){ diff --git a/Src/Files/FMpiTreeBuilder.hpp b/Src/Files/FMpiTreeBuilder.hpp index 89a57482e2eb02e01dc5f50b16e8f19258a23785..87b5ecec6def1dee1acf607d7f31a18cee3d4cd0 100644 --- a/Src/Files/FMpiTreeBuilder.hpp +++ b/Src/Files/FMpiTreeBuilder.hpp @@ -20,6 +20,7 @@ #include "../Utils/FQuickSortMpi.hpp" #include "../Utils/FBitonicSort.hpp" #include "../Utils/FTic.hpp" +#include "../Utils/FEnv.hpp" #include "../Utils/FMemUtils.hpp" @@ -41,6 +42,8 @@ template<class FReal, class ParticleClass> class FMpiTreeBuilder{ private: + static const bool VerboseLog; + /** To keep the leaves information after the sort */ struct LeafInfo { MortonIndex mindex; @@ -188,13 +191,15 @@ public: if( (*workingSize) != 0 ){ borderLeavesState[0] = leavesInfo[0]; borderLeavesState[1] = leavesInfo[leavesInfo.getSize()-1]; + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] First " << borderLeavesState[0].mindex << "\n"; FLog::Controller.flush(); ); + 
FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Last " << borderLeavesState[1].mindex << "\n"; FLog::Controller.flush(); ); } std::unique_ptr<LeafInfo[]> allProcFirstLeafStates(new LeafInfo[nbProcs*2]); FMpi::MpiAssert(MPI_Allgather(&borderLeavesState, sizeof(LeafInfo)*2, MPI_BYTE, allProcFirstLeafStates.get(), sizeof(LeafInfo)*2, MPI_BYTE, communicator.getComm()),__LINE__); - FVector<MPI_Request> requests; + std::vector<MPI_Request> requests; // Find what to send/recv from who bool hasSentFirstLeaf = false; @@ -209,10 +214,9 @@ public: // We found someone if(idProcToSendTo != myRank && allProcFirstLeafStates[(idProcToSendTo)*2 + 1].mindex == borderLeavesState[0].mindex){ // Post and send message for the first leaf - requests.push((MPI_Request)0); - FAssertLF(borderLeavesState[0].nbParts < std::numeric_limits<int>::max()); - FMpi::MpiAssert(MPI_Isend(&workingArray[0], int(borderLeavesState[0].nbParts), MPI_BYTE, idProcToSendTo, - FMpi::TagExchangeIndexs, communicator.getComm(), &requests[0]),__LINE__); + FMpi::ISendSplit(&workingArray[0], borderLeavesState[0].nbParts, idProcToSendTo, + FMpi::TagExchangeIndexs, communicator, &requests); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] send " << borderLeavesState[0].nbParts << " to " << idProcToSendTo << "\n"; FLog::Controller.flush(); ); hasSentFirstLeaf = true; } } @@ -239,10 +243,9 @@ public: for(int postRecvIdx = (myRank+1); postRecvIdx <= idProcToRecvFrom ; ++postRecvIdx){ // If there are some on this proc if(allProcFirstLeafStates[(postRecvIdx)*2].mindex != noDataFlag){ - requests.push((MPI_Request)0); - FAssertLF(allProcFirstLeafStates[(postRecvIdx)*2].nbParts < std::numeric_limits<int>::max()); - FMpi::MpiAssert(MPI_Irecv(&receivedParticles[postPositionRecv], int(allProcFirstLeafStates[(postRecvIdx)*2].nbParts), MPI_BYTE, postRecvIdx, - FMpi::TagExchangeIndexs, communicator.getComm(), &requests[0]),__LINE__); + 
FMpi::IRecvSplit(&receivedParticles[postPositionRecv], allProcFirstLeafStates[(postRecvIdx)*2].nbParts, postRecvIdx, + FMpi::TagExchangeIndexs, communicator, &requests); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] recv " << allProcFirstLeafStates[(postRecvIdx)*2].nbParts << " from " << postRecvIdx << "\n"; FLog::Controller.flush(); ); // Inc the write position postPositionRecv += allProcFirstLeafStates[(postRecvIdx)*2].nbParts; } @@ -252,7 +255,7 @@ public: } // Finalize communication - FMpi::MpiAssert(MPI_Waitall(int(requests.getSize()), requests.data(), MPI_STATUSES_IGNORE),__LINE__); + FMpi::MpiAssert(MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE),__LINE__); // IF we sent we need to remove the first leaf if(hasSentFirstLeaf){ @@ -282,6 +285,7 @@ public: delete[] workingArray; workingArray = particlesWithExtension; (*workingSize) = finalParticlesNumber; + leavesInfo[leavesInfo.getSize()-1].nbParts += receivedParticles.size(); } } {//Filling the Array with leaves and parts //// COULD BE MOVED IN AN OTHER FUCTION @@ -319,8 +323,6 @@ public: const FSize leavesOffsetInParticles[], const ParticleClass particlesArrayInLeafOrder[], const FSize currentNbLeaves, const FSize currentNbParts, FAbstractBalanceAlgorithm * balancer){ - const FSize MAX_BYTE_PER_MPI_MESS = 2000000000; - const FSize MAX_PARTICLES_PER_MPI_MESS = FMath::Max(FSize(1), FSize(MAX_BYTE_PER_MPI_MESS/sizeof(ParticleClass))); const int myRank = communicator.processId(); const int nbProcs = communicator.processCount(); @@ -336,7 +338,10 @@ public: FMpi::MpiAssert(MPI_Allgather(const_cast<FSize*>(&currentNbLeaves), 1, MPI_LONG_LONG_INT, numberOfLeavesPerProc.get(), 1, MPI_LONG_LONG_INT, communicator.getComm()), __LINE__); - //prefix sum + + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Exchange number of leaves\n"; FLog::Controller.flush(); ); + + // prefix sum std::unique_ptr<FSize[]> 
diffNumberOfLeavesPerProc(new FSize[nbProcs+1]); diffNumberOfLeavesPerProc[0] = 0; for(int idxProc = 0 ; idxProc < nbProcs ; ++idxProc ){ @@ -350,39 +355,53 @@ public: for(int idxProc = 0 ; idxProc < nbProcs ; ++idxProc){ allObjectives[idxProc].first = balancer->getLeft(totalNumberOfLeavesInSimulation,nbProcs,idxProc); allObjectives[idxProc].second = balancer->getRight(totalNumberOfLeavesInSimulation,nbProcs,idxProc); + if(idxProc != 0) FAssertLF(allObjectives[idxProc].first == allObjectives[idxProc-1].second); } // Ask for the pack to send std::pair<size_t, size_t> myCurrentInter = {diffNumberOfLeavesPerProc[myRank], diffNumberOfLeavesPerProc[myRank+1]}; const std::vector<FEqualize::Package> packsToSend = FEqualize::GetPackToSend(myCurrentInter, allObjectives); - std::unique_ptr<FSize[]> nbPartsPerPackToSend(new FSize[packsToSend.size()]); + + FAssertLF((currentNbLeaves == 0 && packsToSend.size() == 0) || + (packsToSend.size() && FSize(packsToSend[packsToSend.size()-1].elementTo) == currentNbLeaves)); + + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Get my interval (" << packsToSend.size() << ")\n"; FLog::Controller.flush(); ); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Send data\n"; FLog::Controller.flush(); ); + // Store the requests - std::vector<MPI_Request> requestsParts; std::vector<MPI_Request> requestsNbParts; + requestsNbParts.reserve(packsToSend.size()); + // Send every thing except for me or if size == 0 + FSize totalSend = 0; + FSize sendToMe = 0; for(unsigned int idxPack = 0; idxPack< packsToSend.size() ; ++idxPack){ const FEqualize::Package& pack = packsToSend[idxPack]; + + if(idxPack != 0) FAssertLF(packsToSend[idxPack].elementFrom == packsToSend[idxPack-1].elementTo); + const long long int nbPartsPerPackToSend = leavesOffsetInParticles[pack.elementTo]-leavesOffsetInParticles[pack.elementFrom]; + totalSend += nbPartsPerPackToSend; + if(pack.idProc != 
myRank && 0 < (pack.elementTo-pack.elementFrom)){ // If not to me and if there is something to send - nbPartsPerPackToSend[idxPack] = leavesOffsetInParticles[pack.elementTo]-leavesOffsetInParticles[pack.elementFrom]; + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] pre-send to " << pack.idProc << " nb " << nbPartsPerPackToSend + << " from " << pack.elementFrom << " to " << pack.elementTo << " \n"; FLog::Controller.flush(); ); // Send the size of the data requestsNbParts.emplace_back(); - FMpi::MpiAssert(MPI_Isend(&nbPartsPerPackToSend[idxPack],1,MPI_LONG_LONG_INT,pack.idProc, - FMpi::TagExchangeIndexs+1, communicator.getComm(), &requestsNbParts.back()),__LINE__); - // Send the data - for(FSize idxMess = 0 ; idxMess < nbPartsPerPackToSend[idxPack]; idxMess += MAX_PARTICLES_PER_MPI_MESS){ - const int nbElementsInMessage = int(FMath::Min(nbPartsPerPackToSend[idxPack]-idxMess, MAX_PARTICLES_PER_MPI_MESS)); - requestsParts.emplace_back(); - FMpi::MpiAssert(MPI_Isend(const_cast<ParticleClass*>(&particlesArrayInLeafOrder[leavesOffsetInParticles[pack.elementFrom]+idxMess]), - int(sizeof(ParticleClass)*nbElementsInMessage), - MPI_BYTE, pack.idProc, int(FMpi::TagExchangeIndexs + 2 + idxMess), communicator.getComm(), &requestsParts.back()), __LINE__); - } + FMpi::MpiAssert(MPI_Isend(&nbPartsPerPackToSend,1,MPI_LONG_LONG_INT,pack.idProc, + FMpi::TagExchangeIndexs, communicator.getComm(), &requestsNbParts.back()),__LINE__); + } else { - // Nothing to send - nbPartsPerPackToSend[idxPack] = 0; + sendToMe = nbPartsPerPackToSend; + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] skip " << idxPack + << " from " << pack.elementFrom << " to " << pack.elementTo << " \n"; FLog::Controller.flush(); ); } } + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Send done \n"; FLog::Controller.flush(); ); + // Ensure everything has been proceed + 
FAssertLF(totalSend == currentNbParts); + // Compute the current intervals std::vector< std::pair<size_t,size_t> > allCurrentIntervals; allCurrentIntervals.resize(nbProcs); @@ -391,18 +410,24 @@ public: allCurrentIntervals[idxProc].second = diffNumberOfLeavesPerProc[idxProc+1]; } // Ask the packs to receive to fill my objective + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Get my receive interval \n"; FLog::Controller.flush(); ); std::pair<size_t, size_t> myObjective = allObjectives[myRank]; - const std::vector<FEqualize::Package> packsToRecv = FEqualize::GetPackToRecv(myObjective, allCurrentIntervals); + const std::vector<FEqualize::Package> packsToRecv = FEqualize::GetPackToRecv(myObjective, allCurrentIntervals); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] recv nb particles \n"; FLog::Controller.flush(); ); // Count the number of parts to receive std::unique_ptr<FSize[]> nbPartsPerPackToRecv(new FSize[packsToRecv.size()]); for(unsigned int idxPack = 0; idxPack < packsToRecv.size(); ++idxPack){ const FEqualize::Package& pack = packsToRecv[idxPack]; + + if(idxPack != 0) FAssertLF(packsToRecv[idxPack].elementFrom == packsToRecv[idxPack-1].elementTo); + if(pack.idProc != myRank && 0 < (pack.elementTo-pack.elementFrom)){ + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] pre-recv from " << pack.idProc << " \n"; FLog::Controller.flush(); ); // We need to know how much particles to receive requestsNbParts.emplace_back(); FMpi::MpiAssert(MPI_Irecv(&nbPartsPerPackToRecv[idxPack], 1, MPI_LONG_LONG_INT, pack.idProc, - FMpi::TagExchangeIndexs+1, communicator.getComm(), &requestsNbParts.back()), __LINE__); + FMpi::TagExchangeIndexs, communicator.getComm(), &requestsNbParts.back()), __LINE__); } else{ if(pack.idProc == myRank){ @@ -410,6 +435,9 @@ public: const FSize sourcePosition = FMath::Max(myObjective.first, myCurrentInter.first) - 
myCurrentInter.first; const FSize nbLeavesToCopy = pack.elementTo-pack.elementFrom; nbPartsPerPackToRecv[idxPack] = leavesOffsetInParticles[sourcePosition+nbLeavesToCopy] - leavesOffsetInParticles[sourcePosition]; + FAssertLF(nbPartsPerPackToRecv[idxPack] == sendToMe); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] skip recv " << + idxPack << " nb " << nbPartsPerPackToRecv[idxPack] << " \n"; FLog::Controller.flush(); ); } else{ // Nothing to receive from this so avoid communication @@ -418,33 +446,74 @@ public: } } + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Wait \n"; FLog::Controller.flush(); ); + FMpi::MpiAssert(MPI_Waitall(int(requestsNbParts.size()), requestsNbParts.data(), MPI_STATUSES_IGNORE), __LINE__); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Wait Done \n"; FLog::Controller.flush(); ); + + std::vector<MPI_Request> requestsParts; + + for(unsigned int idxPack = 0; idxPack< packsToSend.size() ; ++idxPack){ + const FEqualize::Package& pack = packsToSend[idxPack]; + if(pack.idProc != myRank && 0 < (pack.elementTo-pack.elementFrom)){ + const long long int nbPartsPerPackToSend = leavesOffsetInParticles[pack.elementTo]-leavesOffsetInParticles[pack.elementFrom]; + + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] send to " + << pack.idProc << " nb " << nbPartsPerPackToSend << " \n"; FLog::Controller.flush(); ); + + FMpi::ISendSplit(&particlesArrayInLeafOrder[leavesOffsetInParticles[pack.elementFrom]], + nbPartsPerPackToSend, + pack.idProc, + FMpi::TagExchangeIndexs + 1, + communicator, + &requestsParts); + } + } + + + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] barrier after all send \n"; FLog::Controller.flush(); ); + + //////////////////////////////////////////////////////////////// // Count the number of leaf to receive + 
//////////////////////////////////////////////////////////////// FSize totalPartsToReceive = 0; for(unsigned int idxPack = 0; idxPack < packsToRecv.size(); ++idxPack){ totalPartsToReceive += nbPartsPerPackToRecv[idxPack]; } - std::vector<ParticleClass> particlesRecvBuffer; + std::unique_ptr<ParticleClass[]> particlesRecvBuffer(new ParticleClass[totalPartsToReceive]); + + //////////////////////////////////////////////////////////////// // Post all the receive and copy mine + // it is based on the nbPartsPerPackToRecv array + //////////////////////////////////////////////////////////////// if(totalPartsToReceive){ - particlesRecvBuffer.resize(totalPartsToReceive); FSize offsetToRecv = 0; for(unsigned int idxPack = 0; idxPack < packsToRecv.size(); ++idxPack){ const FEqualize::Package& pack = packsToRecv[idxPack]; + // If it is not from me if(pack.idProc != myRank && 0 < (pack.elementTo-pack.elementFrom)){ - for(FSize idxMess = 0 ; idxMess < nbPartsPerPackToRecv[idxPack]; idxMess += MAX_PARTICLES_PER_MPI_MESS){ - const int nbElementsInMessage = int(FMath::Min(nbPartsPerPackToRecv[idxPack]-idxMess, MAX_PARTICLES_PER_MPI_MESS)); - requestsParts.emplace_back(); - FMpi::MpiAssert( MPI_Irecv(&particlesRecvBuffer[offsetToRecv+idxMess], - int(sizeof(ParticleClass)*nbElementsInMessage), MPI_BYTE, pack.idProc, - int(FMpi::TagExchangeIndexs + 2 + idxMess), communicator.getComm(), &requestsParts.back()), __LINE__); - } + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] recv from " + << pack.idProc << " nb " << nbPartsPerPackToRecv[idxPack] << " from " << pack.elementFrom << "\n"; FLog::Controller.flush(); ); + + // We store from offset, and use nbPartsPerPackToRecv has the number + FMpi::IRecvSplit(&particlesRecvBuffer[offsetToRecv], + nbPartsPerPackToRecv[idxPack], + pack.idProc, + FMpi::TagExchangeIndexs + 1, + communicator, + &requestsParts); + } + // it is from me, just copy else if(pack.idProc == myRank){ + FLOG(if(VerboseLog) 
FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] copy " + << idxPack << " nb " << nbPartsPerPackToRecv[idxPack] << " from " << pack.elementFrom << " \n"; FLog::Controller.flush(); ); // Copy my particles const FSize sourcePosition = FMath::Max(myObjective.first, myCurrentInter.first) - myCurrentInter.first; + // We store from offset, and use nbPartsPerPackToRecv has the number + // The reading position is the offset of the first leaf we own memcpy(&particlesRecvBuffer[offsetToRecv], &particlesArrayInLeafOrder[leavesOffsetInParticles[sourcePosition]], nbPartsPerPackToRecv[idxPack]*sizeof(ParticleClass)); } @@ -452,11 +521,15 @@ public: } } - // Finalize communication - FMpi::MpiAssert(MPI_Waitall(int(requestsParts.size()), requestsParts.data(), MPI_STATUSES_IGNORE), __LINE__); + + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] pre Wait \n"; FLog::Controller.flush(); ); + + FMpi::Assert( MPI_Waitall(int(requestsParts.size()), requestsParts.data(), MPI_STATUSES_IGNORE), __LINE__ ); + + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Wait Done \n"; FLog::Controller.flush(); ); // Insert in the particle saver - for(FSize idPartsToStore = 0 ; idPartsToStore < int(particlesRecvBuffer.size()) ; ++idPartsToStore){ + for(FSize idPartsToStore = 0 ; idPartsToStore < totalPartsToReceive ; ++idPartsToStore){ particlesSaver->push(particlesRecvBuffer[idPartsToStore]); } } @@ -471,7 +544,7 @@ public: const FPoint<FReal>& boxCenter, const FReal boxWidth, const int treeHeight, ContainerClass* particleSaver, FAbstractBalanceAlgorithm* balancer, const SortingType sortingType = QuickSort){ - FLOG( FLog::Controller << "Particles Distribution: " << "Enter DistributeArrayToContainer\n" ; FLog::Controller.flush(); ); + FLOG( FLog::Controller << "[" << communicator.processId() << "] Particles Distribution: " << "Enter DistributeArrayToContainer\n" ; FLog::Controller.flush(); ); 
FLOG( FTic timer ); IndexedParticle* sortedParticlesArray = nullptr; @@ -479,9 +552,13 @@ public: // From ParticleClass get array of IndexedParticle sorted GetSortedParticlesFromArray(communicator, originalParticlesArray, originalNbParticles, sortingType, boxCenter, boxWidth, treeHeight, &sortedParticlesArray, &nbParticlesInArray); - FLOG( FLog::Controller << "Particles Distribution: " << "\t GetSortedParticlesFromArray is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); ); + FLOG( FLog::Controller << "[" << communicator.processId() << "] Particles Distribution: " << "\t GetSortedParticlesFromArray is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); ); FLOG( timer.tic() ); +// for(int idx = 0 ; idx < nbParticlesInArray ; ++idx){ +// particleSaver->push(sortedParticlesArray[idx].particle); +// } + ParticleClass* particlesArrayInLeafOrder = nullptr; FSize * leavesOffsetInParticles = nullptr; FSize nbLeaves = 0; @@ -489,7 +566,11 @@ public: MergeSplitedLeaves(communicator, sortedParticlesArray, &nbParticlesInArray, &leavesOffsetInParticles, &particlesArrayInLeafOrder, &nbLeaves); delete[] sortedParticlesArray; - FLOG( FLog::Controller << "Particles Distribution: " << "\t MergeSplitedLeaves is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); ); +// for(int idx = 0 ; idx < nbParticlesInArray ; ++idx){ +// particleSaver->push(particlesArrayInLeafOrder[idx]); +// } + + FLOG( FLog::Controller << "[" << communicator.processId() << "] Particles Distribution: " << "\t MergeSplitedLeaves is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); ); FLOG( timer.tic() ); // Equalize and balance @@ -498,9 +579,9 @@ public: delete[] particlesArrayInLeafOrder; delete[] leavesOffsetInParticles; - FLOG( FLog::Controller << "Particles Distribution: " << "\t EqualizeAndFillContainer is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); ); + FLOG( FLog::Controller << "[" << 
communicator.processId() << "] Particles Distribution: " << "\t EqualizeAndFillContainer is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); ); - FLOG( FLog::Controller << "Particles Distribution: " << "\t DistributeArrayToContainer is over (" << timer.cumulated() << "s)\n"; FLog::Controller.flush(); ); + FLOG( FLog::Controller << "[" << communicator.processId() << "] Particles Distribution: " << "\t DistributeArrayToContainer is over (" << timer.cumulated() << "s)\n"; FLog::Controller.flush(); ); #ifdef SCALFMM_USE_LOG /** To produce stats after the Equalize phase */ @@ -539,4 +620,10 @@ public: }; + +#ifdef SCALFMM_USE_LOG +template<class FReal, class ParticleClass> +const bool FMpiTreeBuilder<FReal,ParticleClass>::VerboseLog = FEnv::GetBool("SCALFMM_DEBUG_LOG", false); +#endif + #endif // FMPITREEBUILDER_H diff --git a/Src/GroupTree/Core/FGroupOfParticles.hpp b/Src/GroupTree/Core/FGroupOfParticles.hpp index 93478fc5ab679e466e6edfd4e8b8911a21631709..4225095238331ac3e9266d3aea9632163c061ff8 100644 --- a/Src/GroupTree/Core/FGroupOfParticles.hpp +++ b/Src/GroupTree/Core/FGroupOfParticles.hpp @@ -68,8 +68,6 @@ protected: BlockHeader* blockHeader; //< Pointer to leaves information LeafHeader* leafHeader; - //< The total number of particles in the group - const FSize nbParticlesInGroup; //< Pointers to particle position x, y, z FReal* particlePosition[3]; @@ -92,7 +90,7 @@ public: FGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte, unsigned char* inAttributes) : allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer), - blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0), + blockHeader(nullptr), leafHeader(nullptr), attributesBuffer(nullptr), deleteBuffer(false){ // Move the pointers to the correct position blockHeader = reinterpret_cast<BlockHeader*>(inBuffer); @@ -127,12 +125,12 @@ public: * @param inNumberOfLeaves total number of leaves in the interval (should be <= 
inEndingIndex-inEndingIndex) */ FGroupOfParticles(const MortonIndex inStartingIndex, const MortonIndex inEndingIndex, const int inNumberOfLeaves, const FSize inNbParticles) - : allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(inNbParticles), + : allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr), deleteBuffer(true){ memset(particlePosition, 0, sizeof(particlePosition)); memset(particleAttributes, 0, sizeof(particleAttributes)); - const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(nbParticlesInGroup+(MemoryAlignementParticles-1)*inNumberOfLeaves); + const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(inNbParticles+(MemoryAlignementParticles-1)*inNumberOfLeaves); // Find the number of leaf to allocate in the blocks FAssertLF((inEndingIndex-inStartingIndex) >= MortonIndex(inNumberOfLeaves)); @@ -161,6 +159,7 @@ public: blockHeader->endingIndex = inEndingIndex; blockHeader->numberOfLeavesInBlock = inNumberOfLeaves; blockHeader->nbParticlesAllocatedInGroup = nbParticlesAllocatedInGroup; + blockHeader->nbParticlesInGroup = inNbParticles; // Init particle pointers blockHeader->positionsLeadingDim = (sizeof(FReal) * nbParticlesAllocatedInGroup); @@ -247,7 +246,7 @@ public: /** Get the total number of particles in the group */ FSize getNbParticlesInGroup() const { - return nbParticlesInGroup; + return blockHeader->nbParticlesInGroup; } /** The size of the interval endingIndex-startingIndex (set from the constructor) */ diff --git a/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp index 04b6433c9892c29e06e6dfaa71b7842926db7a36..163585bdd98f48ac0431cb2d342d45810a81f108 100644 --- a/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp @@ -15,7 +15,6 @@ #include "FOutOfBlockInteraction.hpp" -#include <vector> #include <vector> #include <omp.h> @@ -37,6 +36,12 @@ 
#define priority_if_supported(x) #endif +#undef taskname_if_supported +#ifdef OPENMP_SUPPORT_TASK_NAME +#define taskname_if_supported(n) taskname(n) +#else +#define taskname_if_supported(n) +#endif template <class OctreeClass, class CellContainerClass, class CellClass, @@ -372,7 +377,7 @@ protected: ParticleGroupClass* containers = tree->getParticleGroup(idxGroup); - #pragma omp task default(shared) firstprivate(leafCells, cellPoles, containers) depend(inout: cellPoles[0]) priority_if_supported(priorities.getInsertionPosP2M()) + #pragma omp task default(shared) firstprivate(leafCells, cellPoles, containers) depend(inout: cellPoles[0]) priority_if_supported(priorities.getInsertionPosP2M()) taskname_if_supported("P2M") { FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, leafCells->getStartingIndex() * 20 * 8, "P2M")); KernelClass*const kernel = kernels[omp_get_thread_num()]; @@ -417,7 +422,7 @@ protected: subCellGroup = (*iterChildCells); subCellGroupPoles = (*iterChildCells)->getRawMultipoleBuffer(); - #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellPoles, subCellGroup, subCellGroupPoles) depend(commute_if_supported: cellPoles[0]) depend(in: subCellGroupPoles[0]) priority_if_supported(priorities.getInsertionPosM2M(idxLevel)) + #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellPoles, subCellGroup, subCellGroupPoles) depend(commute_if_supported: cellPoles[0]) depend(in: subCellGroupPoles[0]) priority_if_supported(priorities.getInsertionPosM2M(idxLevel)) taskname_if_supported("M2M") { KernelClass*const kernel = kernels[omp_get_thread_num()]; const MortonIndex firstParent = FMath::Max(currentCells->getStartingIndex(), subCellGroup->getStartingIndex()>>3); @@ -492,7 +497,7 @@ protected: PoleCellClass* cellPoles = currentCells->getRawMultipoleBuffer(); LocalCellClass* cellLocals = currentCells->getRawLocalBuffer(); -#pragma omp task default(none) firstprivate(currentCells, cellPoles, 
cellLocals, idxLevel) depend(commute_if_supported: cellLocals[0]) depend(in: cellPoles[0]) priority_if_supported(priorities.getInsertionPosM2L(idxLevel)) +#pragma omp task default(none) firstprivate(currentCells, cellPoles, cellLocals, idxLevel) depend(commute_if_supported: cellLocals[0]) depend(in: cellPoles[0]) priority_if_supported(priorities.getInsertionPosM2L(idxLevel)) taskname_if_supported("M2L") { FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, ((currentCells->getStartingIndex() *20) + idxLevel ) * 8 + 2, "M2L")); const MortonIndex blockStartIdx = currentCells->getStartingIndex(); @@ -555,7 +560,7 @@ protected: LocalCellClass* cellOtherLocals = cellsOther->getRawLocalBuffer(); const std::vector<OutOfBlockInteraction>* outsideInteractions = &(*currentInteractions).interactions; - #pragma omp task default(none) firstprivate(currentCells, cellLocals, outsideInteractions, cellsOther, cellOtherPoles, idxLevel) depend(commute_if_supported: cellLocals[0]) depend(in: cellOtherPoles[0]) priority_if_supported(priorities.getInsertionPosM2LExtern(idxLevel)) + #pragma omp task default(none) firstprivate(currentCells, cellLocals, outsideInteractions, cellsOther, cellOtherPoles, idxLevel) depend(commute_if_supported: cellLocals[0]) depend(in: cellOtherPoles[0]) priority_if_supported(priorities.getInsertionPosM2LExtern(idxLevel)) taskname_if_supported("M2L-out") { FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, (((currentCells->getStartingIndex()+1) * (cellsOther->getStartingIndex()+2)) * 20 + idxLevel) * 8 + 3, "M2L-ext")); KernelClass*const kernel = kernels[omp_get_thread_num()]; @@ -571,7 +576,7 @@ protected: } } - #pragma omp task default(none) firstprivate(currentCells, cellPoles, outsideInteractions, cellsOther, cellOtherLocals, idxLevel) depend(commute_if_supported: cellOtherLocals[0]) depend(in: cellPoles[0]) priority_if_supported(priorities.getInsertionPosM2LExtern(idxLevel)) + #pragma omp 
task default(none) firstprivate(currentCells, cellPoles, outsideInteractions, cellsOther, cellOtherLocals, idxLevel) depend(commute_if_supported: cellOtherLocals[0]) depend(in: cellPoles[0]) priority_if_supported(priorities.getInsertionPosM2LExtern(idxLevel)) taskname_if_supported("M2L-out") { FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, (((currentCells->getStartingIndex()+1) * (cellsOther->getStartingIndex()+1)) * 20 + idxLevel) * 8 + 3, "M2L-ext")); KernelClass*const kernel = kernels[omp_get_thread_num()]; @@ -631,7 +636,7 @@ protected: subCellLocalGroupsLocal = (*iterChildCells)->getRawLocalBuffer(); if(noCommuteAtLastLevel == false || idxLevel != FAbstractAlgorithm::lowerWorkingLevel - 2){ - #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(commute_if_supported: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0]) priority_if_supported(priorities.getInsertionPosL2L(idxLevel)) + #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(commute_if_supported: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0]) priority_if_supported(priorities.getInsertionPosL2L(idxLevel)) taskname_if_supported("L2L") { KernelClass*const kernel = kernels[omp_get_thread_num()]; @@ -674,7 +679,7 @@ protected: } } else{ - #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(inout: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0]) priority_if_supported(priorities.getInsertionPosL2L(idxLevel)) + #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(inout: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0]) priority_if_supported(priorities.getInsertionPosL2L(idxLevel)) taskname_if_supported("L2L") { KernelClass*const kernel = kernels[omp_get_thread_num()]; 
@@ -760,7 +765,7 @@ protected: unsigned char* containersOtherDown = containersOther->getRawAttributesBuffer(); const std::vector<OutOfBlockInteraction>* outsideInteractions = &(*currentInteractions).interactions; -#pragma omp task default(none) firstprivate(containers, containersDown, containersOther, containersOtherDown, outsideInteractions) depend(commute_if_supported: containersOtherDown[0], containersDown[0]) priority_if_supported(priorities.getInsertionPosP2PExtern()) +#pragma omp task default(none) firstprivate(containers, containersDown, containersOther, containersOtherDown, outsideInteractions) depend(commute_if_supported: containersOtherDown[0], containersDown[0]) priority_if_supported(priorities.getInsertionPosP2PExtern()) taskname_if_supported("P2P-out") { FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, ((containersOther->getStartingIndex()+1) * (containers->getStartingIndex()+1))*20*8 + 6, "P2P-ext")); KernelClass*const kernel = kernels[omp_get_thread_num()]; @@ -798,7 +803,7 @@ protected: ParticleGroupClass* containers = (*iterParticles); unsigned char* containersDown = containers->getRawAttributesBuffer(); - #pragma omp task default(none) firstprivate(containers, containersDown) depend(commute_if_supported: containersDown[0]) priority_if_supported(priorities.getInsertionPosP2P()) + #pragma omp task default(none) firstprivate(containers, containersDown) depend(commute_if_supported: containersDown[0]) priority_if_supported(priorities.getInsertionPosP2P()) taskname_if_supported("P2P") { FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, containers->getStartingIndex()*20*8 + 5, "P2P")); const MortonIndex blockStartIdx = containers->getStartingIndex(); @@ -853,7 +858,7 @@ protected: ParticleGroupClass* containers = tree->getParticleGroup(idxGroup); unsigned char* containersDown = containers->getRawAttributesBuffer(); - #pragma omp task default(shared) firstprivate(leafCells, cellLocals, 
containers, containersDown) depend(commute_if_supported: containersDown[0]) depend(in: cellLocals[0]) priority_if_supported(priorities.getInsertionPosL2P()) + #pragma omp task default(shared) firstprivate(leafCells, cellLocals, containers, containersDown) depend(commute_if_supported: containersDown[0]) depend(in: cellLocals[0]) priority_if_supported(priorities.getInsertionPosL2P()) taskname_if_supported("L2P") { FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, (leafCells->getStartingIndex()*20*8) + 7, "L2P")); KernelClass*const kernel = kernels[omp_get_thread_num()]; diff --git a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp index eba5a5cd192f366a62416b36e8132d0252d433fc..562e3789b23b31ec0cca5ed1df13a9e21d380293 100644 --- a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp @@ -250,6 +250,27 @@ public: #endif } + void syncData(){ + for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){ + for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){ + starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].symb); + starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].up); + starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].down); + } + } + { + for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){ + starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R); + starpu_data_release(particleHandles[idxHandle].symb); + starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R); + starpu_data_release(particleHandles[idxHandle].down); + } + } + } + ~FGroupTaskStarPUAlgorithm(){ starpu_resume(); @@ -338,6 +359,11 @@ protected: FLOG( FLog::Controller << 
"\t\t Submitting the tasks took " << timerSoumission.tacAndElapsed() << "s\n" ); starpu_task_wait_for_all(); + + FLOG( FTic timerSync; ); + syncData(); + FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" ); + starpu_pause(); #ifdef STARPU_USE_CPU diff --git a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp index fb44ebdc6d04a9c15e2da2b631fdc8563613803d..2dc3e070d1c71216829fbb57a04a5887e4762eec 100644 --- a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp @@ -265,6 +265,27 @@ public: #endif } + void syncData(){ + for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){ + for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){ + starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].symb); + starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].up); + starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].down); + } + } + { + for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){ + starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R); + starpu_data_release(particleHandles[idxHandle].symb); + starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R); + starpu_data_release(particleHandles[idxHandle].down); + } + } + } + ~FGroupTaskStarPUMpiAlgorithm(){ starpu_resume(); @@ -324,6 +345,7 @@ public: } protected: + /** * Runs the complete algorithm. 
*/ @@ -362,6 +384,11 @@ protected: #endif starpu_task_wait_for_all(); + + FLOG( FTic timerSync; ); + syncData(); + FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" ); + starpu_pause(); #ifdef STARPU_USE_CPU diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu index cd1980ca2599ea422155b335674bc33e093c652b..9526174d8dada5fe40a04af7960654c15978908e 100644 --- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu +++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu @@ -65,70 +65,48 @@ __host__ void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __global__ void FCuda__upwardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel){ + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, CudaKernelClass* kernel){ CellContainerClass currentCells(currentCellsPtr, currentCellsSize,currentCellsUpPtr,nullptr); - CellContainerClass subCellGroups[9]; - for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){ - subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx], subCellGroupsUpPtr.values[idx], nullptr); - } - - const int firstCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), blockIdx.x*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x)); - const int lastCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), (blockIdx.x+1)*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x)); + CellContainerClass subCellGroup(childCellsPtr, 
childCellsSize,childCellsUpPtr,nullptr); - if(firstCell == currentCells.getNumberOfCellsInBlock()){ - return ; - } + const MortonIndex firstParent = FCudaMax(currentCells.getStartingIndex(), subCellGroup.getStartingIndex()>>3); + const MortonIndex lastParent = FCudaMin(currentCells.getEndingIndex()-1, (subCellGroup.getEndingIndex()-1)>>3); - FCudaAssertLF(nbSubCellGroups != 0); - int idxSubCellGroup = 0; - int idxChildCell = 0; - {// Find first child - const MortonIndex mindex = currentCells.getCellMortonIndex(firstCell); - while(idxSubCellGroup != nbSubCellGroups - && (mindex < (subCellGroups[idxSubCellGroup].getStartingIndex()>>3))){ - idxSubCellGroup += 1; - } - FCudaAssertLF(idxSubCellGroup != nbSubCellGroups); - idxChildCell = subCellGroups[idxSubCellGroup].getFistChildIdx(currentCells.getCellMortonIndex(0)); - } - FCudaAssertLF(idxChildCell != -1); + int idxParentCell = currentCells.getCellIndex(firstParent); + int idxChildCell = subCellGroup.getFistChildIdx(firstParent); - for(int cellIdx = firstCell ; cellIdx < lastCell ; ++cellIdx){ - typename CellContainerClass::CompleteCellClass cell = currentCells.getUpCell(cellIdx); - FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx)); + while(true){ + typename CellContainerClass::CompleteCellClass cell = currentCells.getUpCell(idxParentCell); typename CellContainerClass::CompleteCellClass child[8]; - FCudaAssertLF(idxSubCellGroup != nbSubCellGroups); for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ child[idxChild].symb = nullptr; } - while(idxSubCellGroup != nbSubCellGroups - && (subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)>>3) == cell.symb->mortonIndex){ - const int idxChild = ((subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)) & 7); - FCudaAssertLF(child[idxChild].symb == nullptr); - child[idxChild] = subCellGroups[idxSubCellGroup].getUpCell(idxChildCell); + do{ + const int idxChild = ((subCellGroup.getCellMortonIndex(idxChildCell)) & 7); + 
child[idxChild] = subCellGroup.getUpCell(idxChildCell); idxChildCell += 1; - if(idxChildCell == subCellGroups[idxSubCellGroup].getNumberOfCellsInBlock()){ - idxChildCell = 0; - idxSubCellGroup += 1; - } - } + }while(idxChildCell != subCellGroup.getNumberOfCellsInBlock() && cell.symb->mortonIndex == (subCellGroup.getCellMortonIndex(idxChildCell)>>3)); kernel->M2M(cell, child, idxLevel); + + if(currentCells.getCellMortonIndex(idxParentCell) == lastParent){ + break; + } + + idxParentCell += 1; } } template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize){ FCuda__upwardPassPerform @@ -136,8 +114,8 @@ __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::siz CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass> <<<inGridSize, inBlocksSize, 0, currentStream>>> (currentCellsPtr, currentCellsSize,currentCellsUpPtr, - subCellGroupsPtr, subCellGroupsSize,subCellGroupsUpPtr, - nbSubCellGroups, idxLevel, kernel); + childCellsPtr, childCellsSize,childCellsUpPtr, + idxLevel, kernel); FCudaCheckAfterCall(); FCudaCheck(cudaStreamSynchronize(currentStream)); } @@ -169,11 +147,7 @@ __global__ void FCuda__transferInoutPassPerformMpi(unsigned char* currentCellsP typename CellContainerClass::CompleteCellClass cell = 
currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock); FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); - typename CellContainerClass::CompleteCellClass interactions[343]; - memset(interactions, 0, 343*sizeof(interactions[0])); - interactions[outsideInteractions[outInterIdx].relativeOutPosition] = interCell; - const int counter = 1; - kernel->M2L( cell , interactions, counter, idxLevel); + kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel); } } } @@ -230,8 +204,7 @@ __global__ void FCuda__transferInPassPerform(unsigned char* currentCellsPtr, st const int3 coord = (FCudaTreeCoordinate::ConvertCoordinate(cell.symb->coordinates)); int counter = FCudaTreeCoordinate::GetInteractionNeighbors(coord, idxLevel,interactionsIndexes,interactionsPosition); - typename CellContainerClass::CompleteCellClass interactions[343]; - memset(interactions, 0, 343*sizeof(interactions[0])); + typename CellContainerClass::CompleteCellClass interactions[189]; int counterExistingCell = 0; for(int idxInter = 0 ; idxInter < counter ; ++idxInter){ @@ -239,15 +212,14 @@ __global__ void FCuda__transferInPassPerform(unsigned char* currentCellsPtr, st const int cellPos = currentCells.getCellIndex(interactionsIndexes[idxInter]); if(cellPos != -1){ typename CellContainerClass::CompleteCellClass interCell = currentCells.getUpCell(cellPos); - FCudaAssertLF(interCell.symb->mortonIndex == interactionsIndexes[idxInter]); - FCudaAssertLF(interactions[interactionsPosition[idxInter]].symb == nullptr); - interactions[interactionsPosition[idxInter]] = interCell; + interactions[counterExistingCell] = interCell; + interactionsPosition[counterExistingCell] = interactionsPosition[idxInter]; counterExistingCell += 1; } } } - kernel->M2L( cell , interactions, counterExistingCell, idxLevel); + kernel->M2L( cell , interactions, interactionsPosition, counterExistingCell, idxLevel); } } @@ -272,36 +244,37 @@ 
__host__ void FCuda__transferInPassCallback(unsigned char* currentCellsPtr, std: template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, + unsigned char* currentCellsDownPtr, unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, + unsigned char* externalCellsUpPtr, + int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions, CudaKernelClass* kernel){ if(blockIdx.x != 0){ return; } - CellContainerClass currentCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr); - CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, externalCellsDownPtr); + CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr); + CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, nullptr); - for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ - const int cellPos = cellsOther.getCellIndex(outsideInteractions[outInterIdx].outIndex); - if(cellPos != -1){ - typename CellContainerClass::CompleteCellClass interCell = cellsOther.getCompleteCell(outsideInteractions[outInterIdx].outIndex); + if(mode == 1){ + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + typename CellContainerClass::CompleteCellClass interCell = cellsOther.getUpCell(outsideInteractions[outInterIdx].outsideIdxInBlock); FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex); - typename 
CellContainerClass::CompleteCellClass cell = currentCells.getCompleteCell(outsideInteractions[outInterIdx].insideIdxInBlock); - FCudaAssertLF(cell.symb); + typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock); FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); - typename CellContainerClass::CompleteCellClass interactions[343]; - memset(interactions, 0, 343*sizeof(interactions[0])); - interactions[outsideInteractions[outInterIdx].relativeOutPosition] = interCell; - const int counter = 1; - kernel->M2L( cell , interactions, counter, idxLevel); + kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel); + } + } + else{ + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + typename CellContainerClass::CompleteCellClass cell = cellsOther.getUpCell(outsideInteractions[outInterIdx].insideIdxInBlock); + FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); + typename CellContainerClass::CompleteCellClass interCell = currentCells.getDownCell(outsideInteractions[outInterIdx].outsideIdxInBlock); + FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex); - interactions[outsideInteractions[outInterIdx].relativeOutPosition].symb = nullptr; - interactions[FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition)] = cell; - kernel->M2L( interCell , interactions, counter, idxLevel); + const int otherPosition = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition); + kernel->M2L( interCell , &cell, &otherPosition, 1, idxLevel); } } } @@ -310,10 +283,10 @@ __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr, template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class 
ParticleGroupClass, class CudaKernelClass> __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, + unsigned char* currentCellsDownPtr, unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, + unsigned char* externalCellsUpPtr, + int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize){ OutOfBlockInteraction* cuOutsideInteractions; @@ -326,10 +299,10 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s <SymboleCellClass, PoleCellClass, LocalCellClass, CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass> <<<inGridSize, inBlocksSize, 0, currentStream>>>(currentCellsPtr, currentCellsSize, - currentCellsUpPtr, currentCellsDownPtr, + currentCellsDownPtr, externalCellsPtr, externalCellsSize, - externalCellsUpPtr, externalCellsDownPtr, - idxLevel, cuOutsideInteractions, + externalCellsUpPtr, + idxLevel, mode, cuOutsideInteractions, nbOutsideInteractions, kernel); FCudaCheckAfterCall(); FCudaCheck(cudaStreamSynchronize(currentStream)); @@ -345,77 +318,56 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __global__ void FCuda__downardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> 
subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel){ - FCudaAssertLF(nbSubCellGroups != 0); - CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr); - CellContainerClass subCellGroups[9]; - for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){ - subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx], nullptr, subCellGroupsDownPtr.values[idx]); - } + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, + int idxLevel, CudaKernelClass* kernel){ + CellContainerClass currentCells(currentCellsPtr, currentCellsSize,nullptr,currentCellsDownPtr); + CellContainerClass subCellGroup(childCellsPtr, childCellsSize,nullptr,childCellsDownPtr); - const int firstCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), blockIdx.x*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x)); - const int lastCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), (blockIdx.x+1)*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x)); + const MortonIndex firstParent = FCudaMax(currentCells.getStartingIndex(), subCellGroup.getStartingIndex()>>3); + const MortonIndex lastParent = FCudaMin(currentCells.getEndingIndex()-1, (subCellGroup.getEndingIndex()-1)>>3); - if(firstCell == currentCells.getNumberOfCellsInBlock()){ - return ; - } + int idxParentCell = currentCells.getCellIndex(firstParent); + int idxChildCell = subCellGroup.getFistChildIdx(firstParent); - FCudaAssertLF(nbSubCellGroups != 0); - int idxSubCellGroup = 0; - int idxChildCell = 0; - {// Find first child - const MortonIndex mindex = currentCells.getCellMortonIndex(firstCell); - while(idxSubCellGroup != nbSubCellGroups - && (mindex < (subCellGroups[idxSubCellGroup].getStartingIndex()>>3))){ - idxSubCellGroup += 1; - } - FCudaAssertLF(idxSubCellGroup != nbSubCellGroups); - idxChildCell = subCellGroups[idxSubCellGroup].getFistChildIdx(currentCells.getCellMortonIndex(0)); - } - 
FCudaAssertLF(idxChildCell != -1); - - for(int cellIdx = firstCell ; cellIdx < lastCell ; ++cellIdx){ - typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(cellIdx); - FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx)); + while(true){ + typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(idxParentCell); typename CellContainerClass::CompleteCellClass child[8]; + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ child[idxChild].symb = nullptr; } - while(idxSubCellGroup != nbSubCellGroups - && (subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)>>3) == cell.symb->mortonIndex){ - const int idxChild = ((subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)) & 7); - FCudaAssertLF(child[idxChild].symb == nullptr); - child[idxChild] = subCellGroups[idxSubCellGroup].getDownCell(idxChildCell); + do{ + const int idxChild = ((subCellGroup.getCellMortonIndex(idxChildCell)) & 7); + child[idxChild] = subCellGroup.getDownCell(idxChildCell); idxChildCell += 1; - if(idxChildCell == subCellGroups[idxSubCellGroup].getNumberOfCellsInBlock()){ - idxChildCell = 0; - idxSubCellGroup += 1; - } - } + }while(idxChildCell != subCellGroup.getNumberOfCellsInBlock() && cell.symb->mortonIndex == (subCellGroup.getCellMortonIndex(idxChildCell)>>3)); kernel->L2L(cell, child, idxLevel); + + if(currentCells.getCellMortonIndex(idxParentCell) == lastParent){ + break; + } + + idxParentCell += 1; } } template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __host__ void FCuda__downardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int 
idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, + int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize){ FCuda__downardPassPerform <SymboleCellClass, PoleCellClass, LocalCellClass, CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass> <<<inGridSize, inBlocksSize, 0, currentStream>>> - (currentCellsPtr, currentCellsSize, currentCellsDownPtr, subCellGroupsPtr, subCellGroupsSize, subCellGroupsDownPtr, - nbSubCellGroups, idxLevel, kernel); + (currentCellsPtr, currentCellsSize, currentCellsDownPtr, childCellsPtr, childCellsSize, childCellsDownPtr, + idxLevel, kernel); FCudaCheckAfterCall(); FCudaCheck(cudaStreamSynchronize(currentStream)); } @@ -442,11 +394,9 @@ __global__ void FCuda__directInoutPassPerformMpi(unsigned char* containersPtr, s if(leafPos != -1){ ParticleGroupClass interParticles = containersOther.template getLeaf<ParticleGroupClass>(leafPos); ParticleGroupClass particles = containers.template getLeaf<ParticleGroupClass>(outsideInteractions[outInterIdx].insideIdxInBlock); - ParticleGroupClass* interactions[27]; - memset(interactions, 0, 27*sizeof(ParticleGroupClass*)); - interactions[outsideInteractions[outInterIdx].relativeOutPosition] = &interParticles; - const int counter = 1; - kernel->P2PRemote( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), &particles, &particles , interactions, counter); + + kernel->P2PRemote( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), + &particles, &particles , &interParticles, &outsideInteractions[outInterIdx].relativeOutPosition, 1); } } } @@ -502,9 +452,7 @@ __global__ void FCuda__directInPassPerform(unsigned char* containersPtr, std::si const int3 coord = FCudaTreeCoordinate::GetPositionFromMorton(mindex, 
treeHeight-1); int counter = FCudaTreeCoordinate::GetNeighborsIndexes(coord, treeHeight,interactionsIndexes,interactionsPosition); - ParticleGroupClass interactionsObjects[27]; - ParticleGroupClass* interactions[27]; - memset(interactions, 0, 27*sizeof(ParticleGroupClass*)); + ParticleGroupClass interactionsObjects[26]; int counterExistingCell = 0; for(int idxInter = 0 ; idxInter < counter ; ++idxInter){ @@ -512,14 +460,13 @@ __global__ void FCuda__directInPassPerform(unsigned char* containersPtr, std::si const int leafPos = containers.getLeafIndex(interactionsIndexes[idxInter]); if(leafPos != -1){ interactionsObjects[counterExistingCell] = containers.template getLeaf<ParticleGroupClass>(leafPos); - FCudaAssertLF(interactions[interactionsPosition[idxInter]] == nullptr); - interactions[interactionsPosition[idxInter]] = &interactionsObjects[counterExistingCell]; + interactionsPosition[counterExistingCell] = interactionsPosition[idxInter]; counterExistingCell += 1; } } } - kernel->P2P( coord, &particles, &particles , interactions, counterExistingCell); + kernel->P2P( coord, &particles, &particles , interactionsObjects, interactionsPosition, counterExistingCell); } } @@ -548,7 +495,7 @@ __global__ void FCuda__directInoutPassPerform(unsigned char* containersPtr, std: } ParticleContainerGroupClass containers(containersPtr, containersSize, containersDownPtr); - ParticleContainerGroupClass containersOther(externalContainersPtr, externalContainersSize, externalContainersPtr); + ParticleContainerGroupClass containersOther(externalContainersPtr, externalContainersSize, externalContainersDownPtr); for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ const int leafPos = containersOther.getLeafIndex(outsideInteractions[outInterIdx].outIndex); @@ -559,15 +506,13 @@ __global__ void FCuda__directInoutPassPerform(unsigned char* containersPtr, std: FCudaAssertLF(containersOther.getLeafMortonIndex(leafPos) == outsideInteractions[outInterIdx].outIndex); 
FCudaAssertLF(containers.getLeafMortonIndex(outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex); - ParticleGroupClass* interactions[27]; - memset(interactions, 0, 27*sizeof(ParticleGroupClass*)); - interactions[outsideInteractions[outInterIdx].relativeOutPosition] = &interParticles; - const int counter = 1; - kernel->P2PRemote( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), &particles, &particles , interactions, counter); - interactions[outsideInteractions[outInterIdx].relativeOutPosition] = nullptr; - interactions[FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition)] = &particles; - kernel->P2PRemote( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), &interParticles, &interParticles , interactions, counter); + kernel->P2POuter( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), + &particles , &interParticles, &outsideInteractions[outInterIdx].relativeOutPosition, 1); + + const int otherPosition = FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition); + kernel->P2POuter( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), + &interParticles , &particles, &otherPosition, 1); } } } @@ -682,9 +627,8 @@ template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroup template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, -FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, -FCudaParams<unsigned char*,9> subCellGroupsUpPtr, -int nbSubCellGroups, int idxLevel, 
FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, +unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, @@ -705,19 +649,18 @@ template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaG template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - 
FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, +unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, @@ -770,9 +713,8 @@ unsigned char* containersPtr, std::size_t containersSize, template void FCuda__upwardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, @@ -793,19 +735,18 @@ template void FCuda__transferInPassCallback<FTestCellPODCore, FTestCellPODData, template void FCuda__transferInoutPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<float,0, 1, long long int>, 
FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, @@ -856,9 +797,8 @@ unsigned 
char* containersPtr, std::size_t containersSize, template void FCuda__upwardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, @@ -879,19 +819,18 @@ template void FCuda__transferInPassCallback<FTestCellPODCore, FTestCellPODData, template void FCuda__transferInoutPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); 
+unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, + int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, @@ -936,22 +875,21 @@ template dim3 FCuda__GetBlockSize< FTestCudaKernels<double> >(FTestCudaKernels<d #include "../P2P/FCudaP2P.hpp" template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr, unsigned char* containersPtr, 
std::size_t containersSize, FCudaP2P<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr, int idxLevel, const OutOfBlockInteraction* outsideInteractions, @@ -959,32 +897,31 @@ template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, const dim3 inGridSize, const dim3 inBlocksSize); #endif template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, 
FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> 
subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, unsigned char* externalContainersPtr, std::size_t externalContainersSize, const OutOfBlockInteraction* outsideInteractions, @@ -992,13 +929,13 @@ template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FC const dim3 inGridSize, const dim3 inBlocksSize); #endif template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, const int treeHeight, FCudaP2P<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, 
FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr, const OutOfBlockInteraction* outsideInteractions, @@ -1006,7 +943,7 @@ template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCuda const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> > (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr, unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, FCudaP2P<float>* kernel, cudaStream_t currentStream, @@ -1022,22 +959,21 @@ template dim3 FCuda__GetBlockSize< FCudaP2P<float> >(FCudaP2P<float>* cukernel); template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr, unsigned char* containersPtr, std::size_t containersSize, FCudaP2P<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, 
double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr, int idxLevel, const OutOfBlockInteraction* outsideInteractions, @@ -1045,32 +981,31 @@ template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, const dim3 inGridSize, const dim3 inBlocksSize); #endif template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 
inBlocksSize); template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FCudaP2P<double>* kernel, 
cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, unsigned char* externalContainersPtr, std::size_t externalContainersSize, const OutOfBlockInteraction* outsideInteractions, @@ -1078,13 +1013,13 @@ template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FC const dim3 inGridSize, const dim3 inBlocksSize); #endif template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, const int treeHeight, FCudaP2P<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr, const OutOfBlockInteraction* outsideInteractions, @@ -1092,7 
+1027,7 @@ template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCuda const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> > (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr, unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, FCudaP2P<double>* kernel, cudaStream_t currentStream, diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp index 5559c908570cc75388ecfa1c2018ab4e98d53c59..4ba6615534e597cca892fbd02ce44a5c93a6e8cf 100644 --- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp +++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp @@ -17,9 +17,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__upwardPassCallback( unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, @@ -43,10 +42,10 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class 
CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__transferInoutPassCallback( unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, + unsigned char* currentCellsUpPtr, unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, + unsigned char* externalCellsDownPtr, + int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); @@ -54,9 +53,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__downardPassCallback( unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, + int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, diff --git a/Src/GroupTree/Cuda/FCudaEmptyKernel.hpp b/Src/GroupTree/Cuda/FCudaEmptyKernel.hpp index 6eb975c226ccad4e0556554b33a9128a04a295cf..1cdecb9f2aeb6af06d35c96b3aa69044f78b2091 100644 --- a/Src/GroupTree/Cuda/FCudaEmptyKernel.hpp +++ b/Src/GroupTree/Cuda/FCudaEmptyKernel.hpp @@ -22,7 +22,8 @@ public: __device__ void M2M(CellClass /*pole*/, 
const CellClass /*child*/[8], const int /*level*/) { } - __device__ void M2L(CellClass /*pole*/, const CellClass /*distantNeighbors*/[343], + __device__ void M2L(CellClass /*pole*/, const CellClass* /*distantNeighbors*/, + const int* /*neighPositions*/, const int /*size*/, const int /*level*/) { } @@ -34,12 +35,20 @@ public: __device__ void P2P(const int3& , ContainerClass* const /*targets*/, const ContainerClass* const /*sources*/, - ContainerClass* const /*directNeighborsParticles*/[27], const int ){ + ContainerClass* const /*directNeighborsParticles*/, + const int* /*neighborPositions*/, const int ){ + } + + __device__ void P2POuter(const int3& , + ContainerClass* const /*targets*/, + ContainerClass* const /*directNeighborsParticles*/, + const int* /*neighborPositions*/,const int ){ } __device__ void P2PRemote(const int3& , ContainerClass* const /*targets*/, const ContainerClass* const /*sources*/, - ContainerClass* const /*directNeighborsParticles*/[27], const int ){ + ContainerClass* const /*directNeighborsParticles*/, + const int* /*neighborPositions*/,const int ){ } __host__ static FCudaEmptyKernel* InitKernelKernel(void*){ diff --git a/Src/GroupTree/Cuda/FCudaGroupAttachedLeaf.hpp b/Src/GroupTree/Cuda/FCudaGroupAttachedLeaf.hpp index 275a8c1adfb2a5269da5ef74eb17b877728c505e..0ea8daecbb6c252ad7faa54837e5a20b114f9945 100644 --- a/Src/GroupTree/Cuda/FCudaGroupAttachedLeaf.hpp +++ b/Src/GroupTree/Cuda/FCudaGroupAttachedLeaf.hpp @@ -37,7 +37,7 @@ public: positionsPointers[2] = reinterpret_cast<FReal*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*2); for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes ; ++idxAttribute){ - attributes[idxAttribute] = reinterpret_cast<AttributeClass*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*(idxAttribute+3)); + attributes[idxAttribute] = reinterpret_cast<AttributeClass*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*3 + 
inLeadingAttributes*idxAttribute); } // Redirect pointers to data diff --git a/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp b/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp index d7d8aff5cea8627d3ad487b3e1286fc1af7bed2b..a119cc22ed8b35c53fc0fc835a621fb1ce15e95f 100644 --- a/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp +++ b/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp @@ -59,8 +59,6 @@ protected: BlockHeader* blockHeader; //< Pointer to leaves information LeafHeader* leafHeader; - //< The total number of particles in the group - const FSize nbParticlesInGroup; //< Pointers to particle position x, y, z FReal* particlePosition[3]; @@ -78,11 +76,12 @@ public: __device__ FCudaGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte, unsigned char* inAttributes) : allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer), - blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0), + blockHeader(nullptr), leafHeader(nullptr), attributesBuffer(nullptr){ // Move the pointers to the correct position - blockHeader = reinterpret_cast<BlockHeader*>(memoryBuffer); - leafHeader = reinterpret_cast<LeafHeader*>(memoryBuffer+sizeof(BlockHeader)+(blockHeader->numberOfLeavesInBlock*sizeof(int))); + blockHeader = reinterpret_cast<BlockHeader*>(inBuffer); + inBuffer += sizeof(BlockHeader); + leafHeader = reinterpret_cast<LeafHeader*>(inBuffer); // Init particle pointers // Assert blockHeader->positionsLeadingDim == (sizeof(FReal) * blockHeader->nbParticlesAllocatedInGroup); @@ -122,7 +121,7 @@ public: /** Get the total number of particles in the group */ __device__ FSize getNbParticlesInGroup() const { - return nbParticlesInGroup; + return blockHeader->nbParticlesInGroup; } /** The size of the interval endingIndex-startingIndex (set from the constructor) */ diff --git a/Src/GroupTree/OpenCl/FEmptyOpenCLCode.hpp b/Src/GroupTree/OpenCl/FEmptyOpenCLCode.hpp index 4dc38f2e34d2bf71ac8f93bc9bffaa2c61012737..bf4648b2ed88db61d3e30ef262c338f147f435f4 
100644 --- a/Src/GroupTree/OpenCl/FEmptyOpenCLCode.hpp +++ b/Src/GroupTree/OpenCl/FEmptyOpenCLCode.hpp @@ -21,57 +21,52 @@ public: MortonIndex insideIndex;\ int relativeOutPosition;\ int insideIdxInBlock;\ + int outsideIdxInBlock;\ } __attribute__ ((aligned (DefaultStructAlign)));\ - struct Uptr9{\ - __global unsigned char* ptrs[9];\ - } __attribute__ ((aligned (DefaultStructAlign)));\ - struct size_t9{\ - size_t v[9];\ - }__attribute__ ((aligned (DefaultStructAlign)));\ __kernel void FOpenCL__bottomPassPerform(__global unsigned char* leafCellsPtr, size_t leafCellsSize,__global unsigned char* leafCellsUpPtr,\ - __global unsigned char* containersPtr, size_t containersSize,\ - __global void* userkernel ){\ + __global unsigned char* containersPtr, size_t containersSize,\ + __global void* userkernel ){\ }\ __kernel void FOpenCL__upwardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsUpPtr,\ - struct Uptr9 subCellGroupsPtr, struct size_t9 subCellGroupsSize, struct Uptr9 subCellGroupsUpPtr,\ - int nbSubCellGroups, int idxLevel, __global void* userkernel){\ + __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsUpPtr,\ + int idxLevel, __global void* userkernel){\ }\ __kernel void FOpenCL__transferInoutPassPerformMpi(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr,\ - __global unsigned char* externalCellsPtr, size_t externalCellsSize, __global unsigned char* externalCellsUpPtr,\ - int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions,\ - size_t nbOutsideInteractions, __global void* userkernel){\ + __global unsigned char* externalCellsPtr, size_t externalCellsSize, __global unsigned char* externalCellsUpPtr,\ + int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions,\ + size_t nbOutsideInteractions, __global void* userkernel){\ }\ __kernel void 
FOpenCL__transferInPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize,\ - __global unsigned char* currentCellsUpPtr, __global unsigned char* currentCellsDownPtr,\ - int idxLevel, __global void* userkernel){\ + __global unsigned char* currentCellsUpPtr, __global unsigned char* currentCellsDownPtr,\ + int idxLevel, __global void* userkernel){\ }\ __kernel void FOpenCL__transferInoutPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize,\ - __global unsigned char* currentCellsUpPtr, __global unsigned char* currentCellsDownPtr,\ - __global unsigned char* externalCellsPtr, size_t externalCellsSize,\ - __global unsigned char* externalCellsUpPtr, __global unsigned char* externalCellsDownPtr,\ - int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions,\ - size_t nbOutsideInteractions, __global void* userkernel){\ + __global unsigned char* currentCellsUpPtr,\ + __global unsigned char* externalCellsPtr, size_t externalCellsSize,\ + __global unsigned char* externalCellsDownPtr,\ + int idxLevel, int mode, const __global struct OutOfBlockInteraction* outsideInteractions,\ + size_t nbOutsideInteractions, __global void* userkernel){\ }\ __kernel void FOpenCL__downardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr,\ - struct Uptr9 subCellGroupsPtr, struct size_t9 subCellGroupsSize, struct Uptr9 subCellGroupsDownPtr,\ - int nbSubCellGroups, int idxLevel, __global void* userkernel){\ + __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsDownPtr,\ + int idxLevel, __global void* userkernel){\ }\ __kernel void FOpenCL__directInoutPassPerformMpi(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\ - __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* outsideInteractionsCl,\ - const __global struct 
OutOfBlockInteraction* outsideInteractions,\ - size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){\ + __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* outsideInteractionsCl,\ + const __global struct OutOfBlockInteraction* outsideInteractions,\ + size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){\ }\ __kernel void FOpenCL__directInPassPerform(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\ - const int treeHeight, __global void* userkernel){\ + const int treeHeight, __global void* userkernel){\ }\ __kernel void FOpenCL__directInoutPassPerform(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\ - __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* externalContainersDownPtr,\ - const __global struct OutOfBlockInteraction* outsideInteractions,\ - size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){\ + __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* externalContainersDownPtr,\ + const __global struct OutOfBlockInteraction* outsideInteractions,\ + size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){\ }\ __kernel void FOpenCL__mergePassPerform(__global unsigned char* leafCellsPtr, size_t leafCellsSize, __global unsigned char* leafCellsDownPtr,\ - __global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\ - __global void* userkernel){\ + __global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\ + __global void* userkernel){\ }"; return kernelcode; } diff --git a/Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp b/Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp index 
4e54284e4df2e12bcaf44d23ade01f4e751dd671..eaa6220a5d72f27210a6ea821fd53c2805f3cacf 100644 --- a/Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp +++ b/Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp @@ -21,14 +21,6 @@ template <class OriginalKernelClass, class KernelFilenameClass = FEmptyOpenCLCode> class FOpenCLDeviceWrapper { protected: - struct alignas(FStarPUDefaultAlign::StructAlign) Uptr9{ - cl_mem ptrs[9]; - }; - - struct alignas(FStarPUDefaultAlign::StructAlign) size_t9{ - size_t v[9]; - }; - static void SetKernelArgs(cl_kernel& /*kernel*/, const int /*pos*/){ } template <class ParamClass, class... Args> @@ -168,17 +160,11 @@ public: void upwardPassPerform(cl_mem currentCellsPtr, size_t currentCellsSize, cl_mem currentCellsUpPtr, - cl_mem subCellGroupsPtr[9], size_t subCellGroupsSize[9], cl_mem subCellGroupsUpPtr[9], - int nbSubCellGroups, int idxLevel, const int intervalSize){ - Uptr9 ptrs; - memcpy(ptrs.ptrs, subCellGroupsPtr, sizeof(cl_mem)*9); - size_t9 sizes; - memcpy(sizes.v, subCellGroupsSize, sizeof(size_t)*9); - Uptr9 ptrsUp; - memcpy(ptrsUp.ptrs, subCellGroupsUpPtr, sizeof(cl_mem)*9); + cl_mem subCellGroupsPtr, size_t subCellGroupsSize, cl_mem subCellGroupsUpPtr, + int idxLevel, const int intervalSize){ SetKernelArgs(kernel_upwardPassPerform, 0, ¤tCellsPtr, ¤tCellsSize, ¤tCellsUpPtr, - &ptrs, &sizes, &ptrsUp, &nbSubCellGroups, &idxLevel, &user_data); + &subCellGroupsPtr, &subCellGroupsSize, &subCellGroupsUpPtr, &idxLevel, &user_data); const int err = clEnqueueNDRangeKernel(queue_upwardPassPerform, kernel_upwardPassPerform, kernelFilename.getNbDims(), NULL, kernelFilename.getNbGroups(intervalSize), kernelFilename.getGroupSize(), 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); @@ -205,29 +191,22 @@ public: if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } - void transferInoutPassPerform(cl_mem currentCellsPtr, size_t currentCellsSize, cl_mem currentCellsUpPtr, cl_mem currentCellsDownPtr, - cl_mem externalCellsPtr, size_t 
externalCellsSize, cl_mem externalCellsUpPtr, cl_mem externalCellsDownPtr, - int idxLevel, cl_mem outsideInteractionsCl, size_t outsideInteractionsSize, const int intervalSize){ - SetKernelArgs(kernel_transferInoutPassPerform, 0, ¤tCellsPtr,¤tCellsSize, ¤tCellsUpPtr, ¤tCellsDownPtr, - &externalCellsPtr, &externalCellsSize, &externalCellsUpPtr, &externalCellsDownPtr, - &idxLevel, &outsideInteractionsCl,&outsideInteractionsSize, &user_data); + void transferInoutPassPerform(cl_mem currentCellsPtr, size_t currentCellsSize, cl_mem currentCellsUpPtr, + cl_mem externalCellsPtr, size_t externalCellsSize, cl_mem externalCellsDownPtr, + int idxLevel, const int mode, cl_mem outsideInteractionsCl, size_t outsideInteractionsSize, const int intervalSize){ + SetKernelArgs(kernel_transferInoutPassPerform, 0, ¤tCellsPtr,¤tCellsSize, ¤tCellsUpPtr, + &externalCellsPtr, &externalCellsSize, &externalCellsDownPtr, + &idxLevel, &mode, &outsideInteractionsCl,&outsideInteractionsSize, &user_data); const int err = clEnqueueNDRangeKernel(queue_transferInoutPassPerform, kernel_transferInoutPassPerform, kernelFilename.getNbDims(), NULL, kernelFilename.getNbGroups(intervalSize), kernelFilename.getGroupSize(), 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } void downardPassPerform(cl_mem currentCellsPtr, size_t currentCellsSize, cl_mem currentCellsDownPtr, - cl_mem subCellGroupsPtr[9], size_t subCellGroupsSize[9], cl_mem subCellGroupsDownPtr[9], - int nbSubCellGroups, int idxLevel, const int intervalSize){ - Uptr9 ptrs; - memcpy(ptrs.ptrs, subCellGroupsPtr, sizeof(cl_mem)*9); - size_t9 sizes; - memcpy(sizes.v, subCellGroupsSize, sizeof(size_t)*9); - Uptr9 ptrsDown; - memcpy(ptrsDown.ptrs, subCellGroupsDownPtr, sizeof(cl_mem)*9); - + cl_mem subCellGroupsPtr, size_t subCellGroupsSize, cl_mem subCellGroupsDownPtr, + int idxLevel, const int intervalSize){ SetKernelArgs(kernel_downardPassPerform, 0, ¤tCellsPtr, ¤tCellsSize, ¤tCellsDownPtr, - &ptrs, &sizes, &ptrsDown, 
&nbSubCellGroups, &idxLevel, &user_data); + &subCellGroupsPtr, &subCellGroupsSize, &subCellGroupsDownPtr, &idxLevel, &user_data); const int err = clEnqueueNDRangeKernel(queue_downardPassPerform, kernel_downardPassPerform, kernelFilename.getNbDims(), NULL, kernelFilename.getNbGroups(intervalSize), kernelFilename.getGroupSize(), 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); diff --git a/Src/GroupTree/P2P/FCudaP2P.hpp b/Src/GroupTree/P2P/FCudaP2P.hpp index e9807bacfcd322a14d0d8f1311e946269b3e83f3..41f0d93b4b1cff4a33c49331bd49c34718601164 100644 --- a/Src/GroupTree/P2P/FCudaP2P.hpp +++ b/Src/GroupTree/P2P/FCudaP2P.hpp @@ -6,29 +6,6 @@ #include "../Cuda/FCudaEmptyCellSymb.hpp" #include "../Cuda/FCudaCompositeCell.hpp" -#define DirectMacro(targetX, targetY, targetZ, targetPhys, \ - forceX, forceY, forceZ, potential,\ - sourcesX, sourcesY, sourcesZ, sourcesPhys)\ -{\ - FReal dx = sourcesX - targetX;\ - FReal dy = sourcesY - targetY;\ - FReal dz = sourcesZ - targetZ;\ - \ - FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);\ - FReal inv_distance = sqrt(inv_square_distance);\ - \ - inv_square_distance *= inv_distance;\ - inv_square_distance *= targetPhys * sourcesPhys;\ - \ - dx *= inv_square_distance;\ - dy *= inv_square_distance;\ - dz *= inv_square_distance;\ - \ - forceX += dx;\ - forceY += dy;\ - forceZ += dz;\ - sourcesPhys += inv_distance * sourcesPhys;\ - } #define Min(x,y) ((x)<(y)?(x):(y)) #define Max(x,y) ((x)>(y)?(x):(y)) @@ -40,6 +17,30 @@ template <class FReal> class FCudaP2P { protected: public: + + __device__ void DirectComputation(const FReal& targetX, const FReal& targetY, const FReal& targetZ,const FReal& targetPhys, + FReal& forceX, FReal& forceY,FReal& forceZ, FReal& potential, + const FReal& sourcesX, const FReal& sourcesY, const FReal& sourcesZ, const FReal& sourcesPhys) const { + FReal dx = sourcesX - targetX; + FReal dy = sourcesY - targetY; + FReal dz = sourcesZ - targetZ; + + FReal inv_square_distance = 
FReal(1.0) / (dx*dx + dy*dy + dz*dz); + FReal inv_distance = sqrt(inv_square_distance); + + inv_square_distance *= inv_distance; + inv_square_distance *= targetPhys * sourcesPhys; + + dx *= inv_square_distance; + dy *= inv_square_distance; + dz *= inv_square_distance; + + forceX += dx; + forceY += dy; + forceZ += dz; + potential += inv_distance * sourcesPhys; + } + static double DSqrt(const double val){ return sqrt(val); } @@ -48,10 +49,10 @@ public: return sqrtf(val); } - typedef FCudaGroupAttachedLeaf<FReal,4,4,FReal> ContainerClass; + typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass; typedef FCudaCompositeCell<FCudaEmptyCellSymb,int,int> CellClass; - static const int SHARE_SIZE = 128; + static const int SHARE_SIZE = 1;//128; __device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) { } @@ -59,8 +60,9 @@ public: __device__ void M2M(CellClass /*pole*/, const CellClass /*child*/[8], const int /*level*/) { } - __device__ void M2L(CellClass /*pole*/, const CellClass /*distantNeighbors*/[343], - const int /*size*/, const int /*level*/) { + __device__ void M2L(CellClass /*pole*/, const CellClass* /*distantNeighbors*/, + const int* /*neighPositions*/, + const int /*size*/, const int /*level*/) { } __device__ void L2L(const CellClass /*local*/, CellClass /*child*/[8], const int /*level*/) { @@ -71,25 +73,22 @@ public: __device__ void P2P(const int3& pos, ContainerClass* const targets, const ContainerClass* const sources, - ContainerClass* const directNeighborsParticles[27], const int counter){ + ContainerClass* const directNeighborsParticles, + const int* neighborPositions, const int counter){ // Compute with other - P2PRemote(pos, targets, sources, directNeighborsParticles, counter); + P2PRemote(pos, targets, sources, directNeighborsParticles, neighborPositions, counter); // Compute inside - const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x; - for(int idxLoop = 0 ; idxLoop < nbLoops; ++idxLoop){ - const int 
idxPart = (idxLoop*blockDim.x+threadIdx.x); + for(int idxPart = threadIdx.x ; idxPart < targets->getNbParticles()+blockDim.x-1 ; idxPart += blockDim.x){ const bool threadCompute = (idxPart < targets->getNbParticles()); FReal targetX, targetY, targetZ, targetPhys; FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0; - if(threadCompute){ - targetX = targets->getPositions()[0][idxPart]; - targetY = targets->getPositions()[1][idxPart]; - targetZ = targets->getPositions()[2][idxPart]; - targetPhys = targets->getAttribute(0)[idxPart]; - } + targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0); + targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0); + targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0); + targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0); for(int idxCopy = 0 ; idxCopy < targets->getNbParticles() ; idxCopy += SHARE_SIZE){ __shared__ FReal sourcesX[SHARE_SIZE]; @@ -99,57 +98,61 @@ public: const int nbCopies = Min(SHARE_SIZE, targets->getNbParticles()-idxCopy); if(threadIdx.x < nbCopies){ - sourcesX[threadIdx.x] = targets->getPositions()[0][idxPart]; - sourcesY[threadIdx.x] = targets->getPositions()[1][idxPart]; - sourcesZ[threadIdx.x] = targets->getPositions()[2][idxPart]; - sourcesPhys[threadIdx.x] = targets->getAttribute(0)[idxPart]; + sourcesX[threadIdx.x] = targets->getPositions()[0][threadIdx.x+idxCopy]; + sourcesY[threadIdx.x] = targets->getPositions()[1][threadIdx.x+idxCopy]; + sourcesZ[threadIdx.x] = targets->getPositions()[2][threadIdx.x+idxCopy]; + sourcesPhys[threadIdx.x] = targets->getAttribute(0)[threadIdx.x+idxCopy]; } __syncthreads(); if(threadCompute){ - const int leftCopies = Min(idxPart, nbCopies); + int leftCopies = nbCopies; + if(idxCopy <= idxPart && idxPart < idxCopy + nbCopies){ + leftCopies = idxPart - idxCopy; + } + // Left Part for(int otherIndex = 0; otherIndex < leftCopies - 3; otherIndex += 4) { // unrolling x4 - DirectMacro(targetX, targetY, targetZ, targetPhys, - 
forceX, forceY, forceZ, potential, - sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); } - for(int otherIndex = (leftCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + for(int otherIndex = (leftCopies/4) * 4; otherIndex < leftCopies; ++otherIndex) { // if nk%4 is not zero + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + 
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); } // Right Part for(int otherIndex = leftCopies+1; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4 - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); } - for(int otherIndex = Max(leftCopies+1, (nbCopies/4) * 4); otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex], 
sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + for(int otherIndex = leftCopies+1 + ((nbCopies-(leftCopies+1))/4)*4 ; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); } } @@ -157,86 +160,155 @@ public: } if( threadCompute ){ - targets->getAttribute(1)[idxPart] += forceX; - targets->getAttribute(2)[idxPart] += forceY; - targets->getAttribute(3)[idxPart] += forceZ; - targets->getAttribute(4)[idxPart] += potential; + targets->getAttribute(1)[idxPart] += potential; + targets->getAttribute(2)[idxPart] += forceX; + targets->getAttribute(3)[idxPart] += forceY; + targets->getAttribute(4)[idxPart] += forceZ; } + __syncthreads(); } } __device__ void P2PRemote(const int3& , ContainerClass* const targets, const ContainerClass* const /*sources*/, - ContainerClass* const directNeighborsParticles[27], const int ){ - for(int idxNeigh = 0 ; idxNeigh < 27 ; ++idxNeigh){ - if(directNeighborsParticles[idxNeigh]){ - const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x; - - for(int idxLoop = 0 ; idxLoop < nbLoops; ++idxLoop){ - const int idxPart = (idxLoop*blockDim.x+threadIdx.x); - const bool threadCompute = (idxPart < targets->getNbParticles()); + ContainerClass* const directNeighborsParticles, + const int* /*neighborsPositions*/, const int counter){ + for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ + + for(int idxPart = threadIdx.x ; idxPart < targets->getNbParticles()+blockDim.x-1 ; idxPart += blockDim.x){ + const bool threadCompute = (idxPart < targets->getNbParticles()); + + FReal targetX, targetY, targetZ, targetPhys; + FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0; + + targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0); + targetY = (threadCompute? 
targets->getPositions()[1][idxPart] : 0); + targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0); + targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0); + + for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARE_SIZE){ + __shared__ FReal sourcesX[SHARE_SIZE]; + __shared__ FReal sourcesY[SHARE_SIZE]; + __shared__ FReal sourcesZ[SHARE_SIZE]; + __shared__ FReal sourcesPhys[SHARE_SIZE]; + + const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy); + if(threadIdx.x < nbCopies){ + sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][threadIdx.x+idxCopy]; + sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][threadIdx.x+idxCopy]; + sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[2][threadIdx.x+idxCopy]; + sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh].getAttribute(0)[threadIdx.x+idxCopy]; + } - FReal targetX, targetY, targetZ, targetPhys; - FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0; + __syncthreads(); if(threadCompute){ - targetX = targets->getPositions()[0][idxPart]; - targetY = targets->getPositions()[1][idxPart]; - targetZ = targets->getPositions()[2][idxPart]; - targetPhys = targets->getAttribute(0)[idxPart]; - } - - for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh]->getNbParticles() ; idxCopy += SHARE_SIZE){ - __shared__ FReal sourcesX[SHARE_SIZE]; - __shared__ FReal sourcesY[SHARE_SIZE]; - __shared__ FReal sourcesZ[SHARE_SIZE]; - __shared__ FReal sourcesPhys[SHARE_SIZE]; - - const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh]->getNbParticles()-idxCopy); - if(threadIdx.x < nbCopies){ - sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh]->getPositions()[0][idxPart]; - sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh]->getPositions()[1][idxPart]; - sourcesZ[threadIdx.x] = 
directNeighborsParticles[idxNeigh]->getPositions()[2][idxPart]; - sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh]->getAttribute(0)[idxPart]; + for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4 + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); } - __syncthreads(); - - if(threadCompute){ - for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4 - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); - } - - for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; 
++otherIndex) { // if nk%4 is not zero - DirectMacro(targetX, targetY, targetZ, targetPhys, - forceX, forceY, forceZ, potential, - sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); - } + for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); } + } + + __syncthreads(); + } - __syncthreads(); + if( threadCompute ){ + targets->getAttribute(1)[idxPart] += potential; + targets->getAttribute(2)[idxPart] += forceX; + targets->getAttribute(3)[idxPart] += forceY; + targets->getAttribute(4)[idxPart] += forceZ; + } + + + __syncthreads(); + } + } + } + + __device__ void P2POuter(const int3& , + ContainerClass* const targets, + ContainerClass* const directNeighborsParticles, + const int* /*neighborsPositions*/, const int counter){ + for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ + + for(int idxPart = threadIdx.x ; idxPart < targets->getNbParticles()+blockDim.x-1 ; idxPart += blockDim.x){ + const bool threadCompute = (idxPart < targets->getNbParticles()); + + FReal targetX, targetY, targetZ, targetPhys; + FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0; + + targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0); + targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0); + targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0); + targetPhys = (threadCompute? 
targets->getAttribute(0)[idxPart] : 0); + + for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARE_SIZE){ + __shared__ FReal sourcesX[SHARE_SIZE]; + __shared__ FReal sourcesY[SHARE_SIZE]; + __shared__ FReal sourcesZ[SHARE_SIZE]; + __shared__ FReal sourcesPhys[SHARE_SIZE]; + + const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy); + if(threadIdx.x < nbCopies){ + sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][threadIdx.x+idxCopy]; + sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][threadIdx.x+idxCopy]; + sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[2][threadIdx.x+idxCopy]; + sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh].getAttribute(0)[threadIdx.x+idxCopy]; } - if( threadCompute ){ - targets->getAttribute(1)[idxPart] += forceX; - targets->getAttribute(2)[idxPart] += forceY; - targets->getAttribute(3)[idxPart] += forceZ; - targets->getAttribute(4)[idxPart] += potential; + __syncthreads(); + + if(threadCompute){ + for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4 + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); + } + + 
for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + } } + __syncthreads(); } + + if( threadCompute ){ + targets->getAttribute(1)[idxPart] += potential; + targets->getAttribute(2)[idxPart] += forceX; + targets->getAttribute(3)[idxPart] += forceY; + targets->getAttribute(4)[idxPart] += forceZ; + } + + __syncthreads(); } } } @@ -250,7 +322,7 @@ public: } __host__ static dim3 GetGridSize(const int intervalSize){ - return intervalSize; + return 1; //intervalSize; } __host__ static dim3 GetBlocksSize(){ diff --git a/Src/GroupTree/StarPUUtils/FStarPUCpuWrapper.hpp b/Src/GroupTree/StarPUUtils/FStarPUCpuWrapper.hpp index 8c54c132ce7b33f26f09b4015c049e0878a0c45e..a538cb272487c557fbcd25df5217eb5c5292d323 100644 --- a/Src/GroupTree/StarPUUtils/FStarPUCpuWrapper.hpp +++ b/Src/GroupTree/StarPUUtils/FStarPUCpuWrapper.hpp @@ -26,7 +26,7 @@ #include <starpu.h> //} -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) //extern "C"{ #include <starpu_mpi.h> //} @@ -184,7 +184,7 @@ public: ///////////////////////////////////////////////////////////////////////////////////// /// Transfer Pass Mpi ///////////////////////////////////////////////////////////////////////////////////// -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){ CellContainerClass currentCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]), STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]), @@ -405,7 +405,7 @@ public: /// Direct Pass MPI ///////////////////////////////////////////////////////////////////////////////////// -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) static void directInoutPassCallbackMpi(void *buffers[], void 
*cl_arg){ ParticleGroupClass containers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]), STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]), diff --git a/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp b/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp index 26c4a050f19eb411426305edd155b12a7f04aa26..5e438872e83e205ad5fdc3638c822eae068d6c2f 100644 --- a/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp +++ b/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp @@ -23,7 +23,7 @@ #include <starpu.h> -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) #include <starpu_mpi.h> #endif @@ -96,21 +96,9 @@ public: FStarPUPtrInterface* worker = nullptr; int nbSubCellGroups = 0; int idxLevel = 0; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize); - FCudaParams<unsigned char*,9> subCellGroupsPtr; - memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr)); - FCudaParams<std::size_t,9> subCellGroupsSize; - memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize)); - FCudaParams<unsigned char*,9> subCellGroupsUpPtr; - memset(&subCellGroupsUpPtr, 0, sizeof(subCellGroupsUpPtr)); - for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){ - subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2])); - subCellGroupsSize.values[idxSubGroup] = STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]); - subCellGroupsUpPtr.values[idxSubGroup] = (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3]); - } - CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; FCuda__upwardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass, @@ -118,20 +106,22 @@ public: (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]), STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]), - subCellGroupsPtr,subCellGroupsSize,subCellGroupsUpPtr, - 
nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(), + (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]), + STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]), + (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]), + idxLevel, kernel, starpu_cuda_get_local_stream(), FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel)); } ///////////////////////////////////////////////////////////////////////////////////// /// Transfer Pass Mpi ///////////////////////////////////////////////////////////////////////////////////// -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; int idxLevel = 0; - const std::vector<OutOfBlockInteraction>* outsideInteractions; - int intervalSize; + const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -156,7 +146,7 @@ public: static void transferInPassCallback(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; int idxLevel = 0; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -174,9 +164,10 @@ public: static void transferInoutPassCallback(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; int idxLevel = 0; - const std::vector<OutOfBlockInteraction>* outsideInteractions; - int intervalSize; - starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize); + const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr; + int intervalSize = 0; + int mode = 0; + starpu_codelet_unpack_args(cl_arg, &worker, 
&idxLevel, &outsideInteractions, &intervalSize, &mode); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -186,11 +177,9 @@ public: STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]), + STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]), - STARPU_VARIABLE_GET_ELEMSIZE(buffers[3]), - (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[4]), - (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[5]), - idxLevel, outsideInteractions->data(), int(outsideInteractions->size()), kernel, + idxLevel, mode, outsideInteractions->data(), int(outsideInteractions->size()), kernel, starpu_cuda_get_local_stream(), FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel)); } @@ -202,21 +191,9 @@ public: FStarPUPtrInterface* worker = nullptr; int nbSubCellGroups = 0; int idxLevel = 0; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize); - FCudaParams<unsigned char*,9> subCellGroupsPtr; - memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr)); - FCudaParams<std::size_t,9> subCellGroupsSize; - memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize)); - FCudaParams<unsigned char*,9> subCellGroupsDownPtr; - memset(&subCellGroupsDownPtr, 0, sizeof(subCellGroupsDownPtr)); - for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){ - subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2])); - subCellGroupsSize.values[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2])); - subCellGroupsDownPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3])); - } - CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; FCuda__downardPassCallback< SymboleCellClass, 
PoleCellClass, LocalCellClass, @@ -224,20 +201,22 @@ public: (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]), STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]), - subCellGroupsPtr,subCellGroupsSize,subCellGroupsDownPtr, - nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(), + (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]), + STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]), + (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]), + idxLevel, kernel, starpu_cuda_get_local_stream(), FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel)); } ///////////////////////////////////////////////////////////////////////////////////// /// Direct Pass MPI ///////////////////////////////////////////////////////////////////////////////////// -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) static void directInoutPassCallbackMpi(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -260,7 +239,7 @@ public: static void directInPassCallback(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -276,7 +255,7 @@ public: static void directInoutPassCallback(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize); CudaKernelClass* kernel = 
worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; diff --git a/Src/GroupTree/StarPUUtils/FStarPUOpenClWrapper.hpp b/Src/GroupTree/StarPUUtils/FStarPUOpenClWrapper.hpp index c7d181d61f8cf6d7fec262eff775efc576323539..2620e4d072f8239e40624df5ab5892b88228d553 100644 --- a/Src/GroupTree/StarPUUtils/FStarPUOpenClWrapper.hpp +++ b/Src/GroupTree/StarPUUtils/FStarPUOpenClWrapper.hpp @@ -25,7 +25,7 @@ #include <starpu.h> -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) #include <starpu_mpi.h> #endif @@ -100,22 +100,13 @@ public: int intervalSize; starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize); - cl_mem subCellGroupsPtr[9]; - memset(subCellGroupsPtr, 0, 9*sizeof(cl_mem)); - cl_mem subCellGroupsUpPtr[9]; - memset(subCellGroupsUpPtr, 0, 9*sizeof(cl_mem)); - size_t subCellGroupsSize[9]; - memset(subCellGroupsSize, 0, 9*sizeof(size_t)); - for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){ - subCellGroupsPtr[idxSubGroup] = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[(idxSubGroup*2)+2])); - subCellGroupsSize[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2])); - subCellGroupsUpPtr[idxSubGroup] = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[(idxSubGroup*2)+3])); - } + cl_mem otherCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[2])); + size_t otherCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]); + cl_mem otherCellsUpPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[3])); OpenCLKernelClass* kernel = worker->get<ThisClass>(FSTARPU_OPENCL_IDX)->kernels[starpu_worker_get_id()]; kernel->upwardPassPerform(currentCellsPtr, currentCellsSize, currentCellsUpPtr, - subCellGroupsPtr, subCellGroupsSize, subCellGroupsUpPtr, - nbSubCellGroups, idxLevel, + otherCellsPtr, otherCellsSize, otherCellsUpPtr, idxLevel, intervalSize); } @@ -123,7 +114,7 @@ public: 
///////////////////////////////////////////////////////////////////////////////////// /// Transfer Pass Mpi ///////////////////////////////////////////////////////////////////////////////////// -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){ cl_mem currentCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[0])); size_t currentCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]); @@ -180,19 +171,18 @@ public: static void transferInoutPassCallback(void *buffers[], void *cl_arg){ cl_mem currentCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[0])); size_t currentCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]); - cl_mem currentCellsUpPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[1])); - cl_mem currentCellsDownPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[2])); + cl_mem currentCellsDownPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[1])); - cl_mem externalCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[3])); - size_t externalCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[3]); - cl_mem externalCellsUpPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[4])); - cl_mem externalCellsDownPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[5])); + cl_mem externalCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[2])); + size_t externalCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]); + cl_mem externalCellsUpPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[3])); FStarPUPtrInterface* worker = nullptr; int idxLevel = 0; const std::vector<OutOfBlockInteraction>* outsideInteractions; int intervalSize; - starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize); + int mode = 0; + starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize, &mode); OpenCLKernelClass* kernel = 
worker->get<ThisClass>(FSTARPU_OPENCL_IDX)->kernels[starpu_worker_get_id()]; cl_int errcode_ret; @@ -202,9 +192,9 @@ public: const_cast<OutOfBlockInteraction*>(outsideInteractions->data()), &errcode_ret); FAssertLF(outsideInteractionsCl && errcode_ret == CL_SUCCESS); - kernel->transferInoutPassPerform(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr, - externalCellsPtr, externalCellsSize, externalCellsUpPtr, externalCellsDownPtr, - idxLevel, outsideInteractionsCl, outsideInteractions->size(), + kernel->transferInoutPassPerform(currentCellsPtr, currentCellsSize, currentCellsDownPtr, + externalCellsPtr, externalCellsSize, externalCellsUpPtr, + idxLevel, mode, outsideInteractionsCl, outsideInteractions->size(), intervalSize); clReleaseMemObject(outsideInteractionsCl); @@ -225,22 +215,13 @@ public: int intervalSize; starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize); - cl_mem subCellGroupsPtr[9]; - memset(subCellGroupsPtr, 0, 9*sizeof(cl_mem)); - cl_mem subCellGroupsDownPtr[9]; - memset(subCellGroupsDownPtr, 0, 9*sizeof(cl_mem)); - size_t subCellGroupsSize[9]; - memset(subCellGroupsSize, 0, 9*sizeof(size_t)); - for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){ - subCellGroupsPtr[idxSubGroup] = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[(idxSubGroup*2)+2])); - subCellGroupsSize[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2])); - subCellGroupsDownPtr[idxSubGroup] = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[(idxSubGroup*2)+3])); - } + cl_mem otherCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[2])); + size_t otherCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]); + cl_mem otherCellsDownPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[3])); OpenCLKernelClass* kernel = worker->get<ThisClass>(FSTARPU_OPENCL_IDX)->kernels[starpu_worker_get_id()]; kernel->downardPassPerform(currentCellsPtr, currentCellsSize, currentCellsDownPtr, - 
subCellGroupsPtr, subCellGroupsSize, subCellGroupsDownPtr, - nbSubCellGroups, idxLevel, + otherCellsPtr, otherCellsSize, otherCellsDownPtr, idxLevel, intervalSize); } @@ -248,7 +229,7 @@ public: /// Direct Pass MPI ///////////////////////////////////////////////////////////////////////////////////// -#ifdef STARPU_USE_MPI +#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI) static void directInoutPassCallbackMpi(void *buffers[], void *cl_arg){ cl_mem containersPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[0])); size_t containersSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]); diff --git a/Src/GroupTree/TestKernel/FCudaTestKernels.hpp b/Src/GroupTree/TestKernel/FCudaTestKernels.hpp index 23325ea829ea174b4d1fa7946d24895c2e1b4259..a4d77289aca71346d3941447cc836f0d86cef87d 100644 --- a/Src/GroupTree/TestKernel/FCudaTestKernels.hpp +++ b/Src/GroupTree/TestKernel/FCudaTestKernels.hpp @@ -34,13 +34,12 @@ public: } /** Before Downward */ - __device__ void M2L(CellClass local, const CellClass distantNeighbors[343], const int /*size*/, const int /*level*/) { + __device__ void M2L(CellClass local, const CellClass* distantNeighbors, + const int* /*neighPositions*/, const int size, const int /*level*/) { if(threadIdx.x == 0) { // The pole is impacted by what represent other poles - for(int idx = 0 ; idx < 343 ; ++idx){ - if(distantNeighbors[idx].symb){ - *local.down += *distantNeighbors[idx].up; - } + for(int idx = 0 ; idx < size ; ++idx){ + *local.down += *distantNeighbors[idx].up; } } } @@ -71,18 +70,18 @@ public: /** After Downward */ __device__ void P2P(const int3& , - ContainerClass* const targets, const ContainerClass* const sources, - ContainerClass* const directNeighborsParticles[27], const int ){ + ContainerClass* const targets, const ContainerClass* const sources, + ContainerClass* const directNeighborsParticles, + const int* /*neighborPositions*/, + const int counter){ if(threadIdx.x == 0) { // Each particles targeted is impacted by the particles sources 
long long int inc = sources->getNbParticles(); if(targets == sources){ inc -= 1; } - for(int idx = 0 ; idx < 27 ; ++idx){ - if( directNeighborsParticles[idx] ){ - inc += directNeighborsParticles[idx]->getNbParticles(); - } + for(int idx = 0 ; idx < counter ; ++idx){ + inc += directNeighborsParticles[idx].getNbParticles(); } long long int*const particlesAttributes = targets->template getAttribute<0>(); @@ -94,15 +93,35 @@ public: /** After Downward */ __device__ void P2PRemote(const int3& , - ContainerClass* const targets, const ContainerClass* const sources, - ContainerClass* const directNeighborsParticles[27], const int ){ + ContainerClass* const targets, + const ContainerClass* const sources, + ContainerClass* const directNeighborsParticles, + const int* /*neighborPositions*/, + const int counter){ if(threadIdx.x == 0) { // Each particles targeted is impacted by the particles sources long long int inc = 0; - for(int idx = 0 ; idx < 27 ; ++idx){ - if( directNeighborsParticles[idx] ){ - inc += directNeighborsParticles[idx]->getNbParticles(); - } + for(int idx = 0 ; idx < counter ; ++idx){ + inc += directNeighborsParticles[idx].getNbParticles(); + } + + long long int*const particlesAttributes = targets->template getAttribute<0>(); + for(FSize idxPart = 0 ; idxPart < targets->getNbParticles() ; ++idxPart){ + particlesAttributes[idxPart] += inc; + } + } + } + + __device__ void P2POuter(const int3& , + ContainerClass* const targets, + ContainerClass* const directNeighborsParticles, + const int* /*neighborPositions*/, + const int counter){ + if(threadIdx.x == 0) { + // Each particles targeted is impacted by the particles sources + long long int inc = 0; + for(int idx = 0 ; idx < counter ; ++idx){ + inc += directNeighborsParticles[idx].getNbParticles(); } long long int*const particlesAttributes = targets->template getAttribute<0>(); diff --git a/Src/GroupTree/TestKernel/FTestKernel.cl b/Src/GroupTree/TestKernel/FTestKernel.cl index 
c09971e1146526ecf2c3b6e5694efa310f580e69..287b5d974b8776844c7bc43379f6697a43a46e84 100644 --- a/Src/GroupTree/TestKernel/FTestKernel.cl +++ b/Src/GroupTree/TestKernel/FTestKernel.cl @@ -51,6 +51,7 @@ struct OutOfBlockInteraction{ MortonIndex insideIndex; int relativeOutPosition; int insideIdxInBlock; + int outsideIdxInBlock; } __attribute__ ((aligned (DefaultStructAlign))); #define Between(inValue, inMin, inMax) ( (inMin) <= (inValue) && (inValue) < (inMax) ) @@ -560,12 +561,10 @@ void M2M(struct FWrappeCell pole, struct FWrappeCell child[8], const int level, } } -void M2L(struct FWrappeCell const pole, const struct FWrappeCell distantNeighbors[343], -const int size, const int level, __global void* user_data) { - for(int idxNeigh = 0 ; idxNeigh < 343 ; ++idxNeigh){ - if(distantNeighbors[idxNeigh].symb){ - *pole.down += *distantNeighbors[idxNeigh].up; - } +void M2L(struct FWrappeCell const pole, const struct FWrappeCell* distantNeighbors, + const int* relativePositions, const int size, const int level, __global void* user_data) { + for(int idxNeigh = 0 ; idxNeigh < size ; ++idxNeigh){ + *pole.down += *distantNeighbors[idxNeigh].up; } } @@ -610,6 +609,15 @@ void P2PRemote(const int3 pos, } } +void P2POuter(const int3 pos, + struct FOpenCLGroupAttachedLeaf targets, const struct FOpenCLGroupAttachedLeaf sources, + struct FOpenCLGroupAttachedLeaf directNeighborsParticles, const int position, __global void* user_data){ + __global long long* partdown = targets.attributes[0]; + for(FSize idxPart = 0 ; idxPart < targets.nbParticles ; ++idxPart){ + partdown[idxPart] += directNeighborsParticles.nbParticles; + } +} + int3 getCoordinate(const struct FWrappeCell cell) { int3 coord; coord.x = cell.symb->coordinates[0]; @@ -659,49 +667,43 @@ __kernel void FOpenCL__bottomPassPerform(__global unsigned char* leafCellsPtr, s ///////////////////////////////////////////////////////////////////////////////////// __kernel void FOpenCL__upwardPassPerform(__global unsigned char* 
currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsUpPtr, - struct Uptr9 subCellGroupsPtr, struct size_t9 subCellGroupsSize, struct Uptr9 subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, __global void* userkernel){ + __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsUpPtr, + int idxLevel, __global void* userkernel){ struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, NULLPTR); - struct FOpenCLGroupOfCells subCellGroups[9]; - for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){ - subCellGroups[idx] = BuildFOpenCLGroupOfCells(subCellGroupsPtr.ptrs[idx], subCellGroupsSize.v[idx], subCellGroupsUpPtr.ptrs[idx], NULLPTR); - } - - FOpenCLAssertLF(nbSubCellGroups != 0); const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(¤tCells); - int idxSubCellGroup = 0; - int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&subCellGroups[0], FOpenCLGroupOfCells_getCellIndex(¤tCells, 0)); + struct FOpenCLGroupOfCells childCells = BuildFOpenCLGroupOfCells(childCellsPtr, childCellsSize, childCellsUpPtr, NULLPTR); + const int childNbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&childCells); - for(int idxCell = 0 ; idxCell < nbCells ; ++idxCell){ - struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(¤tCells, idxCell); - FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(¤tCells, idxCell)); - struct FWrappeCell child[8]; + const MortonIndex firstParent = FOpenCLMax(FOpenCLGroupOfCells_getStartingIndex(¤tCells), FOpenCLGroupOfCells_getStartingIndex(&childCells)>>3); + const MortonIndex lastParent = FOpenCLMin(FOpenCLGroupOfCells_getEndingIndex(¤tCells)-1, (FOpenCLGroupOfCells_getEndingIndex(&childCells)-1)>>3); - FOpenCLAssertLF(idxSubCellGroup != nbSubCellGroups); + int idxParentCell = FOpenCLGroupOfCells_getCellIndex(¤tCells,firstParent); + int idxChildCell = 
FOpenCLGroupOfCells_getFistChildIdx(&childCells,firstParent); + while(true){ + struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(¤tCells, idxParentCell); + struct FWrappeCell child[8]; for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ child[idxChild].symb = NULLPTR; } - while(idxSubCellGroup != nbSubCellGroups - && (FOpenCLGroupOfCells_getCellMortonIndex(&subCellGroups[idxSubCellGroup], idxChildCell)>>3) == cell.symb->mortonIndex){ - const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&subCellGroups[idxSubCellGroup], idxChildCell)) & 7); - - child[idxChild] = FOpenCLGroupOfCells_getUpCell(&subCellGroups[idxSubCellGroup], idxChildCell); + do{ + const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&childCells,idxChildCell)) & 7); + child[idxChild] = FOpenCLGroupOfCells_getUpCell(&childCells, idxChildCell); idxChildCell += 1; + }while(idxChildCell != childNbCells && cell.symb->mortonIndex == (FOpenCLGroupOfCells_getCellMortonIndex(&childCells, idxChildCell)>>3)); - if(idxChildCell == FOpenCLGroupOfCells_getNumberOfCellsInBlock(&subCellGroups[idxSubCellGroup])){ - idxChildCell = 0; - idxSubCellGroup += 1; - } + M2M(cell, child, idxLevel, userkernel); + + if(FOpenCLGroupOfCells_getCellMortonIndex(¤tCells, idxParentCell) == lastParent){ + break; } - M2M(cell, child, idxLevel, userkernel); + idxParentCell += 1; } } - ///////////////////////////////////////////////////////////////////////////////////// /// Transfer Pass Mpi ///////////////////////////////////////////////////////////////////////////////////// @@ -723,11 +725,9 @@ __kernel void FOpenCL__transferInoutPassPerformMpi(__global unsigned char* curr struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(¤tCells, outsideInteractions[outInterIdx].insideIdxInBlock); FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); - struct FWrappeCell interactions[343]; - FSetToNullptr343(interactions); - 
interactions[outsideInteractions[outInterIdx].relativeOutPosition] = interCell; - const int counter = 1; - M2L( cell , interactions, counter, idxLevel, userkernel); + const int relativeOutPosition = outsideInteractions[outInterIdx].relativeOutPosition; + M2L( cell , &interCell, &relativeOutPosition, + 1, idxLevel, userkernel); } } } @@ -766,47 +766,50 @@ __kernel void FOpenCL__transferInPassPerform(__global unsigned char* currentCel const int cellPos = FOpenCLGroupOfCells_getCellIndex(¤tCells, interactionsIndexes[idxInter]); if(cellPos != -1){ struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(¤tCells, cellPos); - FOpenCLAssertLF(interCell.symb->mortonIndex == interactionsIndexes[idxInter]); - FOpenCLAssertLF(interactions[interactionsPosition[idxInter]].symb == NULLPTR); - interactions[interactionsPosition[idxInter]] = interCell; + interactions[counterExistingCell] = interCell; + interactionsPosition[counterExistingCell] = interactionsPosition[idxInter]; counterExistingCell += 1; } } } - M2L( cell , interactions, counterExistingCell, idxLevel, userkernel); + M2L( cell , interactions, interactionsPosition, + counterExistingCell, idxLevel, userkernel); } } __kernel void FOpenCL__transferInoutPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, - __global unsigned char* currentCellsUpPtr, __global unsigned char* currentCellsDownPtr, + __global unsigned char* currentCellsUpPtr, __global unsigned char* externalCellsPtr, size_t externalCellsSize, - __global unsigned char* externalCellsUpPtr, __global unsigned char* externalCellsDownPtr, - int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions, + __global unsigned char* externalCellsDownPtr, + int idxLevel, int mode, const __global struct OutOfBlockInteraction* outsideInteractions, size_t nbOutsideInteractions, __global void* userkernel){ - struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, 
currentCellsDownPtr); - struct FOpenCLGroupOfCells cellsOther = BuildFOpenCLGroupOfCells(externalCellsPtr, externalCellsSize, externalCellsUpPtr, externalCellsDownPtr); + struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, NULLPTR); + struct FOpenCLGroupOfCells cellsOther = BuildFOpenCLGroupOfCells(externalCellsPtr, externalCellsSize, NULLPTR, externalCellsDownPtr); - for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ - const int cellPos = FOpenCLGroupOfCells_getCellIndex(&cellsOther, outsideInteractions[outInterIdx].outIndex); - if(cellPos != -1){ - FOpenCLAssertLF(outsideInteractions[outInterIdx].outIndex == FOpenCLGroupOfCells_getCellMortonIndex(&cellsOther, outsideInteractions[outInterIdx].outIndex)); - struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&cellsOther, cellPos); + if(mode == 1){ + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&cellsOther, outsideInteractions[outInterIdx].outsideIdxInBlock); FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex); struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(¤tCells, outsideInteractions[outInterIdx].insideIdxInBlock); FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); - struct FWrappeCell interactions[343]; - FSetToNullptr343(interactions); - interactions[outsideInteractions[outInterIdx].relativeOutPosition] = interCell; - const int counter = 1; - M2L( cell , interactions, counter, idxLevel, userkernel); + const int relativeOutPosition = outsideInteractions[outInterIdx].relativeOutPosition; + M2L( cell , &interCell, &relativeOutPosition, + 1, idxLevel, userkernel); + } + } + else{ + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + struct FWrappeCell interCell = 
FOpenCLGroupOfCells_getDownCell(&cellsOther, outsideInteractions[outInterIdx].outsideIdxInBlock); + FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex); + struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(¤tCells, outsideInteractions[outInterIdx].insideIdxInBlock); + FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); - interactions[outsideInteractions[outInterIdx].relativeOutPosition].symb = NULLPTR; - interactions[FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition)] = cell; - M2L( interCell , interactions, counter, idxLevel, userkernel); + const int relativepos = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition); + M2L( interCell , &cell, &relativepos, 1, idxLevel, userkernel); } } } @@ -819,46 +822,39 @@ __kernel void FOpenCL__transferInoutPassPerform(__global unsigned char* currentC __kernel void FOpenCL__downardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr, - struct Uptr9 subCellGroupsPtr, struct size_t9 subCellGroupsSize, struct Uptr9 subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, __global void* userkernel){ - FOpenCLAssertLF(nbSubCellGroups != 0); + __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsDownPtr, + int idxLevel, __global void* userkernel){ struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, NULLPTR, currentCellsDownPtr); - struct FOpenCLGroupOfCells subCellGroups[9]; - for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){ - subCellGroups[idx] = BuildFOpenCLGroupOfCells(subCellGroupsPtr.ptrs[idx], subCellGroupsSize.v[idx], NULLPTR, subCellGroupsDownPtr.ptrs[idx]); - } - - - FOpenCLAssertLF(nbSubCellGroups != 0); const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(¤tCells); - int idxSubCellGroup = 0; - int idxChildCell = 
FOpenCLGroupOfCells_getFistChildIdx(&subCellGroups[0], FOpenCLGroupOfCells_getCellIndex(¤tCells, 0)); + struct FOpenCLGroupOfCells childCells = BuildFOpenCLGroupOfCells(childCellsPtr, childCellsSize, NULLPTR, childCellsDownPtr); + const int childNbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&childCells); - for(int idxCell = 0 ; idxCell < nbCells ; ++idxCell){ - struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(¤tCells, idxCell); - FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(¤tCells, idxCell)); - struct FWrappeCell child[8]; + const MortonIndex firstParent = FOpenCLMax(FOpenCLGroupOfCells_getStartingIndex(¤tCells), FOpenCLGroupOfCells_getStartingIndex(&childCells)>>3); + const MortonIndex lastParent = FOpenCLMin(FOpenCLGroupOfCells_getEndingIndex(¤tCells)-1, (FOpenCLGroupOfCells_getEndingIndex(&childCells)-1)>>3); - FOpenCLAssertLF(idxSubCellGroup != nbSubCellGroups); + int idxParentCell = FOpenCLGroupOfCells_getCellIndex(¤tCells,firstParent); + int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&childCells,firstParent); + while(true){ + struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(¤tCells, idxParentCell); + struct FWrappeCell child[8]; for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ child[idxChild].symb = NULLPTR; } - while(idxSubCellGroup != nbSubCellGroups - && (FOpenCLGroupOfCells_getCellMortonIndex(&subCellGroups[idxSubCellGroup], idxChildCell)>>3) == cell.symb->mortonIndex){ - const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&subCellGroups[idxSubCellGroup], idxChildCell)) & 7); - - child[idxChild] = FOpenCLGroupOfCells_getDownCell(&subCellGroups[idxSubCellGroup], idxChildCell); + do{ + const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&childCells,idxChildCell)) & 7); + child[idxChild] = FOpenCLGroupOfCells_getDownCell(&childCells, idxChildCell); idxChildCell += 1; + }while(idxChildCell != childNbCells && cell.symb->mortonIndex == 
(FOpenCLGroupOfCells_getCellMortonIndex(&childCells, idxChildCell)>>3)); - if(idxChildCell == FOpenCLGroupOfCells_getNumberOfCellsInBlock(&subCellGroups[idxSubCellGroup])){ - idxChildCell = 0; - idxSubCellGroup += 1; - } + L2L(cell, child, idxLevel, userkernel); + + if(FOpenCLGroupOfCells_getCellMortonIndex(¤tCells, idxParentCell) == lastParent){ + break; } - L2L(cell, child, idxLevel, userkernel); + idxParentCell += 1; } } @@ -884,7 +880,8 @@ __kernel void FOpenCL__directInoutPassPerformMpi(__global unsigned char* contain struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, outsideInteractions[outInterIdx].insideIdxInBlock); FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex); - P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles , interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel); + P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles , + interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel); } } } @@ -919,7 +916,7 @@ __kernel void FOpenCL__directInPassPerform(__global unsigned char* containersPtr for(int idxInter = 0 ; idxInter < counter ; ++idxInter){ if( blockStartIdx <= interactionsIndexes[idxInter] && interactionsIndexes[idxInter] < blockEndIdx ){ const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containers, interactionsIndexes[idxInter]); - if(leafPos){ + if(leafPos != -1){ FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, leafPos) == interactionsIndexes[idxInter]); interactionsObjects[counterExistingCell] = FOpenCLGroupOfParticles_getLeaf(&containers, leafPos); neighPosition[counterExistingCell] = interactionsPosition[idxInter]; @@ -944,16 +941,18 @@ __kernel void 
FOpenCL__directInoutPassPerform(__global unsigned char* containers for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containersOther, outsideInteractions[outInterIdx].outIndex); if(leafPos != -1){ - FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containersOther, leafPos) == outsideInteractions[outInterIdx].outIndex); - struct FOpenCLGroupAttachedLeaf interParticles = FOpenCLGroupOfParticles_getLeaf(&containersOther, leafPos); + struct FOpenCLGroupAttachedLeaf interParticles = FOpenCLGroupOfParticles_getLeaf(&containersOther, outsideInteractions[outInterIdx].outsideIdxInBlock); struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, outsideInteractions[outInterIdx].insideIdxInBlock); + FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex); FOpenCLAssertLF(particles.nbParticles); FOpenCLAssertLF(interParticles.nbParticles); - P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles , interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel ); + P2POuter( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles , + interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel ); - P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), interParticles, interParticles , particles, FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition), userkernel); + P2POuter( GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), interParticles, interParticles , + particles, FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition), userkernel); } } } diff --git 
a/Src/GroupTree/Uniform/FUniformKernel.cl b/Src/GroupTree/Uniform/FUniformKernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..858e4bb3e50ba93623bb4a7cede912bd9c6b2f6a --- /dev/null +++ b/Src/GroupTree/Uniform/FUniformKernel.cl @@ -0,0 +1,986 @@ +/** This file contains the prototype for a kernel in opencl */ +// @SCALFMM_PRIVATE + + +/***************************************************************************/ +/***************************************************************************/ +/************************CHANGE THINGS HERE*********************************/ +/***************************************************************************/ + +typedef ___FSize___ FSize; +typedef ___FReal___ FReal; +typedef ___FParticleValueClass___ FParticleValueClass; +typedef long long int MortonIndex; + +#define FOpenCLGroupOfCellsCellIsEmptyFlag ((MortonIndex)-1) + +#define NbAttributesPerParticle ___NbAttributesPerParticle___ +#define NbSymbAttributes ___NbSymbAttributes___ + +#define FOpenCLGroupOfParticlesMemoryAlignementBytes ___FP2PDefaultAlignement___ +#define FOpenCLGroupOfParticlesMemoryAlignementParticles (FOpenCLGroupOfParticlesMemoryAlignementBytes/sizeof(FReal)) +#define FOpenCLGroupOfParticlesLeafIsEmptyFlag ((MortonIndex)-1) + +#define NULLPTR (0) + +#define DefaultStructAlign ___DefaultStructAlign___ + +struct FSymboleCellClass { + MortonIndex mortonIndex; + int coordinates[3]; +} __attribute__ ((aligned (DefaultStructAlign))); + +typedef FReal FPoleCellClass; +typedef FReal FLocalCellClass; + +struct FWrappeCell{ + __global struct FSymboleCellClass* symb; + __global FPoleCellClass* up; + __global FLocalCellClass* down; +}; + +#define ORDER __ORDER__ +#define POLE_SIZE __POLE_SIZE__ +#define LOCAL_SIZE __LOCAL_SIZE__ + +/***************************************************************************/ +/***************************************************************************/ 
+/***************************************************************************/ +/***************************************************************************/ + +struct OutOfBlockInteraction{ + MortonIndex outIndex; + MortonIndex insideIndex; + int relativeOutPosition; + int insideIdxInBlock; + int outsideIdxInBlock; +} __attribute__ ((aligned (DefaultStructAlign))); + +#define Between(inValue, inMin, inMax) ( (inMin) <= (inValue) && (inValue) < (inMax) ) +#define pow2(power) (1 << (power)) +#define Abs(inV) (inV < 0 ? -inV : inV) + +int3 GetPositionFromMorton(MortonIndex inIndex, const int inLevel){ + MortonIndex mask = 0x1LL; + + int3 coord; + coord.x = 0; + coord.y = 0; + coord.z = 0; + + for(int indexLevel = 0; indexLevel < inLevel ; ++indexLevel){ + coord.z |= (int)(inIndex & mask); + inIndex >>= 1; + coord.y |= (int)(inIndex & mask); + inIndex >>= 1; + coord.x |= (int)(inIndex & mask); + + mask <<= 1; + } + + return coord; +} + +MortonIndex GetMortonIndex(const int3 coord, const int inLevel) { + MortonIndex index = 0x0LL; + MortonIndex mask = 0x1LL; + // the ordre is xyz.xyz... 
+ MortonIndex mx = coord.x << 2; + MortonIndex my = coord.y << 1; + MortonIndex mz = coord.z; + + for(int indexLevel = 0; indexLevel < inLevel ; ++indexLevel){ + index |= (mz & mask); + mask <<= 1; + index |= (my & mask); + mask <<= 1; + index |= (mx & mask); + mask <<= 1; + + mz <<= 2; + my <<= 2; + mx <<= 2; + } + + return index; +} + +int GetNeighborsIndexes(const int3 coord, const int OctreeHeight, MortonIndex indexes[26], int indexInArray[26]) { + int idxNeig = 0; + int limite = 1 << (OctreeHeight - 1); + // We test all cells around + for(int idxX = -1 ; idxX <= 1 ; ++idxX){ + if(!Between(coord.x + idxX,0, limite)) continue; + + for(int idxY = -1 ; idxY <= 1 ; ++idxY){ + if(!Between(coord.y + idxY,0, limite)) continue; + + for(int idxZ = -1 ; idxZ <= 1 ; ++idxZ){ + if(!Between(coord.z + idxZ,0, limite)) continue; + + // if we are not on the current cell + if( idxX || idxY || idxZ ){ + int3 other; + + other.x = coord.x + idxX; + other.y = coord.y + idxY; + other.z = coord.z + idxZ; + + indexes[ idxNeig ] = GetMortonIndex(other, OctreeHeight - 1); + indexInArray[ idxNeig ] = ((idxX+1)*3 + (idxY+1)) * 3 + (idxZ+1); + ++idxNeig; + } + } + } + } + return idxNeig; +} + +int GetInteractionNeighbors(const int3 coord, const int inLevel, MortonIndex inNeighbors[189], int inNeighborsPosition[189]) { + // Then take each child of the parent's neighbors if not in directNeighbors + // Father coordinate + int3 parentCell; + parentCell.x = coord.x>>1; + parentCell.y = coord.y>>1; + parentCell.z = coord.z>>1; + + // Limite at parent level number of box (split by 2 by level) + const int limite = pow2(inLevel-1); + + int idxNeighbors = 0; + // We test all cells around + for(int idxX = -1 ; idxX <= 1 ; ++idxX){ + if(!Between(parentCell.x + idxX,0,limite)) continue; + + for(int idxY = -1 ; idxY <= 1 ; ++idxY){ + if(!Between(parentCell.y + idxY,0,limite)) continue; + + for(int idxZ = -1 ; idxZ <= 1 ; ++idxZ){ + if(!Between(parentCell.z + idxZ,0,limite)) continue; + + // if we are 
not on the current cell + if( idxX || idxY || idxZ ){ + int3 otherParent; + + otherParent.x = parentCell.x + idxX; + otherParent.y = parentCell.y + idxY; + otherParent.z = parentCell.z + idxZ; + + const MortonIndex mortonOther = GetMortonIndex(otherParent, inLevel-1); + + // For each child + for(int idxCousin = 0 ; idxCousin < 8 ; ++idxCousin){ + const int xdiff = ((otherParent.x<<1) | ( (idxCousin>>2) & 1)) - coord.x; + const int ydiff = ((otherParent.y<<1) | ( (idxCousin>>1) & 1)) - coord.y; + const int zdiff = ((otherParent.z<<1) | (idxCousin&1)) - coord.z; + + // Test if it is a direct neighbor + if(Abs(xdiff) > 1 || Abs(ydiff) > 1 || Abs(zdiff) > 1){ + // add to neighbors + inNeighborsPosition[idxNeighbors] = ((( (xdiff+3) * 7) + (ydiff+3))) * 7 + zdiff + 3; + inNeighbors[idxNeighbors++] = (mortonOther << 3) | idxCousin; + } + } + } + } + } + } + + return idxNeighbors; +} + + +void FSetToNullptr343(struct FWrappeCell ptrs[343]){ + int idx; + for( idx = 0 ; idx < 343 ; ++idx){ + ptrs[idx].symb = NULLPTR; + } +} + +/***************************************************************************/ +/***************************************************************************/ +/***************************************************************************/ +/***************************************************************************/ + + +struct FOpenCLGroupAttachedLeaf { + //< Nb of particles in the current leaf + FSize nbParticles; + //< Pointers to the positions of the particles + __global FReal* positionsPointers[3]; + //< Pointers to the attributes of the particles + __global FParticleValueClass* attributes[NbSymbAttributes+NbAttributesPerParticle]; +}; + +struct FOpenCLGroupAttachedLeaf BuildFOpenCLGroupAttachedLeaf(const FSize inNbParticles, __global FReal* inPositionBuffer, const size_t inLeadingPosition, + __global FParticleValueClass* inAttributesBuffer, const size_t inLeadingAttributes){ + struct FOpenCLGroupAttachedLeaf leaf; + leaf.nbParticles = 
(inNbParticles); + // Redirect pointers to position + leaf.positionsPointers[0] = inPositionBuffer; + leaf.positionsPointers[1] = (__global FReal*)(((__global unsigned char*)inPositionBuffer) + inLeadingPosition); + leaf.positionsPointers[2] = (__global FReal*)(((__global unsigned char*)inPositionBuffer) + inLeadingPosition*2); + + for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes ; ++idxAttribute){ + leaf.attributes[idxAttribute] = (__global FParticleValueClass*)(((__global unsigned char*)inPositionBuffer) + inLeadingPosition*(idxAttribute+3)); + } + + // Redirect pointers to data + if(inAttributesBuffer){ + for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){ + leaf.attributes[idxAttribute+NbSymbAttributes] = (__global FParticleValueClass*)(((__global unsigned char*)inAttributesBuffer) + idxAttribute*inLeadingAttributes); + } + } + else{ + for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){ + leaf.attributes[idxAttribute+NbSymbAttributes] = NULLPTR; + } + } + return leaf; +} + +struct FOpenCLGroupAttachedLeaf EmptyFOpenCLGroupAttachedLeaf(){ + struct FOpenCLGroupAttachedLeaf leaf; + leaf.nbParticles = -1; + // Redirect pointers to position + leaf.positionsPointers[0] = NULLPTR; + leaf.positionsPointers[1] = NULLPTR; + leaf.positionsPointers[2] = NULLPTR; + + // Redirect pointers to data + for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes+NbAttributesPerParticle ; ++idxAttribute){ + leaf.attributes[idxAttribute] = NULLPTR; + } + return leaf; +} + +bool FOpenCLGroupAttachedLeaf_isAttachedToSomething(const struct FOpenCLGroupAttachedLeaf* group){ + return (group->nbParticles != -1); +} +bool FOpenCLGroupAttachedLeaf_getNbParticles(const struct FOpenCLGroupAttachedLeaf* group){ + return (group->nbParticles); +} + + +/** One header is allocated at the beginning of each block */ +struct FOpenCLGroupOfParticlesBlockHeader{ + MortonIndex startingIndex; + MortonIndex 
endingIndex; + int numberOfLeavesInBlock; + + //< The real number of particles allocated + FSize nbParticlesAllocatedInGroup; + //< Starting point of position + size_t offsetPosition; + //< Bytes difference/offset between position + size_t positionsLeadingDim; + //< Bytes difference/offset between attributes + size_t attributeLeadingDim; + //< The total number of particles in the group + FSize nbParticlesInGroup; +}__attribute__ ((aligned (DefaultStructAlign))); + +/** Information about a leaf */ +struct FOpenCLGroupOfParticlesLeafHeader { + MortonIndex mindex; + FSize nbParticles; + size_t offSet; +}__attribute__ ((aligned (DefaultStructAlign))); + + +struct FOpenCLGroupOfParticles { + //< The size of memoryBuffer in byte + size_t allocatedMemoryInByte; + //< Pointer to a block memory + __global unsigned char* memoryBuffer; + + //< Pointer to the header inside the block memory + __global struct FOpenCLGroupOfParticlesBlockHeader* blockHeader; + //< Pointer to leaves information + __global struct FOpenCLGroupOfParticlesLeafHeader* leafHeader; + //< The total number of particles in the group + const FSize nbParticlesInGroup; + + //< Pointers to particle position x, y, z + __global FReal* particlePosition[3]; + + //< Pointers to the particles data inside the block memory + __global FParticleValueClass* attributesBuffer; + __global FParticleValueClass* particleAttributes[NbSymbAttributes+NbAttributesPerParticle]; +}; + +struct FOpenCLGroupOfParticles BuildFOpenCLGroupOfParticles(__global unsigned char* inBuffer, const size_t inAllocatedMemoryInByte, + __global unsigned char* inAttributeBuffer){ + struct FOpenCLGroupOfParticles group; + group.allocatedMemoryInByte = (inAllocatedMemoryInByte); + group.memoryBuffer = (inBuffer); + + // Move the pointers to the correct position + group.blockHeader = ((__global struct FOpenCLGroupOfParticlesBlockHeader*)inBuffer); + inBuffer += sizeof(struct FOpenCLGroupOfParticlesBlockHeader); + group.leafHeader = ((__global struct 
FOpenCLGroupOfParticlesLeafHeader*)inBuffer); + + // Init particle pointers + // Assert group.blockHeader->positionsLeadingDim == (sizeof(FReal) * group.blockHeader->nbParticlesAllocatedInGroup); + group.particlePosition[0] = (__global FReal*) (group.memoryBuffer + group.blockHeader->offsetPosition); + group.particlePosition[1] = (group.particlePosition[0] + group.blockHeader->nbParticlesAllocatedInGroup); + group.particlePosition[2] = (group.particlePosition[1] + group.blockHeader->nbParticlesAllocatedInGroup); + + // Redirect pointer to data + // Assert group.blockHeader->attributeLeadingDim == (sizeof(FParticleValueClass) * group.blockHeader->nbParticlesAllocatedInGroup); + __global unsigned char* previousPointer = ((__global unsigned char*)(group.particlePosition[2] + group.blockHeader->nbParticlesAllocatedInGroup)); + for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes ; ++idxAttribute){ + group.particleAttributes[idxAttribute] = ((__global FParticleValueClass*)previousPointer); + previousPointer += sizeof(FParticleValueClass)*group.blockHeader->nbParticlesAllocatedInGroup; + } + + if(inAttributeBuffer){ + group.attributesBuffer = (__global FParticleValueClass*)inAttributeBuffer; + for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){ + group.particleAttributes[idxAttribute+NbSymbAttributes] = ((__global FParticleValueClass*)inAttributeBuffer); + inAttributeBuffer += sizeof(FParticleValueClass)*group.blockHeader->nbParticlesAllocatedInGroup; + } + } + else{ + group.attributesBuffer = NULLPTR; + for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){ + group.particleAttributes[idxAttribute+NbSymbAttributes] = NULLPTR; + } + } + + return group; +} +MortonIndex FOpenCLGroupOfParticles_getStartingIndex(const struct FOpenCLGroupOfParticles* group) { + return group->blockHeader->startingIndex; +} +MortonIndex FOpenCLGroupOfParticles_getEndingIndex(const struct FOpenCLGroupOfParticles* 
group) { + return group->blockHeader->endingIndex; +} +int FOpenCLGroupOfParticles_getNumberOfLeaves(const struct FOpenCLGroupOfParticles* group) { + return group->blockHeader->numberOfLeavesInBlock; +} +bool FOpenCLGroupOfParticles_isInside(const struct FOpenCLGroupOfParticles* group, const MortonIndex inIndex) { + return group->blockHeader->startingIndex <= inIndex && inIndex < group->blockHeader->endingIndex; +} + + +/** Return the idx in array of the cell */ +MortonIndex FOpenCLGroupOfParticles_getLeafMortonIndex(const struct FOpenCLGroupOfParticles* group, const int id){ + return group->leafHeader[id].mindex; +} + +/** Check if a cell exist (by binary search) and return it index */ +int FOpenCLGroupOfParticles_getLeafIndex(const struct FOpenCLGroupOfParticles* group, const MortonIndex leafIdx){ + int idxLeft = 0; + int idxRight = group->blockHeader->numberOfLeavesInBlock-1; + while(idxLeft <= idxRight){ + const int idxMiddle = (idxLeft+idxRight)/2; + if(group->leafHeader[idxMiddle].mindex == leafIdx){ + return idxMiddle; + } + if(leafIdx < group->leafHeader[idxMiddle].mindex){ + idxRight = idxMiddle-1; + } + else{ + idxLeft = idxMiddle+1; + } + } + return -1; +} + + +bool FOpenCLGroupOfParticles_exists(const struct FOpenCLGroupOfParticles* group, const MortonIndex inIndex) { + return FOpenCLGroupOfParticles_isInside(group, inIndex) && (FOpenCLGroupOfParticles_getLeafIndex(group, inIndex) != -1); +} +struct FOpenCLGroupAttachedLeaf FOpenCLGroupOfParticles_getLeaf(struct FOpenCLGroupOfParticles* group, const int id){ + return BuildFOpenCLGroupAttachedLeaf(group->leafHeader[id].nbParticles, + group->particlePosition[0] + group->leafHeader[id].offSet, + group->blockHeader->positionsLeadingDim, + (group->attributesBuffer?group->particleAttributes[NbSymbAttributes] + group->leafHeader[id].offSet:NULLPTR), + group->blockHeader->attributeLeadingDim); +} + + +struct FOpenCLGroupOfCellsBlockHeader{ + MortonIndex startingIndex; + MortonIndex endingIndex; + int 
numberOfCellsInBlock; +} __attribute__ ((aligned (DefaultStructAlign))); + + +struct FOpenCLGroupOfCells { + //< The size of the memoryBuffer + size_t allocatedMemoryInByte; + //< Pointer to a block memory + __global unsigned char* memoryBuffer; + + //< Pointer to the header inside the block memory + __global struct FOpenCLGroupOfCellsBlockHeader* blockHeader; + //< Pointer to the indexes table inside the block memory + __global MortonIndex* cellIndexes; + //< Pointer to the cells inside the block memory + __global struct FSymboleCellClass* blockCells; + + //< The multipole data + __global FPoleCellClass* cellMultipoles; + //< The local data + __global FLocalCellClass* cellLocals; +}; + +struct FOpenCLGroupOfCells BuildFOpenCLGroupOfCells(__global unsigned char* inBuffer, const size_t inAllocatedMemoryInByte, + __global unsigned char* inCellMultipoles, __global unsigned char* inCellLocals){ + struct FOpenCLGroupOfCells group; + group.memoryBuffer = (inBuffer); + group.allocatedMemoryInByte = (inAllocatedMemoryInByte); + + // Move the pointers to the correct position + group.blockHeader = (__global struct FOpenCLGroupOfCellsBlockHeader*)(inBuffer); + inBuffer += sizeof(struct FOpenCLGroupOfCellsBlockHeader); + group.cellIndexes = (__global MortonIndex*)(inBuffer); + inBuffer += (group.blockHeader->numberOfCellsInBlock*sizeof(MortonIndex)); + group.blockCells = (__global struct FSymboleCellClass*)(inBuffer); + inBuffer += (group.blockHeader->numberOfCellsInBlock*sizeof(struct FSymboleCellClass)); + // Assert(((size_t)(inBuffer-group.memoryBuffer) == inAllocatedMemoryInByte); + + group.cellMultipoles = (__global FPoleCellClass*)inCellMultipoles; + group.cellLocals = (__global FLocalCellClass*)inCellLocals; + return group; +} +MortonIndex FOpenCLGroupOfCells_getStartingIndex(const struct FOpenCLGroupOfCells* group) { + return group->blockHeader->startingIndex; +} +MortonIndex FOpenCLGroupOfCells_getEndingIndex(const struct FOpenCLGroupOfCells* group) { + return 
group->blockHeader->endingIndex; +} +int FOpenCLGroupOfCells_getNumberOfCellsInBlock(const struct FOpenCLGroupOfCells* group) { + return group->blockHeader->numberOfCellsInBlock; +} +MortonIndex FOpenCLGroupOfCells_getSizeOfInterval(const struct FOpenCLGroupOfCells* group) { + return group->blockHeader->endingIndex - group->blockHeader->startingIndex; +} +bool FOpenCLGroupOfCells_isInside(const struct FOpenCLGroupOfCells* group, const MortonIndex inIndex){ + return group->blockHeader->startingIndex <= inIndex && inIndex < group->blockHeader->endingIndex; +} + +MortonIndex FOpenCLGroupOfCells_getCellMortonIndex(const struct FOpenCLGroupOfCells* group,const int cellPos){ + return group->cellIndexes[cellPos]; +} + +int FOpenCLGroupOfCells_getCellIndex(const struct FOpenCLGroupOfCells* group,const MortonIndex cellIdx){ + int idxLeft = 0; + int idxRight = group->blockHeader->numberOfCellsInBlock-1; + while(idxLeft <= idxRight){ + const int idxMiddle = (idxLeft+idxRight)/2; + if(group->cellIndexes[idxMiddle] == cellIdx){ + return idxMiddle; + } + if(cellIdx < group->cellIndexes[idxMiddle]){ + idxRight = idxMiddle-1; + } + else{ + idxLeft = idxMiddle+1; + } + } + return -1; +} + +int FOpenCLGroupOfCells_getFistChildIdx(const struct FOpenCLGroupOfCells* group, const MortonIndex parentIdx) { + int idxLeft = 0; + int idxRight = group->blockHeader->numberOfCellsInBlock-1; + while(idxLeft <= idxRight){ + int idxMiddle = (idxLeft+idxRight)/2; + if((group->cellIndexes[idxMiddle]>>3) == parentIdx){ + while(0 < idxMiddle && (group->cellIndexes[idxMiddle-1]>>3) == parentIdx){ + idxMiddle -= 1; + } + return idxMiddle; + } + if(parentIdx < (group->cellIndexes[idxMiddle]>>3)){ + idxRight = idxMiddle-1; + } + else{ + idxLeft = idxMiddle+1; + } + } + return -1; +} + + +bool FOpenCLGroupOfCells_exists(const struct FOpenCLGroupOfCells* group, const MortonIndex inIndex) { + return FOpenCLGroupOfCells_isInside(group, inIndex) && FOpenCLGroupOfCells_getCellIndex(group, inIndex) != -1; +} 
+struct FWrappeCell FOpenCLGroupOfCells_getCompleteCell(struct FOpenCLGroupOfCells* group, const int cellPos){ + struct FWrappeCell cell; + cell.symb = &group->blockCells[cellPos]; + cell.up = &group->cellMultipoles[cellPos]; + cell.down = &group->cellLocals[cellPos]; + return cell; +} + +struct FWrappeCell FOpenCLGroupOfCells_getUpCell(struct FOpenCLGroupOfCells* group, const int cellPos){ + struct FWrappeCell cell; + cell.symb = &group->blockCells[cellPos]; + cell.up = &group->cellMultipoles[cellPos]; + cell.down = NULLPTR; + return cell; +} + +struct FWrappeCell FOpenCLGroupOfCells_getDownCell(struct FOpenCLGroupOfCells* group, const int cellPos){ + struct FWrappeCell cell; + cell.symb = &group->blockCells[cellPos]; + cell.up = NULLPTR; + cell.down =&group->cellLocals[cellPos]; + return cell; +} + +struct Uptr9{ + __global unsigned char* ptrs[9]; +} __attribute__ ((aligned (DefaultStructAlign))); + +struct size_t9{ + size_t v[9]; +} __attribute__ ((aligned (DefaultStructAlign))); + +struct Uptr343{ + __global unsigned char* ptrs[343]; +}; + +/***************************************************************************/ +/***************************************************************************/ +/************************CHANGE THINGS HERE*********************************/ +/***************************************************************************/ + + +void P2M(struct FWrappeCell pole, const struct FOpenCLGroupAttachedLeaf particles, __global void* user_data) { + *pole.up = particles.nbParticles; +} + +void M2M(struct FWrappeCell pole, struct FWrappeCell child[8], const int level, __global void* user_data) { + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ + if(child[idxChild].symb){ + *pole.up += *child[idxChild].up; + } + } +} + +void M2L(struct FWrappeCell const pole, const struct FWrappeCell* distantNeighbors, + const int* relativePositions, const int size, const int level, __global void* user_data) { + for(int idxNeigh = 0 ; idxNeigh < size ; 
++idxNeigh){ + *pole.down += *distantNeighbors[idxNeigh].up; + } +} + +void L2L(const struct FWrappeCell localCell, struct FWrappeCell child[8], const int level, __global void* user_data) { + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ + if(child[idxChild].symb){ + *child[idxChild].down += *localCell.down; + } + } +} + +void L2P(const struct FWrappeCell localCell, struct FOpenCLGroupAttachedLeaf particles, __global void* user_data){ + __global long long* partdown = particles.attributes[0]; + for(FSize idxPart = 0 ; idxPart < particles.nbParticles ; ++idxPart){ + partdown[idxPart] += *localCell.down; + } +} + +void P2P(const int3 pos, + struct FOpenCLGroupAttachedLeaf targets, const struct FOpenCLGroupAttachedLeaf sources, + struct FOpenCLGroupAttachedLeaf directNeighborsParticles[27], int directNeighborsPositions[27], const int counter, __global void* user_data){ + long long cumul = sources.nbParticles-1; + + for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ + if(FOpenCLGroupAttachedLeaf_isAttachedToSomething(&directNeighborsParticles[idxNeigh])){ + cumul += directNeighborsParticles[idxNeigh].nbParticles; + } + } + + __global long long* partdown = targets.attributes[0]; + for(FSize idxPart = 0 ; idxPart < targets.nbParticles ; ++idxPart){ + partdown[idxPart] += cumul; + } +} + +void P2PRemote(const int3 pos, + struct FOpenCLGroupAttachedLeaf targets, const struct FOpenCLGroupAttachedLeaf sources, + struct FOpenCLGroupAttachedLeaf directNeighborsParticles, const int position, __global void* user_data){ + __global long long* partdown = targets.attributes[0]; + for(FSize idxPart = 0 ; idxPart < targets.nbParticles ; ++idxPart){ + partdown[idxPart] += directNeighborsParticles.nbParticles; + } +} + +void P2POuter(const int3 pos, + struct FOpenCLGroupAttachedLeaf targets, const struct FOpenCLGroupAttachedLeaf sources, + struct FOpenCLGroupAttachedLeaf directNeighborsParticles, const int position, __global void* user_data){ + __global long long* partdown = 
targets.attributes[0]; + for(FSize idxPart = 0 ; idxPart < targets.nbParticles ; ++idxPart){ + partdown[idxPart] += directNeighborsParticles.nbParticles; + } +} + +int3 getCoordinate(const struct FWrappeCell cell) { + int3 coord; + coord.x = cell.symb->coordinates[0]; + coord.y = cell.symb->coordinates[1]; + coord.z = cell.symb->coordinates[2]; + return coord; +} + + +/***************************************************************************/ +/***************************************************************************/ +/***************************************************************************/ +/***************************************************************************/ + +#define FOpenCLCheck( test ) { FOpenCLCheckCore((test), __FILE__, __LINE__); } +#define FOpenCLCheckAfterCall() { FOpenCLCheckCore((cudaGetLastError()), __FILE__, __LINE__); } +#define FOpenCLAssertLF(ARGS) if(!(ARGS)){ *((char*)0x09) = 'e'; } +//#define FOpenCLAssertLF(ARGS) ARGS; + +#define FMGetOppositeNeighIndex(index) (27-(index)-1) +#define FMGetOppositeInterIndex(index) (343-(index)-1) + +#define FOpenCLMax(x,y) ((x)<(y) ? (y) : (x)) +#define FOpenCLMin(x,y) ((x)>(y) ? 
(y) : (x)) + + +__kernel void FOpenCL__bottomPassPerform(__global unsigned char* leafCellsPtr, size_t leafCellsSize,__global unsigned char* leafCellsUpPtr, + __global unsigned char* containersPtr, size_t containersSize, + __global void* userkernel ){ + struct FOpenCLGroupOfCells leafCells = BuildFOpenCLGroupOfCells(leafCellsPtr, leafCellsSize, leafCellsUpPtr, NULLPTR); + struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr, containersSize, NULLPTR); + + const int nbLeaves = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&leafCells); + + for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){ + struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(&leafCells, idxLeaf); + FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(&leafCells, idxLeaf)); + struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, idxLeaf); + FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, idxLeaf) == FOpenCLGroupOfCells_getCellMortonIndex(&leafCells, idxLeaf)); + P2M(cell, particles, userkernel); + } +} + + +///////////////////////////////////////////////////////////////////////////////////// +/// Upward Pass +///////////////////////////////////////////////////////////////////////////////////// + +__kernel void FOpenCL__upwardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsUpPtr, + __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsUpPtr, + int idxLevel, __global void* userkernel){ + struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, NULLPTR); + const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&currentCells); + struct FOpenCLGroupOfCells childCells = BuildFOpenCLGroupOfCells(childCellsPtr, childCellsSize, childCellsUpPtr, NULLPTR); + const int childNbCells =
FOpenCLGroupOfCells_getNumberOfCellsInBlock(&childCells); + + const MortonIndex firstParent = FOpenCLMax(FOpenCLGroupOfCells_getStartingIndex(&currentCells), FOpenCLGroupOfCells_getStartingIndex(&childCells)>>3); + const MortonIndex lastParent = FOpenCLMin(FOpenCLGroupOfCells_getEndingIndex(&currentCells)-1, (FOpenCLGroupOfCells_getEndingIndex(&childCells)-1)>>3); + + int idxParentCell = FOpenCLGroupOfCells_getCellIndex(&currentCells,firstParent); + int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&childCells,firstParent); + + while(true){ + struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(&currentCells, idxParentCell); + struct FWrappeCell child[8]; + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ + child[idxChild].symb = NULLPTR; + } + + do{ + const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&childCells,idxChildCell)) & 7); + child[idxChild] = FOpenCLGroupOfCells_getUpCell(&childCells, idxChildCell); + idxChildCell += 1; + }while(idxChildCell != childNbCells && cell.symb->mortonIndex == (FOpenCLGroupOfCells_getCellMortonIndex(&childCells, idxChildCell)>>3)); + + M2M(cell, child, idxLevel, userkernel); + + if(FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxParentCell) == lastParent){ + break; + } + + idxParentCell += 1; + } +} + + +///////////////////////////////////////////////////////////////////////////////////// +/// Transfer Pass Mpi +///////////////////////////////////////////////////////////////////////////////////// + + +__kernel void FOpenCL__transferInoutPassPerformMpi(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr, + __global unsigned char* externalCellsPtr, size_t externalCellsSize, __global unsigned char* externalCellsUpPtr, + int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions, + size_t nbOutsideInteractions, __global void* userkernel){ + struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, NULLPTR,
currentCellsDownPtr); + struct FOpenCLGroupOfCells cellsOther = BuildFOpenCLGroupOfCells(externalCellsPtr, externalCellsSize, externalCellsUpPtr, NULLPTR); + + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + const int cellPos = FOpenCLGroupOfCells_getCellIndex(&cellsOther, outsideInteractions[outInterIdx].outIndex); + if(cellPos != -1){ + FOpenCLAssertLF(outsideInteractions[outInterIdx].outIndex == FOpenCLGroupOfCells_getCellMortonIndex(&cellsOther, outsideInteractions[outInterIdx].outIndex)); + struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&cellsOther, cellPos); + FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex); + struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, outsideInteractions[outInterIdx].insideIdxInBlock); + FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); + + const int relativeOutPosition = outsideInteractions[outInterIdx].relativeOutPosition; + M2L( cell , &interCell, &relativeOutPosition, + 1, idxLevel, userkernel); + } + } +} + + +///////////////////////////////////////////////////////////////////////////////////// +/// Transfer Pass +///////////////////////////////////////////////////////////////////////////////////// + + + +__kernel void FOpenCL__transferInPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, + __global unsigned char* currentCellsUpPtr, __global unsigned char* currentCellsDownPtr, + int idxLevel, __global void* userkernel){ + struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr); + + const MortonIndex blockStartIdx = FOpenCLGroupOfCells_getStartingIndex(&currentCells); + const MortonIndex blockEndIdx = FOpenCLGroupOfCells_getEndingIndex(&currentCells); + + const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&currentCells); + + for(int idxCell = 0 ; idxCell < nbCells ; ++idxCell){ +
struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, idxCell); + FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxCell)); + MortonIndex interactionsIndexes[189]; + int interactionsPosition[189]; + const int3 coord = (getCoordinate(cell)); + int counter = GetInteractionNeighbors(coord, idxLevel,interactionsIndexes,interactionsPosition); + + struct FWrappeCell interactions[343]; + FSetToNullptr343(interactions); + int counterExistingCell = 0; + + for(int idxInter = 0 ; idxInter < counter ; ++idxInter){ + if( blockStartIdx <= interactionsIndexes[idxInter] && interactionsIndexes[idxInter] < blockEndIdx ){ + const int cellPos = FOpenCLGroupOfCells_getCellIndex(&currentCells, interactionsIndexes[idxInter]); + if(cellPos != -1){ + struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&currentCells, cellPos); + interactions[counterExistingCell] = interCell; + interactionsPosition[counterExistingCell] = interactionsPosition[idxInter]; + counterExistingCell += 1; + } + } + } + + M2L( cell , interactions, interactionsPosition, + counterExistingCell, idxLevel, userkernel); + } +} + + + +__kernel void FOpenCL__transferInoutPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, + __global unsigned char* currentCellsUpPtr, + __global unsigned char* externalCellsPtr, size_t externalCellsSize, + __global unsigned char* externalCellsDownPtr, + int idxLevel, int mode, const __global struct OutOfBlockInteraction* outsideInteractions, + size_t nbOutsideInteractions, __global void* userkernel){ + struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, NULLPTR); + struct FOpenCLGroupOfCells cellsOther = BuildFOpenCLGroupOfCells(externalCellsPtr, externalCellsSize, NULLPTR, externalCellsDownPtr); + + if(mode == 1){ + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + struct FWrappeCell interCell =
FOpenCLGroupOfCells_getUpCell(&cellsOther, outsideInteractions[outInterIdx].outsideIdxInBlock); + FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex); + struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, outsideInteractions[outInterIdx].insideIdxInBlock); + FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); + + const int relativeOutPosition = outsideInteractions[outInterIdx].relativeOutPosition; + M2L( cell , &interCell, &relativeOutPosition, + 1, idxLevel, userkernel); + } + } + else{ + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + struct FWrappeCell interCell = FOpenCLGroupOfCells_getDownCell(&cellsOther, outsideInteractions[outInterIdx].outsideIdxInBlock); + FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex); + struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(&currentCells, outsideInteractions[outInterIdx].insideIdxInBlock); + FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex); + + const int relativepos = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition); + M2L( interCell , &cell, &relativepos, 1, idxLevel, userkernel); + } + } +} + + + +///////////////////////////////////////////////////////////////////////////////////// +/// Downward Pass +///////////////////////////////////////////////////////////////////////////////////// + + +__kernel void FOpenCL__downardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr, + __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsDownPtr, + int idxLevel, __global void* userkernel){ + struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, NULLPTR, currentCellsDownPtr); + const int nbCells =
FOpenCLGroupOfCells_getNumberOfCellsInBlock(&currentCells); + struct FOpenCLGroupOfCells childCells = BuildFOpenCLGroupOfCells(childCellsPtr, childCellsSize, NULLPTR, childCellsDownPtr); + const int childNbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&childCells); + + const MortonIndex firstParent = FOpenCLMax(FOpenCLGroupOfCells_getStartingIndex(&currentCells), FOpenCLGroupOfCells_getStartingIndex(&childCells)>>3); + const MortonIndex lastParent = FOpenCLMin(FOpenCLGroupOfCells_getEndingIndex(&currentCells)-1, (FOpenCLGroupOfCells_getEndingIndex(&childCells)-1)>>3); + + int idxParentCell = FOpenCLGroupOfCells_getCellIndex(&currentCells,firstParent); + int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&childCells,firstParent); + + while(true){ + struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, idxParentCell); + struct FWrappeCell child[8]; + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ + child[idxChild].symb = NULLPTR; + } + + do{ + const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&childCells,idxChildCell)) & 7); + child[idxChild] = FOpenCLGroupOfCells_getDownCell(&childCells, idxChildCell); + idxChildCell += 1; + }while(idxChildCell != childNbCells && cell.symb->mortonIndex == (FOpenCLGroupOfCells_getCellMortonIndex(&childCells, idxChildCell)>>3)); + + L2L(cell, child, idxLevel, userkernel); + + if(FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxParentCell) == lastParent){ + break; + } + + idxParentCell += 1; + } +} + + + +///////////////////////////////////////////////////////////////////////////////////// +/// Direct Pass MPI +///////////////////////////////////////////////////////////////////////////////////// + + +__kernel void FOpenCL__directInoutPassPerformMpi(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr, + __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* outsideInteractionsCl, + const __global struct
OutOfBlockInteraction* outsideInteractions, + size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){ + struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr, containersSize, containersDownPtr); + struct FOpenCLGroupOfParticles containersOther = BuildFOpenCLGroupOfParticles(externalContainersPtr, externalContainersSize, NULLPTR); + + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containersOther, outsideInteractions[outInterIdx].outIndex); + if(leafPos != -1){ + FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containersOther, leafPos) == outsideInteractions[outInterIdx].outIndex); + struct FOpenCLGroupAttachedLeaf interParticles = FOpenCLGroupOfParticles_getLeaf(&containersOther, leafPos); + struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, outsideInteractions[outInterIdx].insideIdxInBlock); + FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex); + + P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles , + interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel); + } + } +} + + +///////////////////////////////////////////////////////////////////////////////////// +/// Direct Pass +///////////////////////////////////////////////////////////////////////////////////// + + + +__kernel void FOpenCL__directInPassPerform(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr, + const int treeHeight, __global void* userkernel){ + struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr, containersSize, containersDownPtr); + + const MortonIndex blockStartIdx = 
FOpenCLGroupOfParticles_getStartingIndex(&containers); + const MortonIndex blockEndIdx = FOpenCLGroupOfParticles_getEndingIndex(&containers); + + const int nbLeaves = FOpenCLGroupOfParticles_getNumberOfLeaves(&containers); + + for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){ + struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, idxLeaf); + MortonIndex interactionsIndexes[26]; + int interactionsPosition[26]; + const int3 coord = GetPositionFromMorton(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, idxLeaf), treeHeight-1); + int counter = GetNeighborsIndexes(coord, treeHeight,interactionsIndexes,interactionsPosition); + + struct FOpenCLGroupAttachedLeaf interactionsObjects[27]; + int neighPosition[26]; + int counterExistingCell = 0; + + for(int idxInter = 0 ; idxInter < counter ; ++idxInter){ + if( blockStartIdx <= interactionsIndexes[idxInter] && interactionsIndexes[idxInter] < blockEndIdx ){ + const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containers, interactionsIndexes[idxInter]); + if(leafPos != -1){ + FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, leafPos) == interactionsIndexes[idxInter]); + interactionsObjects[counterExistingCell] = FOpenCLGroupOfParticles_getLeaf(&containers, leafPos); + neighPosition[counterExistingCell] = interactionsPosition[idxInter]; + counterExistingCell += 1; + } + } + } + + P2P( coord, particles, particles , interactionsObjects, neighPosition, counterExistingCell, userkernel); + } +} + + + +__kernel void FOpenCL__directInoutPassPerform(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr, + __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* externalContainersDownPtr, + const __global struct OutOfBlockInteraction* outsideInteractions, + size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){ + struct FOpenCLGroupOfParticles 
containers = BuildFOpenCLGroupOfParticles(containersPtr, containersSize, containersDownPtr); + struct FOpenCLGroupOfParticles containersOther = BuildFOpenCLGroupOfParticles(externalContainersPtr, externalContainersSize, externalContainersDownPtr); + + for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ + const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containersOther, outsideInteractions[outInterIdx].outIndex); + if(leafPos != -1){ + struct FOpenCLGroupAttachedLeaf interParticles = FOpenCLGroupOfParticles_getLeaf(&containersOther, outsideInteractions[outInterIdx].outsideIdxInBlock); + struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, outsideInteractions[outInterIdx].insideIdxInBlock); + + FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex); + FOpenCLAssertLF(particles.nbParticles); + FOpenCLAssertLF(interParticles.nbParticles); + + P2POuter( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles , + interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel ); + + P2POuter( GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), interParticles, interParticles , + particles, FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition), userkernel); + } + } +} + + + +///////////////////////////////////////////////////////////////////////////////////// +/// Merge Pass +///////////////////////////////////////////////////////////////////////////////////// + + + +__kernel void FOpenCL__mergePassPerform(__global unsigned char* leafCellsPtr, size_t leafCellsSize, __global unsigned char* leafCellsDownPtr, + __global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr, + __global void* userkernel){ + struct FOpenCLGroupOfCells 
leafCells = BuildFOpenCLGroupOfCells(leafCellsPtr,leafCellsSize, NULLPTR, leafCellsDownPtr); + struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr,containersSize, containersDownPtr); + + const int nbLeaves = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&leafCells); + + for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){ + struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&leafCells, idxLeaf); + FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(&leafCells, idxLeaf)); + struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, idxLeaf); + FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, idxLeaf) == FOpenCLGroupOfCells_getCellMortonIndex(&leafCells, idxLeaf)); + L2P(cell, particles, userkernel); + } +} + diff --git a/Src/GroupTree/Uniform/FUniformOpenCLCode.hpp b/Src/GroupTree/Uniform/FUniformOpenCLCode.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4556fa722c755c46fb7be820ff26badfc85a04c5 --- /dev/null +++ b/Src/GroupTree/Uniform/FUniformOpenCLCode.hpp @@ -0,0 +1,67 @@ +#ifndef FUNIFORMOPENCLCODE_HPP +#define FUNIFORMOPENCLCODE_HPP + + +#include "../../Utils/FGlobal.hpp" +#include "../StarPUUtils/FStarPUDefaultAlign.hpp" +#include "../OpenCl/FTextReplacer.hpp" + +#include "../../Kernels/Uniform/FUnifCell.hpp" + +// Initialize the types +template <class FReal, const int ORDER> +class FUniformOpenCLCode{ + FTextReplacer kernelfile; + size_t dim; + +public: + FUniformOpenCLCode() : kernelfile("../Src/GroupTree/Uniform/FUniformKernel.cl"){ + if(sizeof(FReal) == sizeof(double)){ + kernelfile.replaceAll("___FReal___", "double"); + } + else{ + kernelfile.replaceAll("___FReal___", "float"); + } + FAssertLF((typeid(FSize) == typeid(long long int))); + kernelfile.replaceAll("___FSize___", "long long int"); + kernelfile.replaceAll("___FParticleValueClass___", "long long"); + kernelfile.replaceAll("___NbSymbAttributes___", 
0); + kernelfile.replaceAll("___NbAttributesPerParticle___", 1); + const size_t structAlign = FStarPUDefaultAlign::StructAlign; + kernelfile.replaceAll("___DefaultStructAlign___", structAlign); + kernelfile.replaceAll("___FP2PDefaultAlignement___", FP2PDefaultAlignement); + + kernelfile.replaceAll("__ORDER__", ORDER); + FUnifCell<FReal, ORDER> cell; + kernelfile.replaceAll("__POLE_SIZE__", cell.getVectorSize()); + kernelfile.replaceAll("__LOCAL_SIZE__", cell.getVectorSize()); + + dim = 1; + } + + const char* getKernelCode(const int /*inDevId*/){ + return kernelfile.getContent(); + } + + void releaseKernelCode(){ + kernelfile.clear(); + } + + unsigned int getNbDims() const { + return 1; + } + + const size_t* getNbGroups(const int /*inSizeInterval*/) const { + // We return 1 + return &dim; + } + + const size_t* getGroupSize() const { + // We return 1 + return &dim; + } +}; + + +#endif // FUNIFORMOPENCLCODE_HPP + diff --git a/Src/ScalFmmConfig.h.cmake b/Src/ScalFmmConfig.h.cmake index 9ae8f6fbc3a531ceeb8fd910f5f1c6c51a9455e7..d62c7af25af646e0d202836ec805023327e2ef38 100644 --- a/Src/ScalFmmConfig.h.cmake +++ b/Src/ScalFmmConfig.h.cmake @@ -131,6 +131,12 @@ const std::string SCALFMMCompileLibs("@SCALFMM_COMPILE_LIBS@"); #cmakedefine OPENMP_SUPPORT_PRIORITY +/////////////////////////////////////////////////////// +// To use a taskname clause for tasks with KSTAR OMP4 +/////////////////////////////////////////////////////// + +#cmakedefine OPENMP_SUPPORT_TASK_NAME + /////////////////////////////////////////////////////// // To record omp4 task times for statistics /////////////////////////////////////////////////////// diff --git a/Src/Utils/FMpi.hpp b/Src/Utils/FMpi.hpp index dec2ee94da810f4f3e545ade0cf930ea8fbd7e5f..f0654ea8ea223c919ca278774fecc095be29bf9e 100644 --- a/Src/Utils/FMpi.hpp +++ b/Src/Utils/FMpi.hpp @@ -28,6 +28,7 @@ #include "FNoCopyable.hpp" #include "FMath.hpp" +#include "FAssert.hpp" //Need that for converting datas #include "FComplex.hpp" @@ -250,6 
+251,13 @@ public: void barrier() const { FMpi::Assert(MPI_Barrier(getComm()), __LINE__); } + + bool hasPendingMessage() const { + MPI_Status status; + int flag = 0; + MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, getComm(), &flag, &status); + return (flag != 0); + } }; //////////////////////////////////////////////////////// @@ -427,6 +435,44 @@ public: } } + static const size_t MaxBytesPerDivMess = 20000000; + + template <class ObjectType, class VectorType> + static int ISendSplit(const ObjectType toSend[], const size_t nbItems, + const int dest, const int tagBase, const FMpi::FComm& communicator, + VectorType* requestVector){ + const size_t totalByteToSend = (nbItems*sizeof(ObjectType)); + unsigned char*const ptrDataToSend = (unsigned char*)const_cast<ObjectType*>(toSend); + for(size_t idxSize = 0 ; idxSize < totalByteToSend ; idxSize += MaxBytesPerDivMess){ + MPI_Request currentRequest; + const size_t nbBytesInMessage = FMath::Min(MaxBytesPerDivMess, totalByteToSend-idxSize); + FAssertLF(nbBytesInMessage < std::numeric_limits<int>::max()); + FMpi::Assert( MPI_Isend(&ptrDataToSend[idxSize], int(nbBytesInMessage), MPI_BYTE , dest, + tagBase + int(idxSize/MaxBytesPerDivMess), communicator.getComm(), &currentRequest) , __LINE__); + + requestVector->push_back(currentRequest); + } + return int((totalByteToSend+MaxBytesPerDivMess-1)/MaxBytesPerDivMess); + } + + template <class ObjectType, class VectorType> + static int IRecvSplit(ObjectType toRecv[], const size_t nbItems, + const int source, const int tagBase, const FMpi::FComm& communicator, + VectorType* requestVector){ + const size_t totalByteToRecv = (nbItems*sizeof(ObjectType)); + unsigned char*const ptrDataToRecv = (unsigned char*)(toRecv); + for(size_t idxSize = 0 ; idxSize < totalByteToRecv ; idxSize += MaxBytesPerDivMess){ + MPI_Request currentRequest; + const size_t nbBytesInMessage = FMath::Min(MaxBytesPerDivMess, totalByteToRecv-idxSize); + FAssertLF(nbBytesInMessage < std::numeric_limits<int>::max()); + FMpi::Assert(
MPI_Irecv(&ptrDataToRecv[idxSize], int(nbBytesInMessage), MPI_BYTE , source, + tagBase + int(idxSize/MaxBytesPerDivMess), communicator.getComm(), &currentRequest) , __LINE__); + + requestVector->push_back(currentRequest); + } + return int((totalByteToRecv+MaxBytesPerDivMess-1)/MaxBytesPerDivMess); + } + private: /// The original communicator FComm* communicator; diff --git a/Src/Utils/FQuickSortMpi.hpp b/Src/Utils/FQuickSortMpi.hpp index 848cf58f76914aabf30433e709d02ecb282e4ef5..437d2ede4e2f81538ffb9ecdef515f88be01b197 100644 --- a/Src/Utils/FQuickSortMpi.hpp +++ b/Src/Utils/FQuickSortMpi.hpp @@ -20,14 +20,16 @@ #include "FMpi.hpp" #include "FLog.hpp" #include "FAssert.hpp" +#include "FEnv.hpp" #include <memory> #include <utility> template <class SortType, class CompareType, class IndexType = size_t> class FQuickSortMpi : public FQuickSort< SortType, IndexType> { - /** We are limited by the size of int in MPI coms */ - static const int FQS_MAX_MPI_BYTES = 2000000000; +#ifdef SCALFMM_USE_LOG + static const bool VerboseLog; +#endif // We need a structure see the algorithm detail to know more struct Partition{ @@ -96,6 +98,7 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> { const IndexType nbElementsAlreadyOwned = (inFromRightToLeft ?
globalElementBalance[idxProc].lowerPart : globalElementBalance[idxProc].greaterPart); const IndexType averageNbElementForRemainingProc = (totalRemainingElements)/(lastProcToRecv-idxProc); totalRemainingElements -= nbElementsAlreadyOwned; + FAssertLF(totalRemainingElements >= 0); if(nbElementsAlreadyOwned < averageNbElementForRemainingProc){ nbElementsToRecvPerProc[idxProc - firstProcToRecv] = (averageNbElementForRemainingProc - nbElementsAlreadyOwned); totalRemainingElements -= nbElementsToRecvPerProc[idxProc - firstProcToRecv]; @@ -103,8 +106,9 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> { else{ nbElementsToRecvPerProc[idxProc - firstProcToRecv] = 0; } - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentRank << "] nbElementsToRecvPerProc[" << idxProc << "] = " << nbElementsToRecvPerProc[idxProc - firstProcToRecv] << "\n"; ) + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentRank << "] nbElementsToRecvPerProc[" << idxProc << "] = " << nbElementsToRecvPerProc[idxProc - firstProcToRecv] << "\n"; ) } + FAssertLF(totalRemainingElements == 0); } // Store in an array the number of element to send @@ -113,7 +117,7 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> { for(int idxProc = firstProcToSend; idxProc < lastProcToSend ; ++idxProc){ const IndexType nbElementsAlreadyOwned = (inFromRightToLeft ? 
globalElementBalance[idxProc].lowerPart : globalElementBalance[idxProc].greaterPart); nbElementsToSendPerProc[idxProc-firstProcToSend] = nbElementsAlreadyOwned; - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentRank << "] nbElementsToSendPerProc[" << idxProc << "] = " << nbElementsToSendPerProc[idxProc-firstProcToSend] << "\n"; ) + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentRank << "] nbElementsToSendPerProc[" << idxProc << "] = " << nbElementsToSendPerProc[idxProc-firstProcToSend] << "\n"; ) } // Compute all the send recv but keep only the ones related to currentRank @@ -177,31 +181,23 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> { requests.reserve(whatToRecvFromWho.size()); for(int idxPack = 0 ; idxPack < int(whatToRecvFromWho.size()) ; ++idxPack){ const PackData& pack = whatToRecvFromWho[idxPack]; - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Recv from " << pack.idProc << " from " << pack.fromElement << " to " << pack.toElement << "\n"; ); -// FAssertLF((pack.toElement - pack.fromElement) * sizeof(SortType) < std::numeric_limits<int>::max()); -// FMpi::Assert( MPI_Irecv((SortType*)&recvBuffer[pack.fromElement], int((pack.toElement - pack.fromElement) * sizeof(SortType)), MPI_BYTE, pack.idProc, -// FMpi::TagQuickSort, currentComm.getComm(), &requests[idxPack]) , __LINE__); - // Work per max size - const IndexType nbElementsInPack = (pack.toElement - pack.fromElement); - const IndexType totalByteToRecv = IndexType(nbElementsInPack*sizeof(SortType)); - unsigned char*const ptrDataToRecv = (unsigned char*)&recvBuffer[pack.fromElement]; - for(IndexType idxSize = 0 ; idxSize < totalByteToRecv ; idxSize += FQS_MAX_MPI_BYTES){ - MPI_Request currentRequest; - const FSize nbBytesInMessage = int(FMath::Min(IndexType(FQS_MAX_MPI_BYTES), totalByteToRecv-idxSize)); - FAssertLF(nbBytesInMessage < std::numeric_limits<int>::max()); - FMpi::Assert( MPI_Irecv(&ptrDataToRecv[idxSize], 
int(nbBytesInMessage), MPI_BYTE, pack.idProc, - int(FMpi::TagQuickSort + idxSize/FQS_MAX_MPI_BYTES), currentComm.getComm(), ¤tRequest) , __LINE__); - - requests.push_back(currentRequest); - } + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Recv from " << pack.idProc << " from " << pack.fromElement << " to " << pack.toElement << "\n"; ); + FAssertLF(pack.toElement <= totalToRecv); + FMpi::IRecvSplit(&recvBuffer[pack.fromElement], + (pack.toElement - pack.fromElement), + pack.idProc, + FMpi::TagQuickSort, + currentComm, + &requests); + } FAssertLF(whatToRecvFromWho.size() <= requests.size()); - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << "Wait for " << requests.size() << " request \n" ); - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << "Wait for " << requests.size() << " request \n" ); + FLOG(if(VerboseLog) FLog::Controller.flush()); // Wait to complete FMpi::Assert( MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE), __LINE__ ); - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Recv Done \n"; ) - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Recv Done \n"; ) + FLOG(if(VerboseLog) FLog::Controller.flush()); // Copy to ouput variables (*inPartRecv) = recvBuffer; (*inNbElementsRecv) = totalToRecv; @@ -220,31 +216,22 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> { requests.reserve(whatToSendToWho.size()); for(int idxPack = 0 ; idxPack < int(whatToSendToWho.size()) ; ++idxPack){ const PackData& pack = whatToSendToWho[idxPack]; - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Send to " << pack.idProc << " from " << pack.fromElement << " to " << pack.toElement << "\n"; ); -// FAssertLF((pack.toElement - pack.fromElement)* sizeof(SortType) < std::numeric_limits<int>::max()); -// FMpi::Assert( 
MPI_Isend(const_cast<SortType*>(&inPartToSend[pack.fromElement]), int((pack.toElement - pack.fromElement) * sizeof(SortType)), MPI_BYTE , pack.idProc, -// FMpi::TagQuickSort, currentComm.getComm(), &requests[idxPack]) , __LINE__); - // Work per max size - const IndexType nbElementsInPack = (pack.toElement - pack.fromElement); - const IndexType totalByteToSend = IndexType(nbElementsInPack*sizeof(SortType)); - unsigned char*const ptrDataToSend = (unsigned char*)const_cast<SortType*>(&inPartToSend[pack.fromElement]); - for(IndexType idxSize = 0 ; idxSize < totalByteToSend ; idxSize += FQS_MAX_MPI_BYTES){ - MPI_Request currentRequest; - const IndexType nbBytesInMessage = int(FMath::Min(IndexType(FQS_MAX_MPI_BYTES), totalByteToSend-idxSize)); - FAssertLF(nbBytesInMessage < std::numeric_limits<int>::max()); - FMpi::Assert( MPI_Isend((SortType*)&ptrDataToSend[idxSize], int(nbBytesInMessage), MPI_BYTE , pack.idProc, - int(FMpi::TagQuickSort + idxSize/FQS_MAX_MPI_BYTES), currentComm.getComm(), ¤tRequest) , __LINE__); - - requests.push_back(currentRequest); - } + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Send to " << pack.idProc << " from " << pack.fromElement << " to " << pack.toElement << "\n"; ); + + FMpi::ISendSplit(&inPartToSend[pack.fromElement], + (pack.toElement - pack.fromElement), + pack.idProc, + FMpi::TagQuickSort, + currentComm, + &requests); } FAssertLF(whatToSendToWho.size() <= requests.size()); - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << "Wait for " << requests.size() << " request \n" ); - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Wait for " << requests.size() << " request \n" ); + FLOG(if(VerboseLog) FLog::Controller.flush()); // Wait to complete FMpi::Assert( MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE), __LINE__ ); - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] 
Send Done \n"; ) - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Send Done \n"; ) + FLOG(if(VerboseLog) FLog::Controller.flush()); } static CompareType SelectPivot(const SortType workingArray[], const IndexType currentSize, const FMpi::FComm& currentComm, bool* shouldStop){ @@ -253,33 +240,38 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> { NO_VALUES, AVERAGE_2 }; - // We need to know the max value to ensure that the pivot will be different - CompareType maxFoundValue = CompareType(workingArray[0]); // Check if all the same bool allTheSame = true; - for(int idx = 1 ; idx < currentSize && allTheSame; ++idx){ - if(workingArray[0] != workingArray[idx]){ - allTheSame = false; - } - // Keep the max - maxFoundValue = FMath::Max(maxFoundValue , CompareType(workingArray[idx])); - } // Check if empty const bool noValues = (currentSize == 0); // Get the local pivot if not empty CompareType localPivot = CompareType(0); - if(!noValues){ - localPivot = (CompareType(workingArray[currentSize/3])+CompareType(workingArray[(2*currentSize)/3]))/2; + + if(noValues == false){ + // We need to know the max value to ensure that the pivot will be different + CompareType maxFoundValue = CompareType(workingArray[0]); + // We need to know the min value to ensure that the pivot will be different + CompareType minFoundValue = CompareType(workingArray[0]); + + for(int idx = 1 ; idx < currentSize ; ++idx){ + // Keep the max + maxFoundValue = FMath::Max(maxFoundValue , CompareType(workingArray[idx])); + // Keep the min + minFoundValue = FMath::Min(minFoundValue , CompareType(workingArray[idx])); + } + allTheSame = (maxFoundValue == minFoundValue); + // Value equal to pivot are kept on the left so + localPivot = ((maxFoundValue-minFoundValue)/2) + minFoundValue; // The pivot must be different (to ensure that the partition will return two parts) if( localPivot == maxFoundValue && !allTheSame){ - FLOG( 
FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Pivot " << localPivot << " is equal max and allTheSame equal " << allTheSame << "\n"; ) - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Pivot " << localPivot << " is equal max and allTheSame equal " << allTheSame << "\n"; ) + FLOG(if(VerboseLog) FLog::Controller.flush()); localPivot -= 1; } } - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] localPivot = " << localPivot << "\n" ); - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] localPivot = " << localPivot << "\n" ); + FLOG(if(VerboseLog) FLog::Controller.flush()); //const int myRank = currentComm.processId(); const int nbProcs = currentComm.processCount(); @@ -339,20 +331,20 @@ public: bool shouldStop; const CompareType globalPivot = SelectPivot(workingArray, currentSize, currentComm, &shouldStop); if(shouldStop){ - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] shouldStop = " << shouldStop << "\n" ); - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] shouldStop = " << shouldStop << "\n" ); + FLOG(if(VerboseLog) FLog::Controller.flush()); break; } - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] globalPivot = " << globalPivot << "\n" ); - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] globalPivot = " << globalPivot << "\n" ); + FLOG(if(VerboseLog) FLog::Controller.flush()); // Split the array in two parts lower equal to pivot and greater than pivot const IndexType nbLowerElements = QsPartition(workingArray, 0, currentSize-1, globalPivot); const IndexType nbGreaterElements = currentSize - nbLowerElements; - FLOG( FLog::Controller << "SCALFMM-DEBUG 
[" << currentComm.processId() << "] After Partition: lower = " << nbLowerElements << " greater = " << nbGreaterElements << "\n"; ) - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] After Partition: lower = " << nbLowerElements << " greater = " << nbGreaterElements << "\n"; ) + FLOG(if(VerboseLog) FLog::Controller.flush()); const int currentRank = currentComm.processId(); const int currentNbProcs = currentComm.processCount(); @@ -378,19 +370,19 @@ public: globalNumberOfElementsLower += globalElementBalance[idxProc].lowerPart; } - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] globalNumberOfElementsGreater = " << globalNumberOfElementsGreater << "\n"; ) - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] globalNumberOfElementsLower = " << globalNumberOfElementsLower << "\n"; ) - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] globalNumberOfElementsGreater = " << globalNumberOfElementsGreater << "\n"; ) + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] globalNumberOfElementsLower = " << globalNumberOfElementsLower << "\n"; ) + FLOG(if(VerboseLog) FLog::Controller.flush()); // The proc rank in the middle from the percentage int procInTheMiddle; if(globalNumberOfElementsLower == 0) procInTheMiddle = -1; else if(globalNumberOfElementsGreater == 0) procInTheMiddle = currentNbProcs-1; else procInTheMiddle = int(FMath::Min(IndexType(currentNbProcs-2), (currentNbProcs*globalNumberOfElementsLower) - /(globalNumberOfElementsGreater + globalNumberOfElementsLower))); + /(globalNumberOfElementsGreater + globalNumberOfElementsLower))); - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] procInTheMiddle = " << procInTheMiddle << "\n"; ) - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) 
FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] procInTheMiddle = " << procInTheMiddle << "\n"; ) + FLOG(if(VerboseLog) FLog::Controller.flush()); // Send or receive depending on the state if(currentRank <= procInTheMiddle){ @@ -411,11 +403,11 @@ public: workingArray = fullLowerPart; currentSize = fullNbLowerElementsRecv; // Reduce working group - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Reduce group to " << 0 << " / " << procInTheMiddle << "\n"; ) - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Reduce group to " << 0 << " / " << procInTheMiddle << "\n"; ) + FLOG(if(VerboseLog) FLog::Controller.flush()); currentComm.groupReduce( 0, procInTheMiddle); - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Done\n" ); - FLOG( FLog::Controller.flush()); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Done\n" ); + FLOG(if(VerboseLog) FLog::Controller.flush()); } else { // I am in the group of the greater elements @@ -435,16 +427,16 @@ public: workingArray = fullGreaterPart; currentSize = fullNbGreaterElementsRecv; // Reduce working group - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Reduce group to " << procInTheMiddle + 1 << " / " << currentNbProcs - 1 << "\n"; ) - FLOG( FLog::Controller.flush()); + FLOG( if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Reduce group to " << procInTheMiddle + 1 << " / " << currentNbProcs - 1 << "\n"; ) + FLOG( if(VerboseLog) FLog::Controller.flush()); currentComm.groupReduce( procInTheMiddle + 1, currentNbProcs - 1); - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Done\n"; ) - FLOG( FLog::Controller.flush()); + FLOG( if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Done\n"; ) + FLOG( if(VerboseLog) 
FLog::Controller.flush()); } } - FLOG( FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Sequential sort (currentSize = " << currentSize << ")\n"; ) - FLOG( FLog::Controller.flush()); + FLOG( if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Sequential sort (currentSize = " << currentSize << ")\n"; ) + FLOG( if(VerboseLog) FLog::Controller.flush()); // Finish by a local sort FQuickSort< SortType, IndexType>::QsOmp(workingArray, currentSize, [](const SortType& v1, const SortType& v2){ return CompareType(v1) <= CompareType(v2); @@ -454,4 +446,10 @@ public: } }; + +#ifdef SCALFMM_USE_LOG +template <class SortType, class CompareType, class IndexType> +const bool FQuickSortMpi<SortType, CompareType, IndexType>::VerboseLog = FEnv::GetBool("SCALFMM_DEBUG_LOG", false); +#endif + #endif // FQUICKSORTMPI_HPP diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 0834489118c47241b822ccb1dc84da2a1e1e2f8d..9616ae3108e75d83610f8f39f1c9bb2f2983a3df 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -17,8 +17,8 @@ file( # Adding the project sources dir as an include dir INCLUDE_DIRECTORIES( - ${CMAKE_BINARY_DIR}/Src - ${CMAKE_SOURCE_DIR}/Src + ${SCALFMM_BINARY_DIR}/Src + ${SCALFMM_SOURCE_DIR}/Src ${SCALFMM_INCLUDES} ) diff --git a/Tests/GroupTree/testBlockedRotationCuda.cpp b/Tests/GroupTree/testBlockedRotationCuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fe4b36165f3c61af9f4cdbe14819c2ce76d4fa62 --- /dev/null +++ b/Tests/GroupTree/testBlockedRotationCuda.cpp @@ -0,0 +1,202 @@ +// ==== CMAKE ===== +// @FUSE_BLAS +// @FUSE_STARPU +// @FUSE_CUDA +// ================ +// Keep in private GIT + + +#include "../../Src/Utils/FGlobal.hpp" + +#include "../../Src/GroupTree/Core/FGroupTree.hpp" + +#include "../../Src/Components/FSimpleLeaf.hpp" +#include "../../Src/Containers/FVector.hpp" + +#include "../../Src/Kernels/P2P/FP2PParticleContainer.hpp" + +#include 
"../../Src/Kernels/Rotation/FRotationKernel.hpp" + +#include "../../Src/GroupTree/Rotation/FRotationCellPOD.hpp" + +#include "../../Src/Utils/FMath.hpp" +#include "../../Src/Utils/FMemUtils.hpp" +#include "../../Src/Utils/FParameters.hpp" + +#include "../../Src/Files/FRandomLoader.hpp" +#include "../../Src/Files/FFmaGenericLoader.hpp" + +#include "../../Src/GroupTree/Core/FGroupSeqAlgorithm.hpp" +#include "../../Src/GroupTree/Core/FGroupTaskAlgorithm.hpp" + +#include "../../Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp" +#include "../../Src/GroupTree/StarPUUtils/FStarPUKernelCapacities.hpp" + +#include "../../Src/GroupTree/Core/FP2PGroupParticleContainer.hpp" + +#include "../../Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp" +#include "../../Src/GroupTree/Cuda/FCudaEmptyCellSymb.hpp" +#include "../../Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp" +#include "../../Src/GroupTree/Cuda/FCudaGroupOfCells.hpp" + + +#include "../../Src/Utils/FParameterNames.hpp" + +#include <memory> + +template <class FReal> +class FCudaP2P; + +#define RANDOM_PARTICLES + +int main(int argc, char* argv[]){ + const FParameterNames LocalOptionBlocSize { {"-bs"}, "The size of the block of the blocked tree"}; + const FParameterNames LocalOptionNoValidate { {"-no-validation"}, "To avoid comparing with direct computation"}; + FHelpDescribeAndExit(argc, argv, "Test the blocked tree by counting the particles.", + FParameterDefinitions::OctreeHeight, +#ifdef RANDOM_PARTICLES + FParameterDefinitions::NbParticles, +#else + FParameterDefinitions::InputFile, +#endif + FParameterDefinitions::NbThreads, + LocalOptionBlocSize, LocalOptionNoValidate); + + // Initialize the types + typedef double FReal; + static const int ORDER = 6; + + typedef FRotationCellPODCore GroupCellSymbClass; + typedef FRotationCellPODPole<FReal,ORDER> GroupCellUpClass; + typedef FRotationCellPODLocal<FReal,ORDER> GroupCellDownClass; + typedef FRotationCellPOD<FReal,ORDER> GroupCellClass; + + typedef FP2PGroupParticleContainer<FReal> 
GroupContainerClass; + typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupContainerClass, 1, 4, FReal> GroupOctreeClass; + + typedef FStarPUCudaP2PCapacities<FRotationKernel<FReal,GroupCellClass,GroupContainerClass,ORDER>> GroupKernelClass; + typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper; + + typedef FStarPUCudaWrapper<GroupKernelClass, + FCudaEmptyCellSymb, int, int, + FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<FReal, 1, 4, FReal>, FCudaGroupAttachedLeaf<FReal, 1, 4, FReal>, FCudaP2P<FReal> > GroupCudaWrapper; + + typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, + GroupCpuWrapper, GroupCudaWrapper > GroupAlgorithm; + + // Get params + const int NbLevels = FParameters::getValue(argc,argv,FParameterDefinitions::OctreeHeight.options, 5); + const int groupSize = FParameters::getValue(argc,argv,LocalOptionBlocSize.options, 250); + + // Load the particles +#ifdef RANDOM_PARTICLES + FRandomLoader<FReal> loader(FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options, 2000), 1.0, FPoint<FReal>(0,0,0), 0); +#else + const char* const filename = FParameters::getStr(argc,argv,FParameterDefinitions::InputFile.options, "../Data/test20k.fma"); + FFmaGenericLoader<FReal> loader(filename); +#endif + FAssertLF(loader.isOpen()); + FTic timer; + + FP2PParticleContainer<FReal> allParticles; + for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){ + FPoint<FReal> particlePosition; + FReal physicalValue; +#ifdef RANDOM_PARTICLES + physicalValue = 0.10; + loader.fillParticle(&particlePosition); +#else + loader.fillParticle(&particlePosition, &physicalValue); +#endif + allParticles.push(particlePosition, 
physicalValue); + } + std::cout << "Particles loaded in " << timer.tacAndElapsed() << "s\n"; + + // Put the data into the tree + timer.tic(); + GroupOctreeClass groupedTree(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), groupSize, &allParticles); + groupedTree.printInfoBlocks(); + std::cout << "Tree created in " << timer.tacAndElapsed() << "s\n"; + + // Run the algorithm + GroupKernelClass groupkernel(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox()); + GroupAlgorithm groupalgo(&groupedTree,&groupkernel); + + timer.tic(); + groupalgo.execute(); + std::cout << "Kernel executed in in " << timer.tacAndElapsed() << "s\n"; + + // Validate the result + if(FParameters::existParameter(argc, argv, LocalOptionNoValidate.options) == false){ + FSize offsetParticles = 0; + FReal*const allPhysicalValues = allParticles.getPhysicalValues(); + FReal*const allPosX = const_cast<FReal*>( allParticles.getPositions()[0]); + FReal*const allPosY = const_cast<FReal*>( allParticles.getPositions()[1]); + FReal*const allPosZ = const_cast<FReal*>( allParticles.getPositions()[2]); + + groupedTree.forEachCellLeaf<FP2PGroupParticleContainer<FReal> >([&](GroupCellClass cellTarget, FP2PGroupParticleContainer<FReal> * leafTarget){ + const FReal*const physicalValues = leafTarget->getPhysicalValues(); + const FReal*const posX = leafTarget->getPositions()[0]; + const FReal*const posY = leafTarget->getPositions()[1]; + const FReal*const posZ = leafTarget->getPositions()[2]; + const FSize nbPartsInLeafTarget = leafTarget->getNbParticles(); + + for(FSize idxPart = 0 ; idxPart < nbPartsInLeafTarget ; ++idxPart){ + allPhysicalValues[offsetParticles + idxPart] = physicalValues[idxPart]; + allPosX[offsetParticles + idxPart] = posX[idxPart]; + allPosY[offsetParticles + idxPart] = posY[idxPart]; + allPosZ[offsetParticles + idxPart] = posZ[idxPart]; + } + + offsetParticles += nbPartsInLeafTarget; + }); + + FAssertLF(offsetParticles == loader.getNumberOfParticles()); + + FReal*const 
allDirectPotentials = allParticles.getPotentials(); + FReal*const allDirectforcesX = allParticles.getForcesX(); + FReal*const allDirectforcesY = allParticles.getForcesY(); + FReal*const allDirectforcesZ = allParticles.getForcesZ(); + + for(int idxTgt = 0 ; idxTgt < offsetParticles ; ++idxTgt){ + for(int idxMutual = idxTgt + 1 ; idxMutual < offsetParticles ; ++idxMutual){ + FP2PR::MutualParticles( + allPosX[idxTgt],allPosY[idxTgt],allPosZ[idxTgt], allPhysicalValues[idxTgt], + &allDirectforcesX[idxTgt], &allDirectforcesY[idxTgt], &allDirectforcesZ[idxTgt], &allDirectPotentials[idxTgt], + allPosX[idxMutual],allPosY[idxMutual],allPosZ[idxMutual], allPhysicalValues[idxMutual], + &allDirectforcesX[idxMutual], &allDirectforcesY[idxMutual], &allDirectforcesZ[idxMutual], &allDirectPotentials[idxMutual] + ); + } + } + + FMath::FAccurater<FReal> potentialDiff; + FMath::FAccurater<FReal> fx, fy, fz; + offsetParticles = 0; + groupedTree.forEachCellLeaf<FP2PGroupParticleContainer<FReal> >([&](GroupCellClass cellTarget, FP2PGroupParticleContainer<FReal> * leafTarget){ + const FReal*const potentials = leafTarget->getPotentials(); + const FReal*const forcesX = leafTarget->getForcesX(); + const FReal*const forcesY = leafTarget->getForcesY(); + const FReal*const forcesZ = leafTarget->getForcesZ(); + const FSize nbPartsInLeafTarget = leafTarget->getNbParticles(); + + for(int idxTgt = 0 ; idxTgt < nbPartsInLeafTarget ; ++idxTgt){ + potentialDiff.add(allDirectPotentials[idxTgt + offsetParticles], potentials[idxTgt]); + fx.add(allDirectforcesX[idxTgt + offsetParticles], forcesX[idxTgt]); + fy.add(allDirectforcesY[idxTgt + offsetParticles], forcesY[idxTgt]); + fz.add(allDirectforcesZ[idxTgt + offsetParticles], forcesZ[idxTgt]); + } + + offsetParticles += nbPartsInLeafTarget; + }); + + std::cout << "Error : Potential " << potentialDiff << "\n"; + std::cout << "Error : fx " << fx << "\n"; + std::cout << "Error : fy " << fy << "\n"; + std::cout << "Error : fz " << fz << "\n"; + } + + return 
0; +} + + diff --git a/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp b/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp index 07015cfa8232431bddcfe78a771817f07437114e..522418cb77d05623f62cc10ed0d6808a13f863d4 100644 --- a/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp +++ b/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp @@ -80,7 +80,7 @@ int main(int argc, char* argv[]){ typedef FGroupTestParticleContainer<FReal> GroupContainerClass; typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupContainerClass, 0, 1, long long int> GroupOctreeClass; - typedef FStarPUAllCpuCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >> GroupKernelClass; + typedef FStarPUAllCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >> GroupKernelClass; typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper; typedef FStarPUCudaWrapper<GroupKernelClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, diff --git a/Tests/GroupTree/testBlockedWithOpenCLAlgorithm.cpp b/Tests/GroupTree/testBlockedWithOpenCLAlgorithm.cpp index 16b3c4916f117f39c6d54090b37c2304309c0857..264467596eab55dcffcf462ba13eb8c9253af888 100644 --- a/Tests/GroupTree/testBlockedWithOpenCLAlgorithm.cpp +++ b/Tests/GroupTree/testBlockedWithOpenCLAlgorithm.cpp @@ -44,13 +44,15 @@ #include "../../Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp" int main(int argc, char* argv[]){ - setenv("STARPU_NCPU","0",1); - setenv("STARPU_NOPENCL","1",1); - //setenv("STARPU_OPENCL_ONLY_ON_CPUS","1",1); - setenv("STARPU_OPENCL_ON_CPUS","1",1); - - setenv("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY","1",1); - setenv("STARPU_OPENCL_PIPELINE","0",0); // synchronous task + if(getenv("HOSTNAME") && strcmp(getenv("HOSTNAME"),"berenger-HP-ProBook-640-G1") == 0){ + setenv("STARPU_NCPU","0",1); + setenv("STARPU_NOPENCL","1",1); + 
setenv("STARPU_OPENCL_ONLY_ON_CPUS","1",1); + setenv("STARPU_OPENCL_ON_CPUS","1",1); + + setenv("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY","1",1); + setenv("STARPU_OPENCL_PIPELINE","0",0); // synchronous task + } const FParameterNames LocalOptionBlocSize { {"-bs"}, diff --git a/Tests/Kernels/testRotationAlgorithmProc.cpp b/Tests/Kernels/testRotationAlgorithmProc.cpp index 49d51dc14933092eef79049927cf4b2c1f7ad2d6..22613ef15926552f3c6e37839710749d26f7a147 100644 --- a/Tests/Kernels/testRotationAlgorithmProc.cpp +++ b/Tests/Kernels/testRotationAlgorithmProc.cpp @@ -115,8 +115,8 @@ int main(int argc, char* argv[]) tree.getBoxWidth(),tree.getHeight(), &finalParticles, &balancer); { // ----------------------------------------------------- - std::cout << "Creating & Inserting " << loader.getNumberOfParticles() << " particles ..." << std::endl; - std::cout << "For a total of " << loader.getNumberOfParticles() * app.global().processCount() << " particles ..." << std::endl; + std::cout << app.global().processId() << "] Creating & Inserting " << finalParticles.getSize() << " particles ..." << std::endl; + std::cout << app.global().processId() << "] For a total of " << loader.getNumberOfParticles() * app.global().processCount() << " particles ..." << std::endl; std::cout << "\tHeight : " << TreeHeight << " \t sub-height : " << SubTreeHeight << std::endl; time.tic(); @@ -126,8 +126,17 @@ int main(int argc, char* argv[]) } time.tac(); - std::cout << "Done " << "(@Creating and Inserting Particles = " + std::cout << app.global().processId() << "] Done " << "(@Creating and Inserting Particles = " << time.elapsed() << "s)." 
<< std::endl; + + FSize minPart = std::numeric_limits<FSize>::max(); + FSize maxPart = std::numeric_limits<FSize>::min(); + tree.forEachLeaf([&](LeafClass* lf){ + minPart = FMath::Min(lf->getSrc()->getNbParticles(), minPart); + maxPart = FMath::Max(lf->getSrc()->getNbParticles(), maxPart); + }); + + std::cout << app.global().processId() << "] Min nb part " << minPart << " Max nb part " << maxPart << std::endl; } // ----------------------------------------------------- delete[] particles; @@ -139,13 +148,14 @@ int main(int argc, char* argv[]) KernelClass kernels(TreeHeight, loader.getBoxWidth(), loader.getCenterOfBox()); FmmClass algorithm(app.global(),&tree, &kernels); time.tac(); - std::cout << "Done " << "(@Init = " << time.elapsed() << "s)." << std::endl; + std::cout << app.global().processId() << "] Done " << "(@Init = " << time.elapsed() << "s)." << std::endl; time.tic(); algorithm.execute(); time.tac(); - std::cout << "Done " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl; + std::cout << app.global().processId() << "] Done " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl; } // ----------------------------------------------------- + app.global().barrier(); return 0; } diff --git a/Tests/noDist/testParticlesDistrMpi.cpp b/Tests/noDist/testParticlesDistrMpi.cpp new file mode 100644 index 0000000000000000000000000000000000000000..41ef6b67bd253e4dc627ccb22473055f601dea9d --- /dev/null +++ b/Tests/noDist/testParticlesDistrMpi.cpp @@ -0,0 +1,152 @@ +// =================================================================================== +// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Berenger Bramas +// olivier.coulaud@inria.fr, berenger.bramas@inria.fr +// This software is a computer program whose purpose is to compute the FMM. +// +// This software is governed by the CeCILL-C and LGPL licenses and +// abiding by the rules of distribution of free software. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public and CeCILL-C Licenses for more details. +// "http://www.cecill.info". +// "http://www.gnu.org/licenses". +// =================================================================================== + +// ==== CMAKE ===== +// @FUSE_MPI +// ================ + +#include <iostream> + +#include <cstdio> +#include <cstdlib> + + +#include "../../Src/Kernels/Rotation/FRotationCell.hpp" +#include "../../Src/Kernels/Rotation/FRotationKernel.hpp" + +#include "../../Src/Components/FSimpleLeaf.hpp" +#include "../../Src/Kernels/P2P/FP2PParticleContainerIndexed.hpp" + +#include "../../Src/Utils/FParameters.hpp" +#include "../../Src/Utils/FMemUtils.hpp" + +#include "../../Src/Containers/FOctree.hpp" +#include "../../Src/Containers/FVector.hpp" + +#include "../../Src/Files/FRandomLoader.hpp" +#include "../../Src/Files/FMpiTreeBuilder.hpp" + +#include "../../Src/Core/FFmmAlgorithm.hpp" +#include "../../Src/Core/FFmmAlgorithmThread.hpp" +#include "../../Src/Core/FFmmAlgorithmThreadProc.hpp" + +#include "../../Src/BalanceTree/FLeafBalance.hpp" + +#include "../../Src/Utils/FParameterNames.hpp" + +/** + * This program runs the FMM Algorithm Distributed with the Rotation kernel + */ + +// Simply create particles and try the kernels +int main(int argc, char* argv[]) +{ + FHelpDescribeAndExit(argc, argv, + "Test with MPI the chebyshev FMM and compare it to the direct computation for debugging purpose.", + FParameterDefinitions::NbParticles, FParameterDefinitions::OctreeHeight, + FParameterDefinitions::OctreeSubHeight, FParameterDefinitions::NbThreads); + + typedef double FReal; + + + FMpi app(argc,argv); + + const FSize nbParticles = FParameters::getValue(argc,argv, FParameterDefinitions::NbParticles.options, 10000000ULL); + const unsigned int TreeHeight = 
FParameters::getValue(argc, argv, FParameterDefinitions::OctreeHeight.options, 5); + FTic time; + + std::cout << ">> This executable has to be used to test Proc Rotation Algorithm. \n"; + + // init particles position and physical value + struct TestParticle{ + FSize idxPart; + FPoint<FReal> position; + FReal physicalValue; + const FPoint<FReal>& getPosition(){ + return position; + } + }; + + // open particle file + std::cout << "Creating : " << nbParticles << "\n" << std::endl; + + time.tic(); + const FSize totalNbParticles = nbParticles*app.global().processCount(); + TestParticle* particles = new TestParticle[totalNbParticles]; + memset(particles,0,(unsigned int) (sizeof(TestParticle)*totalNbParticles)); + for(int idxProc = 0 ; idxProc < app.global().processCount() ; ++idxProc){ + FRandomLoader<FReal> loader(nbParticles, 1.0, FPoint<FReal>(0,0,0), idxProc); + for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){ + loader.fillParticle(&particles[idxPart + idxProc*nbParticles].position); + particles[idxPart + idxProc*nbParticles].physicalValue = 1.0; + particles[idxPart + idxProc*nbParticles].idxPart = idxPart + idxProc*nbParticles; + } + } + + FVector<TestParticle> finalParticles; + FLeafBalance balancer; + FMpiTreeBuilder< FReal,TestParticle >::DistributeArrayToContainer(app.global(),&particles[app.global().processId()*nbParticles], + nbParticles, + FPoint<FReal>(0,0,0), + 1.0,TreeHeight, + &finalParticles, &balancer); + + app.global().barrier(); + + std::cout << "Testing : " << finalParticles.getSize() << "\n" << std::endl; + + for(FSize idxRes = 0 ; idxRes < finalParticles.getSize() ; ++idxRes){ + FAssertLF(0 <= finalParticles[idxRes].idxPart, "idxRes ", idxRes, " finalParticles[idxRes].idxPart ", finalParticles[idxRes].idxPart); + FAssertLF(finalParticles[idxRes].idxPart < totalNbParticles, "idxRes ", idxRes, " finalParticles[idxRes].idxPart ", finalParticles[idxRes].idxPart); + + const TestParticle correctPart = 
particles[finalParticles[idxRes].idxPart]; + const TestParticle testPart = finalParticles[idxRes]; + + FAssertLF(testPart.idxPart == correctPart.idxPart); + FAssertLF(testPart.position.getX() == correctPart.position.getX()); + FAssertLF(testPart.position.getY() == correctPart.position.getY()); + FAssertLF(testPart.position.getZ() == correctPart.position.getZ()); + FAssertLF(testPart.physicalValue == correctPart.physicalValue); + } + + std::cout << "Done\n" << std::endl; + + app.global().barrier(); + + std::unique_ptr<int[]> particlesExist(new int[totalNbParticles]); + memset(particlesExist.get(), 0, sizeof(int)*totalNbParticles); + + for(FSize idxRes = 0 ; idxRes < finalParticles.getSize() ; ++idxRes){ + FAssertLF(particlesExist[finalParticles[idxRes].idxPart] == 0); + particlesExist[finalParticles[idxRes].idxPart] = 1; + } + + std::unique_ptr<int[]> particlesReduced(new int[totalNbParticles]); + memset(particlesReduced.get(), 0, sizeof(int)*totalNbParticles); + + FAssert(totalNbParticles <= std::numeric_limits<int>::max()); + FMpi::Assert(MPI_Allreduce(particlesExist.get(), particlesReduced.get(), int(totalNbParticles), + MPI_INT, MPI_SUM, + app.global().getComm()), __LINE__); + + for(FSize idxPart = 0 ; idxPart < totalNbParticles ; ++idxPart){ + FAssertLF(particlesReduced[idxPart] == 1, idxPart, " " , particlesReduced[idxPart]); + } + + return 0; +} + + diff --git a/UTests/CMakeLists.txt b/UTests/CMakeLists.txt index 3e3d0fdfaff0efacee1e02a3cf6617c6bfc7d3b6..56f2164f315875134d858d7f51760b21b0175fa3 100644 --- a/UTests/CMakeLists.txt +++ b/UTests/CMakeLists.txt @@ -57,8 +57,8 @@ file( # Adding the project sources dir as an include dir INCLUDE_DIRECTORIES( - ${CMAKE_BINARY_DIR}/Src - ${CMAKE_SOURCE_DIR}/Src + ${SCALFMM_BINARY_DIR}/Src + ${SCALFMM_SOURCE_DIR}/Src ${SCALFMM_INCLUDES} ) diff --git a/Utils/CMakeLists.txt b/Utils/CMakeLists.txt index 5cf190484b93d0a1564a3241a39c971d04da08c9..681a92634ce90e8d85236aa69ed561ba27dedd12 100644 --- a/Utils/CMakeLists.txt +++ 
b/Utils/CMakeLists.txt @@ -17,8 +17,8 @@ file( # Adding the project sources dir as an include dir INCLUDE_DIRECTORIES( - ${CMAKE_BINARY_DIR}/Src - ${CMAKE_SOURCE_DIR}/Src + ${SCALFMM_BINARY_DIR}/Src + ${SCALFMM_SOURCE_DIR}/Src ${SCALFMM_INCLUDES} )