Commit efa505d5 authored by PIACIBELLO Cyrille's avatar PIACIBELLO Cyrille
parents 5108b944 64cdf22a
......@@ -12,7 +12,7 @@ set(CMAKE_DISABLE_IN_SOURCE_BUILD ON)
# Project Declaration
#===========================================================================
project(SCALFMM C CXX)
INCLUDE( CMakeDependentOption )
# check if compiling into source directories
string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" insource)
if(insource)
......@@ -21,12 +21,12 @@ endif(insource)
# Make the project's own CMake modules findable by include()/find_package(),
# and keep the directory in a dedicated variable for later reuse.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/CMakeModules/)
set(SCALFMM_CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/CMakeModules)
# NOTE(review): GetCpuInfos is included and invoked here and again a few lines
# below - confirm whether this first pair is a leftover that can be dropped.
include(GetCpuInfos)
GetCpuInfos()
#
# Adds the CMAKE_DEPENDENT_OPTION command
INCLUDE(CMakeDependentOption)
# Load the CPU-detection module and run it
include(GetCpuInfos)
GetCpuInfos()
#===========================================================================
# Version Number
......@@ -53,7 +53,6 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
set(MORSE_CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/CMakeModules/morse )
endif()
include(MorseInit)
#
# Options
# SCALFMM_USE_MPI: user switch for the MPI build (defaults to OFF).
option( SCALFMM_USE_MPI "Set to ON to build ScalFMM with MPI" OFF )
......@@ -89,6 +88,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
option( SCALFMM_DISABLE_NATIVE_OMP4 "Set to ON to disable the gcc/intel omp4" OFF )
option( SCALFMM_TIME_OMPTASKS "Set to ON to time omp4 tasks and generate output file" OFF )
endif()
message(STATUS "AVANT ${CMAKE_CXX_COMPILER_ID}" )
if( SCALFMM_USE_MPI )
try_compile(COMPILE_INTEL ${CMAKE_CURRENT_BINARY_DIR}
${SCALFMM_CMAKE_MODULE_PATH}/compileTestIntel.cpp
......@@ -101,6 +101,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
# Set scalfmm to default libraries
set(SCALFMM_LIBRARIES "")
# Baseline compiler flags; later sections are expected to extend this string.
set(SCALFMM_CXX_FLAGS "-std=c++11 -fpic -Wall")
# Report the flags just set. A variable reference needs the ${...} form;
# a bare "$NAME" is printed literally by message().
message(STATUS "FLAGS = ${SCALFMM_CXX_FLAGS}")
#
#
# Test if openmp is here
......@@ -127,7 +128,11 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
# INTEL
IF (APPLE)
IF( CPUOPTION_SSE42 )
set(SSE_FLAGS "-msse4 -mfpmath=sse") # -mtune=native -march=native
ELSEIF (CPUOPTION_SSE3)
set(SSE_FLAGS "-msse3 -mfpmath=sse") # -mtune=native -march=native
ENDIF (CPUOPTION_SSE42)
else(APPLE)
set(AVX_FLAGS "-march=native -axCORE-AVX2,CORE-AVX-I,AVX") #-mavx
set(AVX2_FLAGS "-march=native -axCORE-AVX2,CORE-AVX-I") #-march=core-avx2
......@@ -150,8 +155,12 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
endif()
endif()
IF (APPLE)
# set(SSE_FLAGS "-msse4 -mfpmath=sse") # -mtune=native -march=native
IF( CPUOPTION_SSE42 )
set(SSE_FLAGS "-msse4 -mfpmath=sse") # -mtune=native -march=native
set(SSE_FLAGS "-msse3 -mfpmath=sse")
ELSEIF (CPUOPTION_SSE3)
set(SSE_FLAGS "-msse3 -mfpmath=sse") # -mtune=native -march=native
ENDIF (CPUOPTION_SSE42)
set(AVX_FLAGS "-mtune=native -march=avx")
set(AVX2_FLAGS "-mtune=native -march=native -mmic")
else(APPLE)
......@@ -286,7 +295,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
if( SCALFMM_USE_MKL_AS_BLAS )
set(BLA_VENDOR "Intel10_64lp_seq")
find_package(BLASEXT) # not REQUIRED
find_package(BLASEXT QUIET) # not REQUIRED
if(BLAS_LIBRARY_DIRS)
# the RPATH to be used when installing
......@@ -644,14 +653,26 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
# Locate EZTrace through pkg-config and, when available, wire it into the
# build (link directories, libraries, include directories) and declare the
# per-operator tracing options. When pkg-config or EZTrace is missing,
# SCALFMM_USE_EZTRACE is switched OFF.
if(PKG_CONFIG_FOUND)
  # Let pkg-config also look into the prefixes listed in CMAKE_PREFIX_PATH.
  set(PKG_CONFIG_USE_CMAKE_PREFIX_PATH "ON")
  pkg_search_module( EZTrace REQUIRED eztrace)
  # pkg_search_module() reports its result in <prefix>_FOUND, i.e.
  # EZTrace_FOUND for the prefix used above.
  if(EZTrace_FOUND)
    link_directories(${EZTrace_LIBRARY_DIRS})
    # NOTE(review): EZTrace_LIBRARIES is passed to link_libraries() twice,
    # once together with -leztrace-memory - confirm whether both are wanted.
    link_libraries( ${EZTrace_LIBRARIES} -leztrace-memory)
    link_libraries( ${EZTrace_LIBRARIES})
    # The MPI tracing module is only linked for MPI builds.
    IF( SCALFMM_USE_MPI )
      link_libraries(-leztrace-mpi)
    ENDIF(SCALFMM_USE_MPI)
    include_directories(${EZTrace_INCLUDE_DIRS})
    MESSAGE(STATUS "EZTRACE: ${EZTrace_INCLUDE_DIRS} ${EZTrace_LIBRARY_DIRS} ${EZTrace_LIBRARIES}")
    # Tracing switches; each one is only offered when SCALFMM_USE_EZTRACE is
    # true and is forced to OFF otherwise.
    # NOTE(review): SCALFMM_TRACE_M2L is declared twice with different
    # defaults (ON here, OFF below) - confirm which default is intended.
    CMAKE_DEPENDENT_OPTION(SCALFMM_TRACE_M2L "Set to ON to trace M2L operator" ON "SCALFMM_USE_EZTRACE" OFF )
    CMAKE_DEPENDENT_OPTION(SCALFMM_TRACE_ALGO "Set to ON to trace the full algorithm (all operators)" ON "SCALFMM_USE_EZTRACE" OFF )
    CMAKE_DEPENDENT_OPTION(SCALFMM_TRACE_P2M "Set to ON to trace P2M operator" OFF "SCALFMM_USE_EZTRACE" OFF )
    CMAKE_DEPENDENT_OPTION(SCALFMM_TRACE_M2M "Set to ON to trace M2M operator" OFF "SCALFMM_USE_EZTRACE" OFF )
    CMAKE_DEPENDENT_OPTION(SCALFMM_TRACE_M2L "Set to ON to trace M2L operator" OFF "SCALFMM_USE_EZTRACE" OFF )
    CMAKE_DEPENDENT_OPTION(SCALFMM_TRACE_L2L "Set to ON to trace L2L operator" OFF "SCALFMM_USE_EZTRACE" OFF )
    CMAKE_DEPENDENT_OPTION(SCALFMM_TRACE_P2P "Set to ON to trace P2P operator" OFF "SCALFMM_USE_EZTRACE" OFF )
  else(EZTrace_FOUND)
    # NOTE(review): with REQUIRED above, a missing module presumably aborts
    # the configuration before this branch is reached - confirm.
    MESSAGE(WARNING "Eztrace not found - EZTRACE Is set to OFF")
    set(SCALFMM_USE_EZTRACE OFF)
  endif(EZTrace_FOUND)
else(PKG_CONFIG_FOUND)
  MESSAGE(WARNING "PKG-CONFIG not found- EZTRACE Is set to NONE")
  MESSAGE(WARNING "PKG-CONFIG not found - EZTRACE Is set to OFF")
  set(SCALFMM_USE_EZTRACE OFF)
endif(PKG_CONFIG_FOUND)
......@@ -817,7 +838,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
# Summary: which numerical back-ends were selected
message(STATUS "SCALFMM_USE_BLAS = ${SCALFMM_USE_BLAS}")
message(STATUS "SCALFMM_USE_FFT = ${SCALFMM_USE_FFT}")
message(STATUS "SCALFMM_USE_MKL = ${SCALFMM_USE_MKL}")
#
# Summary: compiler flags and link libraries
message(STATUS "CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}")
message(STATUS "SCALFMM_CXX_FLAGS = ${SCALFMM_CXX_FLAGS}")
message(STATUS "SCALFMM_LIBRARIES = ${SCALFMM_LIBRARIES}")
......
......@@ -43,8 +43,8 @@ if(NOT DEFINED PROCESSOR_COUNT)
# macOS: read the number of cores from system_profiler into NUMBER_OF_CPU.
# Nothing is set when the tool cannot be found.
if(APPLE)
  find_program(cmd_sys_pro "system_profiler")
  if(cmd_sys_pro)
    # Query only the hardware data type instead of the full system report.
    execute_process(COMMAND ${cmd_sys_pro} SPHardwareDataType OUTPUT_VARIABLE info)
    # Accept both spellings of the label ("Of" / "of") and keep only the
    # captured digits.
    string(REGEX REPLACE "^.*Total Number [Oo]f Cores: ([0-9]+).*$" "\\1"
      NUMBER_OF_CPU "${info}")
  endif()
endif()
......
......@@ -264,10 +264,21 @@ if(PTSCOTCH_LIBRARIES)
if(CMAKE_THREAD_LIBS_INIT)
list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
endif()
if(UNIX OR WIN32)
set(Z_LIBRARY "Z_LIBRARY-NOTFOUND")
find_library(Z_LIBRARY NAMES z)
if(Z_LIBRARY)
list(APPEND REQUIRED_LIBS "-lz")
endif()
set(M_LIBRARY "M_LIBRARY-NOTFOUND")
find_library(M_LIBRARY NAMES m)
if(M_LIBRARY)
list(APPEND REQUIRED_LIBS "-lm")
endif()
list(APPEND REQUIRED_LIBS "-lz -lrt")
set(RT_LIBRARY "RT_LIBRARY-NOTFOUND")
find_library(RT_LIBRARY NAMES rt)
if(RT_LIBRARY)
list(APPEND REQUIRED_LIBS "-lrt")
endif()
# set required libraries for link
set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
......
......@@ -233,10 +233,21 @@ if(SCOTCH_LIBRARIES)
if(CMAKE_THREAD_LIBS_INIT)
list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
endif()
if(UNIX OR WIN32)
set(Z_LIBRARY "Z_LIBRARY-NOTFOUND")
find_library(Z_LIBRARY NAMES z)
if(Z_LIBRARY)
list(APPEND REQUIRED_LIBS "-lz")
endif()
set(M_LIBRARY "M_LIBRARY-NOTFOUND")
find_library(M_LIBRARY NAMES m)
if(M_LIBRARY)
list(APPEND REQUIRED_LIBS "-lm")
endif()
list(APPEND REQUIRED_LIBS "-lz -lrt")
set(RT_LIBRARY "RT_LIBRARY-NOTFOUND")
find_library(RT_LIBRARY NAMES rt)
if(RT_LIBRARY)
list(APPEND REQUIRED_LIBS "-lrt")
endif()
# set required libraries for link
set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
......
......@@ -62,6 +62,7 @@ public:
FAssertLF(tree, "tree cannot be null");
FAssertLF(kernels, "kernels cannot be null");
FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");
FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());
......
......@@ -67,6 +67,7 @@ public:
FAssertLF(tree, "tree cannot be null");
FAssertLF(-1 <= inUpperLevel, "inUpperLevel cannot be < -1");
FAssertLF(leafLevelSeperationCriteria < 3, "Separation criteria should be < 3");
FAbstractAlgorithm::setNbLevelsInTree(extendedTreeHeight());
......
......@@ -27,6 +27,7 @@
#include "../Containers/FVector.hpp"
#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"
/**
* @author Berenger Bramas (berenger.bramas@inria.fr)
......@@ -45,7 +46,7 @@
*
* Upon destruction, this class does not deallocate pointers given to its constructor.
*/
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmSectionTask : public FAbstractAlgorithm, public FAlgorithmTimers {
OctreeClass* const tree; ///< The octree to work on
......@@ -74,13 +75,14 @@ public:
FAssertLF(tree, "tree cannot be null");
FAssertLF(inKernels, "kernels cannot be null");
FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");
this->kernels = new KernelClass*[MaxThreads];
#pragma omp parallel for schedule(static)
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp critical (InitFFmmAlgorithmSectionTask)
{
this->kernels[idxThread] = new KernelClass(*inKernels);
this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
}
}
......@@ -327,7 +329,7 @@ protected:
// There is a maximum of 26 neighbors
ContainerClass* neighbors[27];
const int SizeShape = 3*3*3;
const int SizeShape = P2PExclusionClass::SizeShape;
FVector<typename OctreeClass::Iterator> shapes[SizeShape];
typename OctreeClass::Iterator octreeIterator(tree);
......@@ -337,7 +339,7 @@ protected:
// Coloring all the cells
do{
const FTreeCoordinate& coord = octreeIterator.getCurrentGlobalCoordinate();
const int shapePosition = (coord.getX()%3)*9 + (coord.getY()%3)*3 + (coord.getZ()%3);
const int shapePosition = P2PExclusionClass::GetShapeIdx(coord);
shapes[shapePosition].push(octreeIterator);
......
......@@ -27,6 +27,7 @@
#include "../Containers/FVector.hpp"
#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"
/**
* @author Berenger Bramas (berenger.bramas@inria.fr)
......@@ -39,7 +40,7 @@
*
* Of course, this class does not deallocate the pointers given as arguments.
*/
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmTask : public FAbstractAlgorithm, public FAlgorithmTimers {
OctreeClass* const tree; //< The octree to work on
......@@ -49,7 +50,7 @@ class FFmmAlgorithmTask : public FAbstractAlgorithm, public FAlgorithmTimers {
const int OctreeHeight;
const int leafLevelSeperationCriteria;
const int leafLevelSeparationCriteria;
public:
/** The constructor needs the octree and the kernels used for computation
* @param inTree the octree to work on
......@@ -58,18 +59,19 @@ public:
*/
FFmmAlgorithmTask(OctreeClass* const inTree, KernelClass* const inKernels, const int inLeafLevelSeperationCriteria = 1)
: tree(inTree) , kernels(nullptr),
MaxThreads(omp_get_max_threads()), OctreeHeight(tree->getHeight()), leafLevelSeperationCriteria(inLeafLevelSeperationCriteria)
MaxThreads(omp_get_max_threads()), OctreeHeight(tree->getHeight()), leafLevelSeparationCriteria(inLeafLevelSeperationCriteria)
{
FAssertLF(tree, "tree cannot be null");
FAssertLF(inKernels, "kernels cannot be null");
FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");
this->kernels = new KernelClass*[MaxThreads];
#pragma omp parallel for schedule(static)
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
#pragma omp critical (InitFFmmAlgorithmTask)
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp critical (InitFFmmAlgorithmTask)
{
this->kernels[idxThread] = new KernelClass(*inKernels);
this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
}
}
......@@ -239,7 +241,7 @@ protected:
// for each levels
for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
FLOG(FTic counterTimeLevel);
const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeperationCriteria);
const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
// for each cell we apply the M2L with all cells in the implicit interaction list
do{
#pragma omp task firstprivate(octreeIterator) private(neighbors) shared(idxLevel)
......@@ -286,7 +288,7 @@ protected:
// for each levels
for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
FLOG(FTic counterTimeLevel);
const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeperationCriteria);
const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
// for each cells
do{
//#pragma omp task default(none) firstprivate(octreeIterator,separationCriteria) private( neighbors) shared(idxLevel)
......@@ -388,7 +390,7 @@ protected:
// There is a maximum of 26 neighbors
ContainerClass* neighbors[27];
const int SizeShape = 3*3*3;
const int SizeShape = P2PExclusionClass::SizeShape;
FVector<typename OctreeClass::Iterator> shapes[SizeShape];
typename OctreeClass::Iterator octreeIterator(tree);
......@@ -397,7 +399,7 @@ protected:
// for each leafs
do{
const FTreeCoordinate& coord = octreeIterator.getCurrentGlobalCoordinate();
const int shapePosition = (coord.getX()%3)*9 + (coord.getY()%3)*3 + (coord.getZ()%3);
const int shapePosition = P2PExclusionClass::GetShapeIdx(coord);
shapes[shapePosition].push(octreeIterator);
......
......@@ -27,6 +27,7 @@
#include "../Containers/FOctree.hpp"
#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"
#include <omp.h>
......@@ -45,7 +46,7 @@
*
* This class does not deallocate pointers given to its constructor.
*/
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThread : public FAbstractAlgorithm, public FAlgorithmTimers{
OctreeClass* const tree; ///< The octree to work on.
KernelClass** kernels; ///< The kernels.
......@@ -53,7 +54,7 @@ class FFmmAlgorithmThread : public FAbstractAlgorithm, public FAlgorithmTimers{
typename OctreeClass::Iterator* iterArray;
int leafsNumber;
static const int SizeShape = 3*3*3;
static const int SizeShape = P2PExclusionClass::SizeShape;
int shapeLeaf[SizeShape];
const int MaxThreads; ///< The maximum number of threads.
......@@ -62,7 +63,7 @@ class FFmmAlgorithmThread : public FAbstractAlgorithm, public FAlgorithmTimers{
int userChunkSize;
const int leafLevelSeperationCriteria;
const int leafLevelSeparationCriteria;
public:
/** Class constructor
......@@ -79,15 +80,17 @@ public:
const int inUserChunkSize = 10, const int inLeafLevelSeperationCriteria = 1)
: tree(inTree) , kernels(nullptr), iterArray(nullptr), leafsNumber(0),
MaxThreads(omp_get_max_threads()), OctreeHeight(tree->getHeight()),
userChunkSize(inUserChunkSize), leafLevelSeperationCriteria(inLeafLevelSeperationCriteria) {
userChunkSize(inUserChunkSize), leafLevelSeparationCriteria(inLeafLevelSeperationCriteria) {
FAssertLF(tree, "tree cannot be null");
FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");
FAssertLF(0 < userChunkSize, "Chunk size should be > 0");
this->kernels = new KernelClass*[MaxThreads];
#pragma omp parallel for schedule(static)
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp critical (InitFFmmAlgorithmThread)
{
this->kernels[idxThread] = new KernelClass(*inKernels);
this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
}
}
......@@ -138,7 +141,7 @@ protected:
do{
++leafsNumber;
const FTreeCoordinate& coord = octreeIterator.getCurrentCell()->getCoordinate();
++this->shapeLeaf[(coord.getX()%3)*9 + (coord.getY()%3)*3 + (coord.getZ()%3)];
++this->shapeLeaf[P2PExclusionClass::GetShapeIdx(coord)];
} while(octreeIterator.moveRight());
iterArray = new typename OctreeClass::Iterator[leafsNumber];
......@@ -296,7 +299,7 @@ protected:
// for each levels
for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
FLOG(FTic counterTimeLevel);
const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeperationCriteria);
const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
int numberOfCells = 0;
// for each cells
do{
......@@ -439,7 +442,7 @@ protected:
//iterArray[leafs] = octreeIterator;
//++leafs;
const FTreeCoordinate& coord = octreeIterator.getCurrentGlobalCoordinate();
const int shapePosition = (coord.getX()%3)*9 + (coord.getY()%3)*3 + (coord.getZ()%3);
const int shapePosition = P2PExclusionClass::GetShapeIdx(coord);
omp_set_lock(&lockShape[shapePosition]);
const int positionToWork = startPosAtShape[shapePosition]++;
......
......@@ -11,6 +11,7 @@
#include "../Containers/FOctree.hpp"
#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"
#include <omp.h>
#include <vector>
......@@ -29,18 +30,18 @@
*
* This class does not deallocate pointers given to its constructor.
*/
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThreadBalance : public FAbstractAlgorithm, public FAlgorithmTimers{
OctreeClass* const tree; ///< The octree to work on.
KernelClass** kernels; ///< The kernels.
static const int SizeShape = 3*3*3;
static const int SizeShape = P2PExclusionClass::SizeShape;
const int MaxThreads; ///< The maximum number of threads.
const int OctreeHeight; ///< The height of the given tree.
const int leafLevelSeperationCriteria;
const int leafLevelSeparationCriteria;
public:
/** Class constructor
......@@ -57,15 +58,16 @@ public:
const int inLeafLevelSeperationCriteria = 1)
: tree(inTree) , kernels(nullptr),
MaxThreads(omp_get_max_threads()), OctreeHeight(tree->getHeight()),
leafLevelSeperationCriteria(inLeafLevelSeperationCriteria) {
leafLevelSeparationCriteria(inLeafLevelSeperationCriteria) {
FAssertLF(tree, "tree cannot be null");
FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");
this->kernels = new KernelClass*[MaxThreads];
#pragma omp parallel for schedule(static)
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
#pragma omp critical (InitFFmmAlgorithmThreadBalance)
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp critical (InitFFmmAlgorithmThreadBalance)
{
this->kernels[idxThread] = new KernelClass(*inKernels);
this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
}
}
......@@ -205,7 +207,7 @@ protected:
do{
++leafsNumber;
const FTreeCoordinate& coord = octreeIterator.getCurrentCell()->getCoordinate();
++shapeLeaves[(coord.getX()%3)*9 + (coord.getY()%3)*3 + (coord.getZ()%3)];
++shapeLeaves[P2PExclusionClass::GetShapeIdx(coord)];
} while(octreeIterator.moveRight());
}
......@@ -346,6 +348,7 @@ protected:
workloadBufferThread[omp_get_thread_num()] = new WorkloadTemp[leafsNumber];
}
WorkloadTemp* workloadBuffer = workloadBufferThread[omp_get_thread_num()];
memset(workloadBuffer, 0, sizeof(struct WorkloadTemp)*leafsNumber);
// Prepare the P2P
const int LeafIndex = OctreeHeight - 1;
leafsDataArray.reset(new LeafData[leafsNumber]);
......@@ -365,7 +368,7 @@ protected:
// for each leafs
for(int idxLeaf = 0 ; idxLeaf < leafsNumber ; ++idxLeaf){
const FTreeCoordinate& coord = octreeIterator.getCurrentGlobalCoordinate();
const int shapePosition = (coord.getX()%3)*9 + (coord.getY()%3)*3 + (coord.getZ()%3);
const int shapePosition = P2PExclusionClass::GetShapeIdx(coord);
const int positionToWork = startPosAtShape[shapePosition]++;
......@@ -542,7 +545,7 @@ protected:
// for each levels
for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeperationCriteria);
const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
FLOG(FTic counterTimeLevel);
FLOG(computationCounter.tic());
#pragma omp parallel
......
......@@ -40,6 +40,7 @@
#include <sys/time.h>
#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"
#include <memory>
......@@ -63,7 +64,7 @@
* --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20
* --track-fds=yes ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
*/
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThreadProc : public FAbstractAlgorithm, public FAlgorithmTimers {
private:
OctreeClass* const tree; ///< The octree to work on
......@@ -82,7 +83,7 @@ private:
const int idProcess; ///< Current process id
const int OctreeHeight; ///< Tree height
const int leafLevelSeperationCriteria;
const int leafLevelSeparationCriteria;
/** An interval is the morton index interval
* that a proc uses (i.e. it holds data in this interval) */
......@@ -150,17 +151,18 @@ public:
nbProcess(inComm.processCount()),
idProcess(inComm.processId()),
OctreeHeight(tree->getHeight()),
leafLevelSeperationCriteria(inLeafLevelSeperationCriteria),
leafLevelSeparationCriteria(inLeafLevelSeperationCriteria),
intervals(new Interval[inComm.processCount()]),
workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) {
FAssertLF(tree, "tree cannot be null");
FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");
this->kernels = new KernelClass*[MaxThreads];
#pragma omp parallel for schedule(static)
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp critical (InitFFmmAlgorithmThreadProc)
{
this->kernels[idxThread] = new KernelClass(*inKernels);
this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
}
}
......@@ -188,6 +190,9 @@ protected:
*/
void executeCore(const unsigned operationsToProceed) override {
// Count leaf
#ifdef SCALFMM_TRACE_ALGO
eztrace_start();
#endif
this->numberOfLeafs = 0;
{
Interval myFullInterval;
......@@ -260,31 +265,61 @@ protected:
workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, comm.getComm()), __LINE__ );
}
// Run the requested FMM phases in order. Each phase is bracketed by its
// timer (tic/tac) and, when SCALFMM_TRACE_ALGO is defined, by an EZTrace
// event that is closed when the next phase begins.

// P2M - the timer starts unconditionally so that the tac() below always
// pairs with a tic(), whether or not tracing is compiled in.
Timers[P2MTimer].tic();
#ifdef SCALFMM_TRACE_ALGO
eztrace_enter_event("P2M", EZTRACE_YELLOW);
#endif
if(operationsToProceed & FFmmP2M) bottomPass();
Timers[P2MTimer].tac();

// M2M - guarded by the same macro as every other trace section, so the
// P2M event is closed and the M2M event is opened when tracing is enabled.
#ifdef SCALFMM_TRACE_ALGO
eztrace_leave_event();
eztrace_enter_event("M2M", EZTRACE_PINK);
#endif
Timers[M2MTimer].tic();
if(operationsToProceed & FFmmM2M) upwardPass();
Timers[M2MTimer].tac();

// M2L
#ifdef SCALFMM_TRACE_ALGO
eztrace_leave_event();
eztrace_enter_event("M2L", EZTRACE_GREEN);
#endif
Timers[M2LTimer].tic();
if(operationsToProceed & FFmmM2L) transferPass();
Timers[M2LTimer].tac();

// L2L
#ifdef SCALFMM_TRACE_ALGO
eztrace_leave_event();
eztrace_enter_event("L2L", EZTRACE_PINK);
#endif
Timers[L2LTimer].tic();
if(operationsToProceed & FFmmL2L) downardPass();
Timers[L2LTimer].tac();

// L2P + P2P - a single pass handles both; the two flags tell it which
// parts to execute.
#ifdef SCALFMM_TRACE_ALGO
eztrace_leave_event();
eztrace_enter_event("L2P+P2P", EZTRACE_BLUE);
#endif
Timers[NearTimer].tic();
if( (operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P) ) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));
Timers[NearTimer].tac();
#ifdef SCALFMM_TRACE_ALGO