Commit a0162afc authored by BLANCHARD Pierre's avatar BLANCHARD Pierre
Browse files

Merge branch 'master' into PB_tensorialFMM

parents c1ec6ed7 8a78a9cc
......@@ -29,7 +29,7 @@ project(ScalFMM)
# ScalFMM version number. An even minor number corresponds to releases.
SET(ScalFMM_MAJOR_VERSION 1)
SET(ScalFMM_MINOR_VERSION 1)
SET(ScalFMM_MINOR_VERSION 2)
SET(ScalFMM_PATCH_VERSION 1)
SET(ScalFMM_VERSION "${ScalFMM_MAJOR_VERSION}.${ScalFMM_MINOR_VERSION}.${ScalFMM_PATCH_VERSION}" )
......@@ -217,6 +217,10 @@ add_subdirectory(Src)
# Link with scalfmm lib
set(scalfmm_lib scalfmm)
# Build - Examples and drivers
add_subdirectory(Examples)
# Build - Tests
MESSAGE( STATUS "ScalFMM_BUILD_TESTS = ${ScalFMM_BUILD_TESTS}" )
if( ScalFMM_BUILD_TESTS )
......@@ -275,7 +279,14 @@ set (CPACK_RESOURCE_FILE_LICENSE
"${CMAKE_CURRENT_SOURCE_DIR}/Licence.txt")
SET(CPACK_PACKAGE_VERSION_MAJOR "${ScalFMM_MAJOR_VERSION}")
SET(CPACK_PACKAGE_VERSION_NINOR "${ScalFMM_MINOR_VERSION}")
set(CPACK_PACKAGE_VERSION_PATCH "${ScalFMM_PATCH_VERSION}")
#
# Use git commit number as CPACK_PACKAGE_VERSION_PATCH
execute_process(
COMMAND git rev-list HEAD --count
OUTPUT_VARIABLE CPACK_PACKAGE_VERSION_PATCH
RESULT_VARIABLE RET
ERROR_QUIET
)
SET(CPACK_INCLUDE_TOPLEVEL_DIRECTORY "ON")
#
SET(CPACK_SOURCE_GENERATOR "TGZ")
......
......@@ -34,7 +34,7 @@ PROJECT_NAME = "ScalFmm"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER =
PROJECT_NUMBER = @ScalFMM_MAJOR_VERSION@.@ScalFMM_MINOR_VERSION@
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer
......@@ -665,7 +665,8 @@ WARN_LOGFILE =
# with spaces.
INPUT = @CMAKE_CURRENT_SOURCE_DIR@/../Doc/Site_dox \
@CMAKE_CURRENT_SOURCE_DIR@/../Src/
@CMAKE_CURRENT_SOURCE_DIR@/../Src/ \
@CMAKE_CURRENT_SOURCE_DIR@/../Examples/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
......
How to use ScalFMM on MIC
-fp-model precise -fp-model source -fimf-precision=low
Native mode
1)
2) Launch the code as follow
micnativeloadex Tests/Release/testNewCompareKernels -a "-f /tmp/coulaud/file.fma"
\ No newline at end of file
......@@ -21,9 +21,13 @@
* <li> Bérenger Bramas </li>
* <li> Cyrille Piacibello </li>
* <li> Pierre Blanchard </li>
* </ul>
* Old contributors
*
* <ul>
* <li> Matthias Messner </li>
* </ul>
*/
......@@ -3,11 +3,12 @@
\usepackage{listings}
\usepackage{geometry}
\usepackage{graphicx}
\usepackage{tikz}
\usepackage[hypertexnames=false, pdftex]{hyperref}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% use:$ pdflatex ParallelDetails.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author{Berenger Bramas, Olivier Coulaud, Cyrille Piacibelo}
\author{Berenger Bramas, Olivier Coulaud, Cyrille Piacibello}
\title{ScalFmm - Parallel Algorithms (Draft)}
\date{\today}
......@@ -152,26 +153,64 @@ That is the reason why we need to balance the data among nodes.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Balancing the leaves}
After sorting, each process has potentially several leaves.
If we have two processes $P_{i}$ and $P_{j}$ with $i < j$ the sort guarantees that all leaves from node i are inferior than the leaves on the node j in a Morton indexing way.
But the leaves are randomly distributed among the nodes and we need to balance them.
It is a simple reordoring of the data, but the data has to stayed sorted.
After sorting, each process has potentially several leaves. If we
have two processes $P_{i}$ and $P_{j}$ with $i < j$ the sort
guarantees that all leaves from node i are inferior than the leaves on
the node j in a Morton indexing way. But the leaves are randomly
distributed among the nodes and we need to balance them. It is a
simple reordoring of the data, but the data has to stayed sorted.
\begin{enumerate}
\item Each process informs other to tell how many leaves it holds.
\item Each process compute how many leaves it has to send or to receive from left or right.
\item Each process compute how many leaves it has to send or to
receive from left or right. \label{balRef}
\end{enumerate}
At the end of the algorithm our system is completely balanced with the same number of leaves on each process.
At the end of the algorithm our system is completely balanced with the
same number of leaves on each process. If another kind of balancing
algorithm is needed, one can only change the BalanceAlgorithm class
that is given in parameter to the ArrayToTree static method in the
step \ref{balRef}.
\subsection{Balancing algorithms supported}
Any balancing algorithm can be used, but it has to provide at least
two method, as showed in the class FAbstractBalancingAlgorithm.
Those methods are :
\begin{enumerate}
\item GetLeft : return the number of leaves that will belongs only to
proc on the left of given proc.
\item GetRight : return the number of leaves that will belongs only to
proc on the right of given proc.
\end{enumerate}
In the parameters of those two methods, one can find the total number
of leaves, the total number of particles, the number of proc, and the
index of a proc to be treated.
\begin{figure}[h!]
\begin{center}
\includegraphics[width=15cm, height=15cm, keepaspectratio=true]{Images/Balance.png}
\caption{Balancing Example}
\caption{Balancing Example : A process has to send data to the
left if its current left limit is upper than its objective
limit. Same in the other side, and we can reverse the calculs
to know if a process has to received data.}
\end{center}
\end{figure}
A process has to send data to the left if its current left limit is upper than its objective limit.
Same in the other side, and we can reverse the calculs to know if a process has to received data.
\clearpage
\subsection{Mpi calls}
Once every process know exactly what it needs to compute for itself
and for any other proc the bound GetRight() and GetLeft(), there is
only one Mpi communication AllToAll.
To prepare the buffers to be sent and received, each proc count the
number of leafs (and the size) it holds, and divide them into
potentially three parts :
\begin{enumerate}
\item The datas to send to proc on left (Can be null).
\item The datas to keep (can be null).
\item The datas to send to proc on right(can be null).
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -237,10 +276,10 @@ shared cell at a level.
There are to cases :
\begin{itemize}
\item My first cell is shared means that I need to send the children I have of
this cell to the processus on my left.
\item My last cell is shared means that I need to receive some
children from the processus on my right.
\item My first cell is shared means that I need to send the children I have of
this cell to the processus on my left.
\item My last cell is shared means that I need to receive some
children from the processus on my right.
\end{itemize}
......@@ -313,6 +352,46 @@ Example :
\hline
\end{tabular}
\subsection{Modified M2M}
The algorithm may not be efficient for special cases. Since the
communications do not progress (even in asynchrone way) while
computing the M2M, the algorithm has been modified, in order to set
one of the OMP thread to the communications.
\begin{algorithm}[H]
\RestyleAlgo{boxed}
\LinesNumbered
\SetAlgoLined
\KwData{none}
\KwResult{none}
\BlankLine
\For{idxLevel $\leftarrow$ $Height - 2$ \KwTo 1}{
\tcp{pragma omp single}
\Begin(To be done by one thread only){
\uIf{$cells[0]$ not in my working interval}{
isend($cells[0].child$)\;
hasSend $\leftarrow$ true\;
}
\uIf{$cells[end]$ in another working interval}{
irecv(recvBuffer)\;
hasRecv $\leftarrow$ true\;
}
\emph{Wait send and recv if needed}\;
\uIf{hasRecv is true}{
M2M($cells[end]$, recvBuffer)\;
}
}
\tcp{pragma omp for}
\Begin(To be done by all the other threads){
\ForAll{Cell c at level idxLevel in working interval}{
M2M(c, c.child)\;
}
}
}
\BlankLine
\caption{Distributed M2M}
\end{algorithm}
\clearpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -413,13 +492,25 @@ Finally they send and receive data in an asynchronous way and cover it by the P2
\BlankLine
\caption{Distributed P2P}
\end{algorithm}
\subsection{Shared Memory Version}
The P2P algorithm is computed once for each pair of leafs belonging to
the same proc. This means that when a proc will compute the force on
the particles of leaf $1$ due to the particles of leaf $2$, both leafs
$1$ and $2$ will be updated.
This way of compute the interaction is faster, but leads to
concurrency problems.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{M2L}
The M2L operator is relatively similar to the P2P.
Hence P2P is done at the leaves level, M2L is done on several levels from Height - 2 to 2.
Hence P2P is done at the leaves level, M2L is done on several levels from $Height - 2$ to 2.
At each level, a node needs to have access to all the distant neighbors of the cells it is the proprietary and those ones can be hosted by any other node.
Anyway, each node can compute a part of the M2L with the data it has.
\subsection{Original Algorithm}
The algorithm can be viewed as several tasks:
\begin{enumerate}
\item Compute to know what data has to be sent
......@@ -436,7 +527,7 @@ The algorithm can be viewed as several tasks:
\KwData{none}
\KwResult{none}
\BlankLine
\ForAll{Level idxLeve from 2 to Height - 2}{
\ForAll{Level idxLevel from 2 to Height - 2}{
\ForAll{Cell c at level idxLevel}{
neighborsIndexes $\leftarrow$ $c.potentialDistantNeighbors()$\;
\ForAll{index in neighborsIndexes}{
......@@ -462,6 +553,103 @@ The algorithm can be viewed as several tasks:
\BlankLine
\caption{Distributed M2L}
\end{algorithm}
\subsection{Algorithm Modified}
The idea in the following version is to cover the communications
between process with the work (M2L Self) that can be done without
anything from outside.
\begin{algorithm}[H]
\RestyleAlgo{boxed}
\LinesNumbered
\SetAlgoLined
\KwData{none}
\KwResult{none}
\BlankLine
\Begin(To be done by one thread only){ \label{single}
\ForAll{Level idxLevel from 2 to Height - 2}{
\ForAll{Cell c at level idxLevel}{
neighborsIndexes $\leftarrow$ $c.potentialDistantNeighbors()$\;
\ForAll{index in neighborsIndexes}{
\uIf{index belong to another proc}{
isend(c)\;
\emph{Mark c as a cell that is linked to another proc}\;
}
}
}
}
\emph{Wait send and recv if needed}\;
}
\Begin(To be done by everybody else){\label{multiple}
\emph{Normal M2L}\;
}
\ForAll{Cell c received}{
$lightOctree.insert( c )$\;
}
\ForAll{Level idxLeve from 2 to Height - 1}{
\ForAll{Cell c at level idxLevel that are marked}{
neighborsIndexes $\leftarrow$ $c.potentialDistantNeighbors()$\;
neighbors $\leftarrow$ lightOctree.get(neighborsIndexes)\;
M2L( c, neighbors)\;
}
}
\BlankLine
\caption{Distributed M2L 2}
\end{algorithm}
\appendix
\clearpage
\chapter{Cheat sheet about using EZtrace with ViTE on ScalFMM}
In this appendix, one can find usefull information about using EZtrace
on ScalFMM, and visualisation with ViTE.
\section{EZtrace}
EZTrace is a tool that aims at generating automatically execution
trace from HPC (High Performance Computing) programs.
It does not need any source instrumentation.
Usefull variables :
\begin{itemize}
\item EZTRACE\_FLUSH : set the value to $1$ in order to flush the
event buffer to the disk in case of uge amouts of datas.
\item EZTRACE\_TRACE : choice of the type of event one wants to
have. Example : EZTRACE\_TRACE="mpi stdio omp memory". Remark : Mpi do a
lot of call to pthread, so I suggest to not trace pthread events in
order to visualize the results.
\item EZTRACE\_TRACE\_DIR : path to a directory in wich eztrace will
store trace for each MPI Proc. (Set to /lustre/username/smt to avoid
overhead)
\end{itemize}
Once the traces are generated, one need to convert them, in order to
visualize its.
\section{ViTE}
ViTE is a high memory consumption software, so in order to use it,
avoid tracing pthread for example.
One can zoom in and out in the gant chart.
Plugin : One can get a histogram of each proc display the percentage
of time spend in different section.
\begin{itemize}
\item Got to Preferences $\rightarrow$ Plugin.
\end{itemize}
Sorting the gant charts : Sometimes the process are badly sorted (like
0,1,10,11,2,3,4,5,6,7,8,9). It's possible to sort them with the mouse,
or with editing an xml file :
\begin{itemize}
\item Got to Preferences $\rightarrow$ Node Selection and then Sort,
or export/load xml file .
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{thebibliography}{9}
......
# check if compiling into source directories
STRING(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" insource)
if(insource)
MESSAGE(FATAL_ERROR "${PROJECT_NAME} requires an out of source build. Goto scalfmm/Build and tapes cmake ../")
endif(insource)
project(Examples_scalfmm CXX)
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BUILD_TYPE})
ADD_DEFINITIONS( ${ScaLFMM_CXX_FLAGS})
# Find all code files
file(
GLOB_RECURSE
source_tests_files
./*.cpp
)
# Adding the project sources dir as an include dir
INCLUDE_DIRECTORIES(
${CMAKE_BINARY_DIR}/Src
${CMAKE_SOURCE_DIR}/Src
)
# Add execs - 1 cpp = 1 exec
foreach(exec ${source_tests_files})
get_filename_component(
execname ${exec}
NAME_WE
)
set(compile_exec "TRUE")
# Test Blas dependency
file(STRINGS "${exec}" lines_blas REGEX "@FUSE_BLAS")
if(lines_blas)
if( NOT ScalFMM_USE_BLAS )
MESSAGE( STATUS "This needs BLAS = ${exec}" )
set(compile_exec "FALSE")
endif()
endif()
# Test FFT dependency
file(STRINGS "${exec}" lines_fft REGEX "@FUSE_FFT")
if(lines_fft)
if( NOT ScalFMM_USE_FFT )
MESSAGE( STATUS "This needs FFT = ${exec}" )
set(compile_exec "FALSE")
endif()
endif()
# Test MPI dependency
file(STRINGS "${exec}" lines_mpi REGEX "@FUSE_MPI")
if(lines_mpi)
if( NOT ScalFMM_USE_MPI )
MESSAGE( STATUS "This needs MPI = ${exec}" )
set(compile_exec "FALSE")
endif()
endif()
# Dependency are OK
if( compile_exec )
add_executable(
${execname}
${exec}
)
SET_TARGET_PROPERTIES(${execname} PROPERTIES ENABLE_EXPORTS TRUE)
target_link_libraries(
${execname}
${scalfmm_lib}
${SCALFMM_LIBRARIES}
)
endif()
# Install bin
install( TARGETS ${execname} DESTINATION bin )
endforeach(exec)
/*
* genarateDistributions.cpp
*
* Created on: 23 mars 2014
* Author: Olivier Coulaud
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include "Utils/FMath.hpp"
#include "Utils/FGenerateDistribution.hpp"
//
/// \file generateDistributions.cpp
//!
//! \brief generateDistributions: Driver to generate N points (non)uniformly distributed on a given geometry
//!
//! The goal of this driver is to generate uniform or non uniform points on the following geometries
//!
//! Uniform : sphere, prolate,
//!
//! Non uniform : ellipsoid, prolate
//!
//! You can set two kind of physical values depending of your problem. By default all values are between 0 and 1.
//! If you select the argument -charge (see bellow) the values are between -1 and 1.
//! The arguments available are
//!
//! <b> General arguments:</b>
//! \param -help(-h) to see the parameters available in this driver
//! \param -N The number of points in the distribution (default 20000)
//! \param -filename name: generic name for files (without extension) and save data
//! with following format in name.xxx or name.bin in -bin (not yet implemented) is set
//! \param -visu save output in filename.txt
//! \param -visufmt format for the visu file (vtk, vtp, cvs or cosmo). vtp is the default
//!
//! <b> Geometry arguments:</b>
//! \param -unitSphere uniform distribution on unit sphere
//! \param -sphere uniform distribution on sphere of radius given by
//! \arg -radius R - default value for R is 2.0
//! \param -ellipsoid non uniform distribution on an ellipsoid of aspect ratio given by
//! \arg -ar a:b:c with a, b and c > 0
//! \param -prolate ellipsoid with aspect ratio a:a:c given by
//! \arg -ar a:a:c with c > a > 0
//! \param -plummer (Highly non uniform) plummer distribution (astrophysics)
//! \arg -radius R - default value 10.0"
//!
//!
//! <b> Physical values argument:</b>
//! \param -charge generate physical values between -1 and 1 otherwise generate between 0 and 1
//!
//! \b examples
//!
//! generateDistributions -prolate -ar 2:2:4 -N 20000 -filename prolate -visu
//!
//!
//
//
void genDistusage() {
std::cout << "Driver to generate N points (non)uniformly distributed on a given geometry"
<< std::endl;
std::cout << "Options "<< std::endl
<< " -help to see the parameters " << std::endl
<< " -N The number of points in the distribution " << std::endl
<< std::endl
<< " Distributions " << std::endl
<< " Uniform on " << std::endl
<< " -unitSphere uniform distribution on unit sphere" <<std::endl
<< " -sphere uniform distribution on sphere of radius given by" <<std::endl
<< " -radius R - default value for R is 2.0" <<std::endl
<< " -prolate ellipsoid with aspect ratio a:a:c" <<std::endl
<< " -ar a:a:c with c > a > 0" <<std::endl<<std::endl
<< " Non Uniform on " << std::endl
<< " -ellipsoid non uniform distribution on an ellipsoid of aspect ratio given by" <<std::endl
<< " -ar a:b:c with a, b and c > 0" <<std::endl
<< " -plummer (Highly non unuiform) plummer distrinution (astrophysics)"<<std::endl
<< " -radius R - default value 10.0" <<std::endl
<< " Physical values" <<std::endl
<< " -charge generate physical values between -1 and 1 otherwise generate between 0 and 1 " <<std::endl<<std::endl
<< " Output " << std::endl
<< " -filename name: generic name for files (without extension) and save data" <<std::endl
<< " with following format in name.xxx or name.bin in -bin is set" <<std::endl
<< " -visu save output in name.txt" <<std::endl
<< " -visufmt vtk, vtp, cosmo or cvs format " <<std::endl;
}
int main(int argc, char ** argv){
//
if(FParameters::existParameter(argc, argv, "-h")||FParameters::existParameter(argc, argv, "-help")|| (argc < 3 )){
genDistusage() ;
exit(-1);
}
const FReal extraRadius = 0.001 ;
const int NbPoints = FParameters::getValue(argc,argv,"-N", 20000);
const std::string genericFileName(FParameters::getStr(argc,argv,"-filename", "unifPointDist"));
FReal BoxWith ;
//
// Allocation
//
FReal * particles ;
particles = new FReal[4*NbPoints] ;
memset(particles,0,4*NbPoints*sizeof(FReal));
//
// Generate physical values
//
FReal phyVal, sum,a,b ;
if(FParameters::existParameter(argc, argv, "-charge")){
a= 2.0 ; b = -1.0 ;
}
else {
a= 1.0 ; b = 0.0 ;
}
sum = 0.0 ;
int j = 3 ;
for(int i = 0 ; i< NbPoints; ++i, j+=4){
phyVal = a*getRandom() +b ;
sum += phyVal ;
particles[j] = phyVal ;
}
std::cout << "Sum physical value "<< sum << " Mean Value " << sum/NbPoints<<std::endl ;
//
// Point generation
//
if(FParameters::existParameter(argc, argv, "-unitSphere")){
unifRandonPointsOnUnitSphere(NbPoints, particles) ;
BoxWith = 2.0 ;
}
else if(FParameters::existParameter(argc, argv, "-sphere")){
const FReal Radius = FParameters::getValue(argc,argv,"-radius", 2.0);
unifRandonPointsOnSphere(NbPoints, Radius,particles) ;
BoxWith = 2.0*Radius ;
}
else if(FParameters::existParameter(argc, argv, "-prolate")){
std::string dd(":"),aspectRatio = FParameters::getStr(argc,argv,"-ar", "1:1:2");
FReal A,B,C ;
size_t pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," ");
pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," ");
std::stringstream ss(aspectRatio); ss >>A >> B >> C ;
if(A != B){
std::cerr << " A /= B in prolate sllipsoide A =B. Your aspect ratio: "<< aspectRatio<<std::endl;
}
std::cout << "A: "<<A<<" B "<< B << " C: " << C<<std::endl;
unifRandonPointsOnProlate(NbPoints,A,C,particles);
BoxWith = C;
}
else if(FParameters::existParameter(argc, argv, "-ellipsoid")){
std::string dd(":"),aspectRatio = FParameters::getStr(argc,argv,"-ar", "1:1:2");
FReal A,B,C ;
size_t pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," ");
pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," ");
std::stringstream ss(aspectRatio); ss >>A >> B >> C ;
// std::cout << "A: "<<A<<" B "<< B << " C: " << C<<std::endl;
nonunifRandonPointsOnElipsoid(NbPoints,A,B,C,particles);
BoxWith = FMath::Max( A,FMath::Max( B,C)) ;
}
else if(FParameters::existParameter(argc, argv, "-plummer")){
const FReal Radius = FParameters::getValue(argc,argv,"-radius", 10.0);
unifRandonPlummer(NbPoints, Radius, sum, particles) ;
BoxWith = 2.0*Radius ;
}
else {
std::cout << "Bad geometry option"<< std::endl;
exit(-1) ;
}
if(FParameters::<