diff --git a/Addons/BenchEfficiency/bordeaux_0116.pdf b/Addons/BenchEfficiency/bordeaux_0116.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c7785df796322f7b6779f10deff6bef7ef3bb436 Binary files /dev/null and b/Addons/BenchEfficiency/bordeaux_0116.pdf differ diff --git a/Addons/BenchEfficiency/execAllHomogeneous.sh b/Addons/BenchEfficiency/execAllHomogeneous.sh index 610108bcc606ba56d0ef2809d61772b1954f8785..0ec3591b1bb06c6a87975c2ef5eae956e9dc36c2 100644 --- a/Addons/BenchEfficiency/execAllHomogeneous.sh +++ b/Addons/BenchEfficiency/execAllHomogeneous.sh @@ -13,14 +13,20 @@ for (( cpu=1 ; cpu<=$SCALFMM_MAX_NB_CPU ; cpu++)) ; do STARPU_NCPUS=$cpu STARPU_NCUDA=0 - ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_SEQ - rec_name="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$cpu.rec" + logoutput=`./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_SEQ` + if [[ $VERBOSE ]] ; then + echo $logoutput + fi + rec_name="$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$cpu.rec" mv trace.rec output/$rec_name - python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t output/$rec_name + python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time - ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR - rec_name="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_PAR-CPU_$cpu.rec" + logoutput=`./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR` + if [[ $VERBOSE ]] ; then + echo $logoutput + fi + rec_name="$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_PAR-CPU_$cpu.rec" mv trace.rec output/$rec_name - python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t output/$rec_name + python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time done diff --git a/Addons/BenchEfficiency/paintmodel.fmm.colors b/Addons/BenchEfficiency/paintmodel.fmm.colors new file mode 100644 index 0000000000000000000000000000000000000000..8766ca2f30208b5be1c3bf08fa451bb1d9482cf6 --- /dev/null +++ b/Addons/BenchEfficiency/paintmodel.fmm.colors @@ -0,0 +1,9 @@ +#expression evaluation;replacement (if not empty);red;green;blue +P2P;;1.0;1.0;1.0 +P2P-out;;0.7;0.7;0.7 +(M2L-level-)[0-9]*;;0.2;0.2;0.6 +(M2L-out-level-)[0-9]*;;0.1;0.1;0.6 +(L2L-level-)[0-9]*;;0.2;0.6;0.2 +(M2M-level-)[0-9]*;;0.1;0.6;0.6 +P2M;;0.2;0.2;0.2 +L2P;;0.6;0.8;0.2 diff --git a/Addons/BenchEfficiency/pajecolor b/Addons/BenchEfficiency/pajecolor new file mode 100755 index 0000000000000000000000000000000000000000..281fe611c05b4238fd0af0e8098a4b2ff27391ab Binary files /dev/null and b/Addons/BenchEfficiency/pajecolor differ diff --git a/Addons/BenchEfficiency/par-bs-search.png b/Addons/BenchEfficiency/par-bs-search.png new file mode 100644 index 0000000000000000000000000000000000000000..4243937f718415f59047a70a48cb51391dcd223e Binary files /dev/null and b/Addons/BenchEfficiency/par-bs-search.png differ diff --git a/Addons/BenchEfficiency/scalfmm.html b/Addons/BenchEfficiency/scalfmm.html index 1c0c3211b6190fd2671baf7b08b2ed0242b7b2b9..85da3260555a959efc0d6cf222967ac0f683f4b5 100644 --- a/Addons/BenchEfficiency/scalfmm.html +++ b/Addons/BenchEfficiency/scalfmm.html @@ -13,6 +13,7 @@ <li>BLAS/LAPACK (The configure of ScalFMM is different if the MKL is used or not, but with the MKL it is recommended to set environment variable <code>MKLROOT</code>)</li> <li>CUDA (>= 7) and <code>CUDA_PATH</code> must be set. In our case, <code>CUDA_PATH=/usr/local/cuda-7.5/</code></li> <li><strong>Optional</strong> Vite (from <code>sudo apt-get install vite</code> or see <a href="http://vite.gforge.inria.fr/download.php" class="uri">http://vite.gforge.inria.fr/download.php</a>)</li> +<li><strong>Optional</strong> Qt5 library to be able to change the colors of the execution traces in order to visualize the different FMM operators</li> </ul> <blockquote> <p>Some installations of CUDA does not have libcuda file. In this case, one needs to create a link : <code>sudo ln /usr/local/cuda-7.5/lib64/libcudart.so /usr/local/cuda-7.5/lib64/libcuda.so</code></p> @@ -26,9 +27,14 @@ <h3 id="working-directory">Working directory</h3> <p>The variable <code>SCALFMM_TEST_DIR</code> is used to specify the working directory:</p> <pre class="bash"><code>export SCALFMM_TEST_DIR=~/scalfmm_test -mkdir $SCALFMM_TEST_DIR +if [[ ! -d $SCALFMM_TEST_DIR ]] ; then + mkdir $SCALFMM_TEST_DIR +fi cd $SCALFMM_TEST_DIR</code></pre> -<p><em>Output variables:</em> <code>$SCALFMM_TEST_DIR</code></p> +<p>In order to be able to stop the tutorial in the middle and restart later, we will keep the register variables in a file that should be source to restart.</p> +<pre class="bash"><code>function scalfmmRegisterVariable() { echo "export $1=${!1}" >> "$SCALFMM_TEST_DIR/environment.source"; } +echo "function scalfmmRegisterVariable() { echo \"export $1=${!1}\" >> \"$SCALFMM_TEST_DIR/environment.source\"; }" > "$SCALFMM_TEST_DIR/environment.source"</code></pre> +<p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_TEST_DIR</code></p> <p>Valid-if</p> <pre class="bash"><code>if [[ -n $SCALFMM_TEST_DIR ]] && [[ -d $SCALFMM_TEST_DIR ]] ; then echo “STEP-OK” @@ -48,10 +54,10 @@ if [[ ! -f hwloc-1.11.2.tar.gz ]] ; then fi tar xvf hwloc-1.11.2.tar.gz cd hwloc-1.11.2/ -SCALFMM_HWLOC_DIR=$SCALFMM_TEST_DIR/hwlocinstall +export SCALFMM_HWLOC_DIR=$SCALFMM_TEST_DIR/hwlocinstall ./configure --prefix=$SCALFMM_HWLOC_DIR make install</code></pre> -<p><em>Output variables:</em> <code>$SCALFMM_HWLOC_DIR</code></p> +<p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_HWLOC_DIR</code></p> <p>Valid-if:</p> <pre class="bash"><code>if [[ -n $SCALFMM_HWLOC_DIR ]] && [[ -d $SCALFMM_HWLOC_DIR/lib/ ]] && [[ -f $SCALFMM_HWLOC_DIR/lib/libhwloc.so ]]; then echo “OK” @@ -63,10 +69,10 @@ if [[ ! -f fxt-0.2.11.tar.gz ]] ; then fi tar xvf fxt-0.2.11.tar.gz cd fxt-0.2.11/ -SCALFMM_FXT_DIR=$SCALFMM_TEST_DIR/fxtinstall +export SCALFMM_FXT_DIR=$SCALFMM_TEST_DIR/fxtinstall ./configure --prefix=$SCALFMM_FXT_DIR make install</code></pre> -<p><em>Output variables:</em> <code>$SCALFMM_FXT_DIR</code></p> +<p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_FXT_DIR</code></p> <p>Valid-if:</p> <pre class="bash"><code>if [[ -n $SCALFMM_FXT_DIR ]] && [[ -d $SCALFMM_FXT_DIR/lib/ ]] && [[ -f $SCALFMM_FXT_DIR/lib/libfxt.so ]]; then echo “OK” @@ -79,12 +85,12 @@ if [[ ! -f fftw-3.3.4.tar.gz ]] ; then fi tar xvf fftw-3.3.4.tar.gz cd fftw-3.3.4/ -SCALFMM_FFTW_DIR=$SCALFMM_TEST_DIR/fftinstall +export SCALFMM_FFTW_DIR=$SCALFMM_TEST_DIR/fftinstall ./configure --prefix=$SCALFMM_FFTW_DIR make install ./configure --prefix=$SCALFMM_FFTW_DIR --enable-float make install</code></pre> -<p><em>Output variables:</em> <code>$SCALFMM_FFTW_DIR</code></p> +<p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_FFTW_DIR</code></p> <p>Valid-if:</p> <pre class="bash"><code>if [[ -n $SCALFMM_FFTW_DIR ]] && [[ -d $SCALFMM_FFTW_DIR/lib/ ]] && [[ -f $SCALFMM_FFTW_DIR/lib/libfftw3.a ]] && [[ -f $SCALFMM_FFTW_DIR/lib/libfftw3f.a ]]; then echo “OK” @@ -95,14 +101,14 @@ if [[ ! -d starpu ]] ; then svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu fi cd starpu/ -SCALFMM_STARPU_DIR=$SCALFMM_TEST_DIR/starpuinstall +export SCALFMM_STARPU_DIR=$SCALFMM_TEST_DIR/starpuinstall ./autogen.sh ./configure --prefix=$SCALFMM_STARPU_DIR --with-fxt=$SCALFMM_FXT_DIR --with-hwloc=$SCALFMM_HWLOC_DIR --with-cuda-dir=$CUDA_PATH --disable-opencl make install</code></pre> <blockquote> <p><strong>Optional</strong> In case you do not want to use trace (FXT) please remove the <code>--with-fxt=$SCALFMM_FXT_DIR</code> parameter from the command</p> </blockquote> -<p><em>Output variables:</em> <code>$SCALFMM_STARPU_DIR</code></p> +<p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_STARPU_DIR</code></p> <p>Valid-if:</p> <pre class="bash"><code>if [[ -n $SCALFMM_STARPU_DIR ]] && [[ -d $SCALFMM_STARPU_DIR/lib/ ]] && [[ -f $SCALFMM_STARPU_DIR/lib/libstarpu.so ]] ; then echo “OK” @@ -117,10 +123,10 @@ git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalf fi cd scalfmm-public/ export SCALFMM_SOURCE_DIR=`pwd` -Build/ +cd Build/ export SCALFMM_BUILD_DIR=`pwd`</code></pre></li> </ul> -<p><em>Output variables:</em> <code>SCALFMM_BUILD_DIR</code> <code>SCALFMM_SOURCE_DIR</code></p> +<p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_BUILD_DIR</code> <code>scalfmmRegisterVariable SCALFMM_SOURCE_DIR</code></p> <ul> <li><p>Configure (No MKL):</p> <pre class="bash"><code>cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=OFF -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFT_DIR</code></pre></li> @@ -154,7 +160,27 @@ make testBlockedUnifCudaBench</code></pre> <li><code>-p2p-m2l-cuda-only</code> : to compute the P2P and the M2L only on GPU (the rest on the CPU)</li> </ul> <p>Examples:</p> -<pre><code>STARPU_NCPUS=3 STARPU_NCUDA=1 ./Tests/Release/testBlockedUnifCudaBench -nb 10000 -h 3</code></pre> +<pre><code>export STARPU_NCPUS=12 +export STARPU_NCUDA=2 +./Tests/Release/testBlockedUnifCudaBench -nb 30000000 -h 7 -bs 800</code></pre> +<p>Last part of the output should be:</p> +<pre class="bash"><code> Start FGroupTaskStarPUAlgorithm + directPass in 0.0406482s + inblock in 0.000780428s + outblock in 0.0398674s + bottomPass in 0.00586269s + upwardPass in 0.00265723s + transferPass in 0.00323571s + inblock in 0.000124817s + outblock in 0.00298331s + downardPass in 0.00257975s + transferPass in 0.0652285s + inblock in 0.00164774s + outblock in 0.0635799s + L2P in 0.0115733s + Submitting the tasks took 0.139101s + Moving data to the host took 0.0578765s +@EXEC TIME = 14.6321s</code></pre> <ul> <li>Visualize the execution trace (<strong>Optional</strong>)</li> </ul> @@ -162,119 +188,147 @@ make testBlockedUnifCudaBench</code></pre> <pre class="bash"><code>$SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0"</code></pre> <p>Then visualize the output with vite</p> <pre class="bash"><code>vite ./paje.trace</code></pre> +<p>Should be like: // IMAGE HERE</p> +<p>We can convert the color of the trace by (it needs Qt5 library):</p> +<pre class="bash"><code>$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/pajecolor paje.trace $SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/paintmodel.fmm.colors +vite ./paje.trace.painted</code></pre> +<p>Should be like: // IMAGE HERE</p> <ul> <li>Get execution times</li> </ul> <pre class="bash"><code>python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t trace.rec</code></pre> <p>Should give something like:</p> <pre><code>"Name","Count","Type","Duration" -"Initializing",3,"Runtime",5.027746 -"Overhead",37,"Runtime",0.110073 -"Idle",13,"Other",0.03678 -"Scheduling",24,"Runtime",16.529527 -"Sleeping",17,"Other",2197.255516 -"FetchingInput",10,"Runtime",0.012637 -"execute_on_all_wrapper",6,"Task",8.431909 -"PushingOutput",10,"Runtime",16.505568 -"P2P",1,"Task",105.131112 -"Callback",4,"Runtime",0.001048 -"Deinitializing",3,"Runtime",0.014547 -"P2M",1,"Task",2.543303 -"L2P",1,"Task",5.649106 -"M2L-level-2",1,"Task",2.167273</code></pre> +"Initializing",14,"Runtime",7153.096196 +"Overhead",57010,"Runtime",376.473463 +"Idle",14355,"Other",12.815899 +"Scheduling",28441,"Runtime",238.367394 +"Sleeping",610,"Other",13786.513208 +"FetchingInput",14341,"Runtime",13918.805814 +"execute_on_all_wrapper",30,"Task",21.288802 +"Executing",414,"Runtime",26852.864578 +"PushingOutput",14341,"Runtime",284.96123 +"P2P-out",3846,"Task",60378.266619 +"Callback",13559,"Runtime",4.210633 +"P2P",328,"Task",15383.426991 +"M2L-level-5",41,"Task",2354.702554 +"M2L-level-6",328,"Task",18349.915495 +"Deinitializing",14,"Runtime",109.87483 +"M2L-level-4",6,"Task",275.088295 +"P2M",328,"Task",11312.022842 +"M2M-level-5",328,"Task",829.9055 +"M2M-level-4",41,"Task",93.130498 +"M2L-out-level-5",638,"Task",1914.900053 +"M2M-level-3",6,"Task",11.053067 +"M2M-level-2",1,"Task",1.363157 +"M2L-out-level-4",22,"Task",159.580457 +"L2L-level-4",41,"Task",84.554065 +"L2L-level-5",328,"Task",1087.717767 +"M2L-out-level-6",7692,"Task",18322.518045 +"L2P",328,"Task",27146.256793 +"M2L-level-2",1,"Task",2.661235 +"L2L-level-3",6,"Task",11.346978 +"M2L-level-3",1,"Task",47.612555 +"L2L-level-2",1,"Task",1.471873</code></pre> <p>Most of the script are in the addon directories</p> <pre><code>export SCALFMM_AB=$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/</code></pre> +<p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_AB</code></p> <h2 id="homogeneous-efficiencies">Homogeneous Efficiencies</h2> <p>Here we compute the efficiencies for a given test case on CPU only.</p> <p>Go in the build dir and create output dir</p> <pre><code>cd $SCALFMM_BUILD_DIR -mkdir homogeneous</code></pre> +export SCALFMM_RES_DIR=$SCALFMM_BUILD_DIR/homogeneous +mkdir $SCALFMM_RES_DIR</code></pre> +<p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_AB</code></p> <p>Set up the configuration variables:</p> -<pre class="bash"><code>SCALFMM_NB=10000000 -SCALFMM_H=7 -SCALFMM_MIN_BS=100 -SCALFMM_MAX_BS=3000 -SCALFMM_MAX_NB_CPU=24</code></pre> +<pre class="bash"><code>export SCALFMM_NB=10000000 +export SCALFMM_H=7 +export SCALFMM_MIN_BS=100 +export SCALFMM_MAX_BS=10000 +export SCALFMM_MAX_NB_CPU=24</code></pre> <p>Find best granularity in sequential and in parallel:</p> -<pre class="bash"><code>STARPU_NCPUS=1 -STARPU_NCUDA=0 -SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +<pre class="bash"><code>export STARPU_NCPUS=1 +export STARPU_NCUDA=0 +export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmmExtractKey.sh "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=0 -=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=0 +export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi</code></pre> -<p>Then we compute the efficiency using both granulirities and keep the .rec files.</p> +<p>In our case we get and 5385.</p> +<p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_BS_CPU_SEQ</code> <code>scalfmmRegisterVariable SCALFMM_BS_CPU_PAR</code></p> +<p>Then we compute the efficiency using both granulirities and keep the .rec files:</p> <pre class="bash"><code>source $SCALFMM_AB/execAllHomogeneous.sh</code></pre> -<p>We should end with all the rec files and their corresponding time files</p> -<pre class="bash"><code></code></pre> +<p>We should end with all the .rec files and their corresponding time files</p> +<pre class="bash"><code>ls $SCALFMM_RES_DIR</code></pre> <p>We compute the efficiencies</p> -<pre class="bash"><code></code></pre> +<pre class="bash"><code>source $SCALFMM_AB/computeHomogeneousEfficiencies</code></pre> <p>We end with efficiency for the application and for the operators.</p> -<pre class="bash"><code></code></pre> +<pre class="bash"><code>cat $SCALFMM_RES_DIR/efficiencies.txt</code></pre> <p>We can plot each of them</p> -<pre class="bash"><code></code></pre> +<pre class="bash"><code>source $SCALFMM_AB/plotEfficiencies.sh $SCALFMM_RES_DIR/efficiencies.txt</code></pre> +<p>Sould give: // IMAGE HERE</p> <h2 id="generating-execution-results">Generating Execution Results</h2> <p>For test case <code>-nb 10000000</code> (10 million) and <code>-h 6</code> (height of the tree equal to 6), we first want to know the best granularity <code>-bs</code>.</p> <p>This parameter will certainly not be the same for sequential/parallel/heterogenous configurations.</p> -<pre class="bash"><code>SCALFMM_NB=10000000 -SCALFMM_H=7 -SCALFMM_MIN_BS=100 -SCALFMM_MAX_BS=3000 -SCALFMM_MAX_NB_CPU=24 -SCALFMM_MAX_NB_GPU=4</code></pre> -<pre class="bash"><code>STARPU_NCPUS=1 -STARPU_NCUDA=0 -SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +<pre class="bash"><code>export SCALFMM_NB=10000000 +export SCALFMM_H=7 +export SCALFMM_MIN_BS=100 +export SCALFMM_MAX_BS=3000 +export SCALFMM_MAX_NB_CPU=24 +export SCALFMM_MAX_NB_GPU=4</code></pre> +<pre class="bash"><code>export STARPU_NCPUS=1 +export STARPU_NCUDA=0 +export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=0 -SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=0 +export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=$SCALFMM_MAX_NB_GPU -SCALFMM_BS_CPU_GPU=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU +export SCALFMM_BS_CPU_GPU=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='cpugpu-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi</code></pre> <p>Then, we can execute three best configurations, and keep .rec for each of them:</p> -<pre class="bash"><code>STARPU_NCPUS=1 -STARPU_NCUDA=0 +<pre class="bash"><code>export STARPU_NCPUS=1 +export STARPU_NCUDA=0 ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_CPU_SEQ -SCALFMM_SEQ_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" +export SCALFMM_SEQ_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" mv trace.rec $SCALFMM_SEQ_REC -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=0 +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=0 ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR -SCALFMM_PAR_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" +export SCALFMM_PAR_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" mv trace.rec $SCALFMM_PAR_REC -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=$SCALFMM_MAX_NB_GPU +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -SCALFMM_PAR_CPU_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" +export SCALFMM_PAR_CPU_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" mv trace.rec $SCALFMM_PAR_CPU_GPU_REC</code></pre> <p>And we also want the GPU tasks only on GPU</p> -<pre class="bash"><code>STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=$SCALFMM_MAX_NB_GPU +<pre class="bash"><code>export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -p2p-m2l-cuda-only -SCALFMM_PAR_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA-GPUONLY.rec" +export SCALFMM_PAR_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA-GPUONLY.rec" mv trace.rec $SCALFMM_PAR_GPU_REC</code></pre> <p>And we want the sequential version with parallel granularity:</p> -<pre class="bash"><code>STARPU_NCPUS=1 -STARPU_NCUDA=0 +<pre class="bash"><code>export STARPU_NCPUS=1 +export STARPU_NCUDA=0 ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR SCALFMM_SEQ_CPU_BS_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" diff --git a/Addons/BenchEfficiency/scalfmm.md b/Addons/BenchEfficiency/scalfmm.md index 1d483ddf5e5e8529b9b9acd857429f97fa040fec..2c984565b81fcfef45cdca792ef54e128a67d64c 100644 --- a/Addons/BenchEfficiency/scalfmm.md +++ b/Addons/BenchEfficiency/scalfmm.md @@ -21,6 +21,7 @@ In order to follow this tutorial, it is needed to have the following application * BLAS/LAPACK (The configure of ScalFMM is different if the MKL is used or not, but with the MKL it is recommended to set environment variable `MKLROOT`) * CUDA (>= 7) and `CUDA_PATH` must be set. In our case, `CUDA_PATH=/usr/local/cuda-7.5/` * __Optional__ Vite (from `sudo apt-get install vite` or see [http://vite.gforge.inria.fr/download.php](http://vite.gforge.inria.fr/download.php)) +* __Optional__ Qt5 library to be able to change the colors of the execution traces in order to visualize the different FMM operators > Some installations of CUDA does not have libcuda file. > In this case, one needs to create a link : `sudo ln /usr/local/cuda-7.5/lib64/libcudart.so /usr/local/cuda-7.5/lib64/libcuda.so` @@ -38,11 +39,20 @@ In order to follow this tutorial, it is needed to have the following application The variable `SCALFMM_TEST_DIR` is used to specify the working directory: ```bash export SCALFMM_TEST_DIR=~/scalfmm_test -mkdir $SCALFMM_TEST_DIR +if [[ ! -d $SCALFMM_TEST_DIR ]] ; then + mkdir $SCALFMM_TEST_DIR +fi cd $SCALFMM_TEST_DIR ``` -*Output variables:* `$SCALFMM_TEST_DIR` +In order to be able to stop the tutorial in the middle and restart later, we will keep the register variables in a file that should be source to restart. +```bash +function scalfmmRegisterVariable() { echo "export $1=${!1}" >> "$SCALFMM_TEST_DIR/environment.source"; } +echo "function scalfmmRegisterVariable() { echo \"export $1=${!1}\" >> \"$SCALFMM_TEST_DIR/environment.source\"; }" > "$SCALFMM_TEST_DIR/environment.source" +``` + + +*Output variables:* `scalfmmRegisterVariable SCALFMM_TEST_DIR` Valid-if ```bash @@ -72,12 +82,12 @@ if [[ ! -f hwloc-1.11.2.tar.gz ]] ; then fi tar xvf hwloc-1.11.2.tar.gz cd hwloc-1.11.2/ -SCALFMM_HWLOC_DIR=$SCALFMM_TEST_DIR/hwlocinstall +export SCALFMM_HWLOC_DIR=$SCALFMM_TEST_DIR/hwlocinstall ./configure --prefix=$SCALFMM_HWLOC_DIR make install ``` -*Output variables:* `$SCALFMM_HWLOC_DIR` +*Output variables:* `scalfmmRegisterVariable SCALFMM_HWLOC_DIR` Valid-if: ```bash @@ -94,12 +104,12 @@ if [[ ! -f fxt-0.2.11.tar.gz ]] ; then fi tar xvf fxt-0.2.11.tar.gz cd fxt-0.2.11/ -SCALFMM_FXT_DIR=$SCALFMM_TEST_DIR/fxtinstall +export SCALFMM_FXT_DIR=$SCALFMM_TEST_DIR/fxtinstall ./configure --prefix=$SCALFMM_FXT_DIR make install ``` -*Output variables:* `$SCALFMM_FXT_DIR` +*Output variables:* `scalfmmRegisterVariable SCALFMM_FXT_DIR` Valid-if: ```bash @@ -117,14 +127,14 @@ if [[ ! -f fftw-3.3.4.tar.gz ]] ; then fi tar xvf fftw-3.3.4.tar.gz cd fftw-3.3.4/ -SCALFMM_FFTW_DIR=$SCALFMM_TEST_DIR/fftinstall +export SCALFMM_FFTW_DIR=$SCALFMM_TEST_DIR/fftinstall ./configure --prefix=$SCALFMM_FFTW_DIR make install ./configure --prefix=$SCALFMM_FFTW_DIR --enable-float make install ``` -*Output variables:* `$SCALFMM_FFTW_DIR` +*Output variables:* `scalfmmRegisterVariable SCALFMM_FFTW_DIR` Valid-if: ```bash @@ -140,14 +150,14 @@ if [[ ! -d starpu ]] ; then svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu fi cd starpu/ -SCALFMM_STARPU_DIR=$SCALFMM_TEST_DIR/starpuinstall +export SCALFMM_STARPU_DIR=$SCALFMM_TEST_DIR/starpuinstall ./autogen.sh ./configure --prefix=$SCALFMM_STARPU_DIR --with-fxt=$SCALFMM_FXT_DIR --with-hwloc=$SCALFMM_HWLOC_DIR --with-cuda-dir=$CUDA_PATH --disable-opencl make install ``` > __Optional__ In case you do not want to use trace (FXT) please remove the `--with-fxt=$SCALFMM_FXT_DIR` parameter from the command -*Output variables:* `$SCALFMM_STARPU_DIR` +*Output variables:* `scalfmmRegisterVariable SCALFMM_STARPU_DIR` Valid-if: ```bash @@ -167,11 +177,11 @@ if [[ ! -d scalfmm-public ]] ; then fi cd scalfmm-public/ export SCALFMM_SOURCE_DIR=`pwd` -Build/ +cd Build/ export SCALFMM_BUILD_DIR=`pwd` ``` -*Output variables:* `SCALFMM_BUILD_DIR` `SCALFMM_SOURCE_DIR` +*Output variables:* `scalfmmRegisterVariable SCALFMM_BUILD_DIR` `scalfmmRegisterVariable SCALFMM_SOURCE_DIR` + Configure (No MKL): ```bash @@ -224,7 +234,30 @@ Information for scalfmm binaries Examples: ``` -STARPU_NCPUS=3 STARPU_NCUDA=1 ./Tests/Release/testBlockedUnifCudaBench -nb 10000 -h 3 +export STARPU_NCPUS=12 +export STARPU_NCUDA=2 +./Tests/Release/testBlockedUnifCudaBench -nb 30000000 -h 7 -bs 800 +``` + +Last part of the output should be: +```bash + Start FGroupTaskStarPUAlgorithm + directPass in 0.0406482s + inblock in 0.000780428s + outblock in 0.0398674s + bottomPass in 0.00586269s + upwardPass in 0.00265723s + transferPass in 0.00323571s + inblock in 0.000124817s + outblock in 0.00298331s + downardPass in 0.00257975s + transferPass in 0.0652285s + inblock in 0.00164774s + outblock in 0.0635799s + L2P in 0.0115733s + Submitting the tasks took 0.139101s + Moving data to the host took 0.0578765s +@EXEC TIME = 14.6321s ``` + Visualize the execution trace (__Optional__) @@ -238,6 +271,17 @@ Then visualize the output with vite vite ./paje.trace ``` +Should be like: // IMAGE HERE + +We can convert the color of the trace by (it needs Qt5 library): + +```bash +$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/pajecolor paje.trace $SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/paintmodel.fmm.colors +vite ./paje.trace.painted +``` + +Should be like: // IMAGE HERE + + Get execution times ```bash @@ -247,20 +291,37 @@ python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t trace.rec Should give something like: ``` "Name","Count","Type","Duration" -"Initializing",3,"Runtime",5.027746 -"Overhead",37,"Runtime",0.110073 -"Idle",13,"Other",0.03678 -"Scheduling",24,"Runtime",16.529527 -"Sleeping",17,"Other",2197.255516 -"FetchingInput",10,"Runtime",0.012637 -"execute_on_all_wrapper",6,"Task",8.431909 -"PushingOutput",10,"Runtime",16.505568 -"P2P",1,"Task",105.131112 -"Callback",4,"Runtime",0.001048 -"Deinitializing",3,"Runtime",0.014547 -"P2M",1,"Task",2.543303 -"L2P",1,"Task",5.649106 -"M2L-level-2",1,"Task",2.167273 +"Initializing",14,"Runtime",7153.096196 +"Overhead",57010,"Runtime",376.473463 +"Idle",14355,"Other",12.815899 +"Scheduling",28441,"Runtime",238.367394 +"Sleeping",610,"Other",13786.513208 +"FetchingInput",14341,"Runtime",13918.805814 +"execute_on_all_wrapper",30,"Task",21.288802 +"Executing",414,"Runtime",26852.864578 +"PushingOutput",14341,"Runtime",284.96123 +"P2P-out",3846,"Task",60378.266619 +"Callback",13559,"Runtime",4.210633 +"P2P",328,"Task",15383.426991 +"M2L-level-5",41,"Task",2354.702554 +"M2L-level-6",328,"Task",18349.915495 +"Deinitializing",14,"Runtime",109.87483 +"M2L-level-4",6,"Task",275.088295 +"P2M",328,"Task",11312.022842 +"M2M-level-5",328,"Task",829.9055 +"M2M-level-4",41,"Task",93.130498 +"M2L-out-level-5",638,"Task",1914.900053 +"M2M-level-3",6,"Task",11.053067 +"M2M-level-2",1,"Task",1.363157 +"M2L-out-level-4",22,"Task",159.580457 +"L2L-level-4",41,"Task",84.554065 +"L2L-level-5",328,"Task",1087.717767 +"M2L-out-level-6",7692,"Task",18322.518045 +"L2P",328,"Task",27146.256793 +"M2L-level-2",1,"Task",2.661235 +"L2L-level-3",6,"Task",11.346978 +"M2L-level-3",1,"Task",47.612555 +"L2L-level-2",1,"Task",1.471873 ``` Most of the script are in the addon directories @@ -268,6 +329,8 @@ Most of the script are in the addon directories export SCALFMM_AB=$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/ ``` +*Output variable:* `scalfmmRegisterVariable SCALFMM_AB` + ## Homogeneous Efficiencies Here we compute the efficiencies for a given test case on CPU only. @@ -275,62 +338,68 @@ Here we compute the efficiencies for a given test case on CPU only. Go in the build dir and create output dir ``` cd $SCALFMM_BUILD_DIR -mkdir homogeneous +export SCALFMM_RES_DIR=$SCALFMM_BUILD_DIR/homogeneous +mkdir $SCALFMM_RES_DIR ``` +*Output variable:* `scalfmmRegisterVariable SCALFMM_AB` + Set up the configuration variables: ```bash -SCALFMM_NB=10000000 -SCALFMM_H=7 -SCALFMM_MIN_BS=100 -SCALFMM_MAX_BS=3000 -SCALFMM_MAX_NB_CPU=24 +export SCALFMM_NB=10000000 +export SCALFMM_H=7 +export SCALFMM_MIN_BS=100 +export SCALFMM_MAX_BS=10000 +export SCALFMM_MAX_NB_CPU=24 ``` Find best granularity in sequential and in parallel: ```bash -STARPU_NCPUS=1 -STARPU_NCUDA=0 -SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +export STARPU_NCPUS=1 +export STARPU_NCUDA=0 +export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmmExtractKey.sh "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=0 -=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=0 +export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi ``` +In our case we get and 5385. -Then we compute the efficiency using both granulirities and keep the .rec files. +*Output variable:* `scalfmmRegisterVariable SCALFMM_BS_CPU_SEQ` `scalfmmRegisterVariable SCALFMM_BS_CPU_PAR` +Then we compute the efficiency using both granulirities and keep the .rec files: ```bash source $SCALFMM_AB/execAllHomogeneous.sh ``` -We should end with all the rec files and their corresponding time files +We should end with all the .rec files and their corresponding time files ```bash - +ls $SCALFMM_RES_DIR ``` - We compute the efficiencies ```bash - +source $SCALFMM_AB/computeHomogeneousEfficiencies ``` We end with efficiency for the application and for the operators. ```bash - +cat $SCALFMM_RES_DIR/efficiencies.txt ``` We can plot each of them ```bash - +source $SCALFMM_AB/plotEfficiencies.sh $SCALFMM_RES_DIR/efficiencies.txt ``` +Sould give: // IMAGE HERE + ## Generating Execution Results @@ -340,32 +409,32 @@ we first want to know the best granularity `-bs`. This parameter will certainly not be the same for sequential/parallel/heterogenous configurations. ```bash -SCALFMM_NB=10000000 -SCALFMM_H=7 -SCALFMM_MIN_BS=100 -SCALFMM_MAX_BS=3000 -SCALFMM_MAX_NB_CPU=24 -SCALFMM_MAX_NB_GPU=4 +export SCALFMM_NB=10000000 +export SCALFMM_H=7 +export SCALFMM_MIN_BS=100 +export SCALFMM_MAX_BS=3000 +export SCALFMM_MAX_NB_CPU=24 +export SCALFMM_MAX_NB_GPU=4 ``` ```bash -STARPU_NCPUS=1 -STARPU_NCUDA=0 -SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +export STARPU_NCPUS=1 +export STARPU_NCUDA=0 +export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=0 -SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=0 +export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=$SCALFMM_MAX_NB_GPU -SCALFMM_BS_CPU_GPU=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU +export SCALFMM_BS_CPU_GPU=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` if [[ `which gnuplot | wc -l` == "1" ]] ; then gnuplot -e "filename='cpugpu-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot fi @@ -373,38 +442,38 @@ fi Then, we can execute three best configurations, and keep .rec for each of them: ```bash -STARPU_NCPUS=1 -STARPU_NCUDA=0 +export STARPU_NCPUS=1 +export STARPU_NCUDA=0 ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_CPU_SEQ -SCALFMM_SEQ_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" +export SCALFMM_SEQ_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" mv trace.rec $SCALFMM_SEQ_REC -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=0 +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=0 ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR -SCALFMM_PAR_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" +export SCALFMM_PAR_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" mv trace.rec $SCALFMM_PAR_REC -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=$SCALFMM_MAX_NB_GPU +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -SCALFMM_PAR_CPU_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" +export SCALFMM_PAR_CPU_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" mv trace.rec $SCALFMM_PAR_CPU_GPU_REC ``` And we also want the GPU tasks only on GPU ```bash -STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -STARPU_NCUDA=$SCALFMM_MAX_NB_GPU +export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU +export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -p2p-m2l-cuda-only -SCALFMM_PAR_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA-GPUONLY.rec" +export SCALFMM_PAR_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA-GPUONLY.rec" mv trace.rec $SCALFMM_PAR_GPU_REC ``` And we want the sequential version with parallel granularity: ```bash -STARPU_NCPUS=1 -STARPU_NCUDA=0 +export STARPU_NCPUS=1 +export STARPU_NCUDA=0 ./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR SCALFMM_SEQ_CPU_BS_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" diff --git a/Addons/BenchEfficiency/scalfmmFindBs.sh b/Addons/BenchEfficiency/scalfmmFindBs.sh index 3c454832e0c3231625b58d3df8b3a7660b3788de..06a45c402eb3eb094bee6674a692ecda037b38d2 100644 --- a/Addons/BenchEfficiency/scalfmmFindBs.sh +++ b/Addons/BenchEfficiency/scalfmmFindBs.sh @@ -5,7 +5,9 @@ function GetExecTime() { local res_output=`$1 "$2"` >&2 echo "[LOG] Try to find $3" local time_result=`echo "$res_output" | grep "$3" | cut -d'=' -f2 | cut -d's' -f1` - # >&2 echo "[LOG] output : $res_output" + if [[ $VERBOSE ]] ; then + >&2 echo "[LOG] output : $res_output" + fi >&2 echo "[LOG] Done in $time_result" echo $time_result } @@ -31,8 +33,10 @@ fi echo "You ask to find the best bs for:" echo "Command: $1" echo "From $2 to $3" +echo "STARPU_NCPUS = $STARPU_NCPUS" +echo "STARPU_NCUDA = $STARPU_NCUDA" -outputfile=./bs_bench.data +outputfile=./benchBs.data echo "# BS TIME" > $outputfile diff --git a/Addons/BenchEfficiency/seq-bs-search.png b/Addons/BenchEfficiency/seq-bs-search.png new file mode 100644 index 0000000000000000000000000000000000000000..215e5ce2110aca4e0eb3e4fed032aa1a3e5590fd Binary files /dev/null and b/Addons/BenchEfficiency/seq-bs-search.png differ diff --git a/Addons/BenchEfficiency/trace-example-colors.png b/Addons/BenchEfficiency/trace-example-colors.png new file mode 100644 index 0000000000000000000000000000000000000000..dcefa9fb53660927f1509d64f89254ee03e60dec Binary files /dev/null and b/Addons/BenchEfficiency/trace-example-colors.png differ diff --git a/Addons/BenchEfficiency/trace-example.png b/Addons/BenchEfficiency/trace-example.png new file mode 100644 index 0000000000000000000000000000000000000000..5e466b94ed15a4d0905484425a75de9d390f45d7 Binary files /dev/null and b/Addons/BenchEfficiency/trace-example.png differ