diff --git a/.gitlab/bench_plafrim.yml b/.gitlab/bench_plafrim.yml index 0542e3018927c99988efeca00cec18035d47c7bb..24d9f5a7e02e26e4d77b99db22ea4350c4de756e 100644 --- a/.gitlab/bench_plafrim.yml +++ b/.gitlab/bench_plafrim.yml @@ -1,10 +1,29 @@ -bench_plafrim: +bench_plafrim_bora: stage: test tags: ['plafrim'] - variables: - BUILD_OPTIONS: "-DCHAMELEON_USE_MPI=ON -DCMAKE_BUILD_TYPE=Release" - VERSION: miriel + before_script: + - git submodule update --init --recursive script: - - (cd tools/bench/plafrim/miriel && ./run.sh) + - ./tools/bench/plafrim/bora/run.sh only: - schedules + +.bench_plafrim_miriel: + stage: test + tags: ['plafrim'] + before_script: + - git submodule update --init --recursive + script: + - ./tools/bench/plafrim/miriel/run.sh + only: + - schedules + +.bench_plafrim_sirocco: + stage: test + tags: ['plafrim'] + before_script: + - git submodule update --init --recursive + script: + - ./tools/bench/plafrim/sirocco/run.sh + only: + - schedules \ No newline at end of file diff --git a/tools/bench/chameleon_guix.sh b/tools/bench/chameleon_guix.sh new file mode 100755 index 0000000000000000000000000000000000000000..a120649c163ee4143bfbeae8ccab26f0e8156e25 --- /dev/null +++ b/tools/bench/chameleon_guix.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -x + +# this script depends on a MPI vendor: openmpi or nmad +MPI=$1 + +# Configure and Build Chameleon +mkdir -p $CI_PROJECT_DIR/build-$NODE-$MPI +cp $CI_PROJECT_DIR/guix.json $CI_PROJECT_DIR/build-$NODE-$MPI/ +cd $CI_PROJECT_DIR/build-$NODE-$MPI +rm CMake* -rf +cmake $BUILD_OPTIONS .. +make -j20 VERBOSE=1 +export CHAMELEON_BUILD=$PWD + +# Execute jube benchmarks +jube run $CI_PROJECT_DIR/tools/bench/$PLATFORM/$NODE/chameleon_$MPI.xml --tag gemm potrf geqrf +# jube analysis +jube analyse $CI_PROJECT_DIR/tools/bench/$PLATFORM/$NODE/results/ +# jube report +jube result $CI_PROJECT_DIR/tools/bench/$PLATFORM/$NODE/results/ -i last > chameleon.csv + +# send results to the elasticsearch server +export PYTHONPATH=$GUIX_ENVIRONMENT/lib/python3.7/site-packages +python3 $CI_PROJECT_DIR/tools/bench/jube/add_result.py -e https://elasticsearch.bordeaux.inria.fr -t hiepacs -p "chameleon" -h $NODE -m $MPI chameleon.csv diff --git a/tools/bench/chameleon_guix_nmad.sl b/tools/bench/chameleon_guix_nmad.sl new file mode 100644 index 0000000000000000000000000000000000000000..b3b952de26f297ff1dbad48c71d25116a6c54335 --- /dev/null +++ b/tools/bench/chameleon_guix_nmad.sl @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +#SBATCH --exclusive +#SBATCH --ntasks-per-node=1 +#SBATCH --threads-per-core=1 + +echo "######################### Chameleon benchmarks #########################" + +# to avoid a lock during fetching chameleon branch in parallel +export XDG_CACHE_HOME=/tmp/guix-$$ + +# save guix commits +guix describe --format=json > guix.json + +# Submit jobs + +# Nmad version +exec guix environment --pure --preserve=^CI --preserve=^SLURM --preserve=^STARPU --preserve=^PADICO --preserve=^PIOM --preserve=PLATFORM --preserve=NODE --preserve=BUILD_OPTIONS chameleon --with-input=openblas=mkl --with-input=openmpi=nmad --ad-hoc slurm jube python python-click python-gitpython python-elasticsearch python-certifi coreutils inetutils util-linux procps grep tar sed gzip which gawk perl zlib openssh hwloc nmad starpu mkl -- /bin/bash --norc $CI_PROJECT_DIR/tools/bench/chameleon_guix.sh nmad + +echo "####################### End Chameleon benchmarks #######################" + +# clean tmp +rm -rf /tmp/guix-$$ diff --git a/tools/bench/chameleon_guix_openmpi.sl b/tools/bench/chameleon_guix_openmpi.sl new file mode 100644 index 0000000000000000000000000000000000000000..fd84573afffc28ac15ce54f9c41cd7bd3d6bcf48 --- /dev/null +++ b/tools/bench/chameleon_guix_openmpi.sl @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +#SBATCH --exclusive +#SBATCH --ntasks-per-node=1 +#SBATCH --threads-per-core=1 + +echo "######################### Chameleon benchmarks #########################" + +# to avoid a lock during fetching chameleon branch in parallel +export XDG_CACHE_HOME=/tmp/guix-$$ + +# save guix commits +guix describe --format=json > guix.json + +# Submit jobs + +# OpenMPI version +exec guix environment --pure --preserve=^CI --preserve=^SLURM --preserve=^STARPU --preserve=PLATFORM --preserve=NODE --preserve=BUILD_OPTIONS chameleon --with-input=openblas=mkl --ad-hoc slurm jube python python-click python-gitpython python-elasticsearch python-certifi sed coreutils grep gawk openssh perl hwloc openmpi starpu mkl -- /bin/bash --norc $CI_PROJECT_DIR/tools/bench/chameleon_guix.sh openmpi + +echo "####################### End Chameleon benchmarks #######################" + +# clean tmp +rm -rf /tmp/guix-$$ diff --git a/tools/bench/jube/add_result.py b/tools/bench/jube/add_result.py index f5e4502cbcb6f18d16b299c1213b5ab73aab890d..63a68d6d27dc93ccc39efe81711b6f4ab8c02a0a 100755 --- a/tools/bench/jube/add_result.py +++ b/tools/bench/jube/add_result.py @@ -32,24 +32,23 @@ def open_csv(filename: str) -> List[Dict[str, str]]: return csv_rows -def format_entry(row: Row, commit_chameleon: Repo, commit_guix: str, commit_guix_hpc: str, commit_guix_hpcnonfree: str) -> Dict[str, Any]: +def format_entry(row: Row, mpivendor: str, commit_chameleon: Repo, commit_guix: str, commit_guix_hpc: str, commit_guix_hpcnonfree: str) -> Dict[str, Any]: """"format a result""" commit_date_chameleon = str(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(commit_chameleon.committed_date))) commit_sha_chameleon = str(commit_chameleon.hexsha) hostname = str(row.pop('hostname')) algorithm = str(row.pop('algorithm')) precision = str(row.pop('precision')) - nmpi = int(row.pop('NMPI')) - tdbc_p = int(row.pop('P')) - tdbc_q = int(row.pop('Q')) - nthread = int(row.pop('NTHREAD')) - ngpu = int(row.pop('NGPU')) - m = int(row.pop('M')) - n = int(row.pop('N')) - k = int(row.pop('K')) - cputime = float(row.pop('CPUTIME')) - gflops = float(row.pop('GFLOPS')) - stddev = float(row.pop('STDDEV')) + nmpi = int(row.pop('nmpi')) + p = int(row.pop('p')) + q = int(row.pop('q')) + nthread = int(row.pop('nthr')) + ngpu = int(row.pop('ngpu')) + m = int(row.pop('m')) + n = int(row.pop('n')) + k = int(row.pop('k')) + cputime = float(row.pop('cputime')) + gflops = float(row.pop('gflops')) result = { "Commit_date_chameleon": commit_date_chameleon, "Commit_sha_chameleon": commit_sha_chameleon, @@ -57,19 +56,19 @@ def format_entry(row: Row, commit_chameleon: Repo, commit_guix: str, commit_guix "Commit_sha_guix_hpc": commit_guix_hpc, "Commit_sha_guix_hpcnonfree": commit_guix_hpcnonfree, "Hostname": hostname, + "MPIvendor": mpivendor, "Algorithm": algorithm, "Precision": precision, "Nmpi": nmpi, - "P": tdbc_p, - "Q": tdbc_q, + "P": p, + "Q": q, "Nthread": nthread, "Ngpu": ngpu, "M": m, "N": n, "K": k, "Cputime": cputime, - "Gflops": gflops, - "Stddev": stddev + "Gflops": gflops } return result @@ -80,6 +79,7 @@ def format_entry(row: Row, commit_chameleon: Repo, commit_guix: str, commit_guix @click.option("-t", "--team", required=True, help="team name") @click.option("-p", "--project", required=True, help="project name") @click.option("-h", "--host", required=True, help="host name") +@click.option("-m", "--mpi", required=True, help="MPI vendor (openmpi, nmad)") @click.argument("csv-files", nargs=-1) def main( directory: str, @@ -87,6 +87,7 @@ def main( team: str, project: str, host: str, + mpi: str, csv_files: str, ): """Add a result to an elasticsearch database.""" @@ -104,6 +105,7 @@ def main( "Commit_sha_guix_hpc": {"type": "keyword"}, "Commit_sha_guix_hpcnonfree": {"type": "keyword"}, "Hostname": {"type": "keyword"}, + "MPIvendor": {"type": "keyword"}, "Algorithm": {"type": "keyword"}, "Precision": {"type": "keyword"}, "Nmpi": {"type": "integer"}, @@ -115,8 +117,7 @@ def main( "N": {"type": "integer"}, "K": {"type": "integer"}, "Cputime": {"type": "float"}, - "Gflops": {"type": "float"}, - "Stddev": {"type": "float"} + "Gflops": {"type": "float"} } } } @@ -141,7 +142,7 @@ def main( request for file in csv_files for request in map( - lambda row: format_entry(row, commit_chameleon, commit_guix, commit_guix_hpc, commit_guix_hpcnonfree), + lambda row: format_entry(row, mpi, commit_chameleon, commit_guix, commit_guix_hpc, commit_guix_hpcnonfree), open_csv(file) ) ] diff --git a/tools/bench/jube/paths.xml b/tools/bench/jube/paths.xml deleted file mode 100644 index b92cad88a8aa3cb975df3c4784c2b2cf73f910d1..0000000000000000000000000000000000000000 --- a/tools/bench/jube/paths.xml +++ /dev/null @@ -1,6 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<jube> - <parameterset name="paths"> - <parameter name="BIN_DIR" type="string">{{CHAMELEON_DIR}}</parameter> - </parameterset> -</jube> diff --git a/tools/bench/jube/patterns.xml b/tools/bench/jube/patterns.xml index 05c8f976db026952d3aa8e6da79aa2dc51e62191..6770537d74c7b50042ee672913da399409532104 100644 --- a/tools/bench/jube/patterns.xml +++ b/tools/bench/jube/patterns.xml @@ -2,16 +2,7 @@ <jube> <!-- Regex pattern --> <patternset name="chameleon"> - <pattern name="NMPI" type="int"># Nb mpi: *(\d+)</pattern> - <pattern name="P" type="int"># PxQ: *(\d+)x\d+</pattern> - <pattern name="Q" type="int"># PxQ: *\d+x(\d+)</pattern> - <pattern name="NTHREAD" type="int"># Nb threads: *(\d+)</pattern> - <pattern name="NGPU" type="int"># Nb GPUs: *(\d+)</pattern> - <pattern name="M" type="int">^ *(\d+) *\d+ *\d+ *\d+\.?\d+ *\d+\.?\d+ \+\- *\d+\.?\d+ *</pattern> - <pattern name="N" type="int">^ *\d+ *(\d+) *\d+ *\d+\.?\d+ *\d+\.?\d+ \+\- *\d+\.?\d+ *</pattern> - <pattern name="K" type="int">^ *\d+ *\d+ *(\d+) *\d+\.?\d+ *\d+\.?\d+ \+\- *\d+\.?\d+ *</pattern> - <pattern name="CPUTIME" type="float">^ *\d+ *\d+ *\d+ *(\d+\.?\d+) *\d+\.?\d+ \+\- *\d+\.?\d+ *</pattern> - <pattern name="GFLOPS" type="float">^ *\d+ *\d+ *\d+ *\d+\.?\d+ *(\d+\.?\d+) \+\- *\d+\.?\d+ *</pattern> - <pattern name="STDDEV" type="float">^ *\d+ *\d+ *\d+ *\d+\.?\d+ *\d+\.?\d+ \+\- *(\d+\.?\d+) *</pattern> + <pattern name="cputime" type="float">^$jube_pat_nint;$jube_pat_nwrd;.*;$jube_pat_fp;$jube_pat_nfp$</pattern> + <pattern name="gflops" type="float">^$jube_pat_nint;$jube_pat_nwrd;.*;$jube_pat_nfp;$jube_pat_fp$</pattern> </patternset> </jube> diff --git a/tools/bench/plafrim/bora/chameleon_nmad.xml b/tools/bench/plafrim/bora/chameleon_nmad.xml new file mode 100644 index 0000000000000000000000000000000000000000..522da430c413789cda55d4e0cc8445a651d41eb0 --- /dev/null +++ b/tools/bench/plafrim/bora/chameleon_nmad.xml @@ -0,0 +1,108 @@ +<?xml version="1.0" encoding="UTF-8"?> +<jube> + <benchmark name="bora" outpath="results"> + <comment>benchmark chameleon on host plafrim bora</comment> + + <parameterset name="param_gemm"> + <parameter name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">gemm</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + </parameterset> + + <parameterset name="param_potrf"> + <parameter name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">potrf</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" type="int" >1</parameter> + </parameterset> + + <parameterset name="param_geqrf"> + <parameter name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">geqrf_hqr</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" type="int" >1</parameter> + </parameterset> + + <!-- Operation --> + <step name="run_gemm" tag="gemm"> + <use>param_gemm</use> + <do>mpiexec -DPIOM_DEDICATED=1 -DPIOM_DEDICATED_WAIT=1 -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + <step name="run_potrf" tag="potrf"> + <use>param_potrf</use> + <do>mpiexec -DPIOM_DEDICATED=1 -DPIOM_DEDICATED_WAIT=1 -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + <step name="run_geqrf_hqr" tag="geqrf"> + <use>param_geqrf</use> + <do>mpiexec -DPIOM_DEDICATED=1 -DPIOM_DEDICATED_WAIT=1 -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + + <!-- Analyse --> + <analyser name="analyse"> + <!-- use a pattern set --> + <use from="../../jube/patterns.xml">chameleon</use> + <analyse step="run_gemm" tag="gemm"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + <analyse step="run_potrf" tag="potrf"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + <analyse step="run_geqrf_hqr" tag="geqrf"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + </analyser> + + + <!-- Create result table --> + <result> + <use>analyse</use> <!-- use existing analyser --> + <!--<table name="result" style="csv" sort="number">--> + <table name="result" style="csv"> + <column>hostname</column> + <column>algorithm</column> + <column>precision</column> + <column>nmpi</column> + <column>p</column> + <column>q</column> + <column>nthr</column> + <column>ngpu</column> + <column>m</column> + <column>n</column> + <column>k</column> + <column>cputime</column> + <column>gflops</column> + </table> + </result> + </benchmark> +</jube> diff --git a/tools/bench/plafrim/bora/chameleon_openmpi.xml b/tools/bench/plafrim/bora/chameleon_openmpi.xml new file mode 100644 index 0000000000000000000000000000000000000000..fe80ac54b44ce1078f140c6f9beb72cd6b41fa92 --- /dev/null +++ b/tools/bench/plafrim/bora/chameleon_openmpi.xml @@ -0,0 +1,108 @@ +<?xml version="1.0" encoding="UTF-8"?> +<jube> + <benchmark name="bora" outpath="results"> + <comment>benchmark chameleon on host plafrim bora</comment> + + <parameterset name="param_gemm"> + <parameter name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">gemm</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + </parameterset> + + <parameterset name="param_potrf"> + <parameter name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">potrf</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" type="int" >1</parameter> + </parameterset> + + <parameterset name="param_geqrf"> + <parameter name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">geqrf_hqr</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" type="int" >1</parameter> + </parameterset> + + <!-- Operation --> + <step name="run_gemm" tag="gemm"> + <use>param_gemm</use> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + <step name="run_potrf" tag="potrf"> + <use>param_potrf</use> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + <step name="run_geqrf_hqr" tag="geqrf"> + <use>param_geqrf</use> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + + <!-- Analyse --> + <analyser name="analyse"> + <!-- use a pattern set --> + <use from="../../jube/patterns.xml">chameleon</use> + <analyse step="run_gemm" tag="gemm"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + <analyse step="run_potrf" tag="potrf"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + <analyse step="run_geqrf_hqr" tag="geqrf"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + </analyser> + + + <!-- Create result table --> + <result> + <use>analyse</use> <!-- use existing analyser --> + <!--<table name="result" style="csv" sort="number">--> + <table name="result" style="csv"> + <column>hostname</column> + <column>algorithm</column> + <column>precision</column> + <column>nmpi</column> + <column>p</column> + <column>q</column> + <column>nthr</column> + <column>ngpu</column> + <column>m</column> + <column>n</column> + <column>k</column> + <column>cputime</column> + <column>gflops</column> + </table> + </result> + </benchmark> +</jube> diff --git a/tools/bench/plafrim/bora/run.sh b/tools/bench/plafrim/bora/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..baf9e742435029acc751944c4dee659e08fd5fab --- /dev/null +++ b/tools/bench/plafrim/bora/run.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +echo "######################### Chameleon benchmarks #########################" +echo "HOSTNAME $HOSTNAME" +echo "USERNAME $USERNAME" +echo "GIT REPO $CI_REPOSITORY_URL" +echo "GIT BRANCH $CI_COMMIT_REF_NAME" +echo "GIT COMMIT $CI_COMMIT_SHA" +echo "PROJECT DIR $CI_PROJECT_DIR" + +set -x + +function wait_completion { + # Wait for completion of jobs + echo "JOB_LIST $JOB_LIST" + while [ "$NJOB" -gt 0 ] + do + for JOB in $JOB_LIST + do + IS_JOB_IN_QUEUE=`squeue |grep "$JOB"` + if [[ -z "$IS_JOB_IN_QUEUE" ]] + then + NJOB=$[NJOB-1] + JOB_LIST=`echo $JOB_LIST | sed "s#$JOB##"` + echo "JOB $JOB finished" + else + echo "$IS_JOB_IN_QUEUE" + fi + done + sleep 30 + done +} + +# Parameters for scripts +export PLATFORM=plafrim +export NODE=bora +export BUILD_OPTIONS="-DCHAMELEON_USE_MPI=ON -DCMAKE_BUILD_TYPE=Release" +export STARPU_SILENT=1 +#export STARPU_LIMIT_CPU_MEM=180000 +#export STARPU_LIMIT_MAX_SUBMITTED_TASKS=16000 +#export STARPU_LIMIT_MIN_SUBMITTED_TASKS=15000 + +# Parameters of the Slurm jobs +TIME=01:00:00 +PART=routage +CONS=bora +EXCL= +NP=9 + +# Submit jobs +NJOB=0 +MPI_LIST="openmpi nmad" +#MPI_LIST="nmad" +for MPI in $MPI_LIST +do + JOB_ID=`JOB_NAME=chameleon\_$MPI\_$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL $CI_PROJECT_DIR/tools/bench/chameleon_guix\_$MPI.sl | sed "s#Submitted batch job ##"` + #JOB_ID=`JOB_NAME=chameleon\_$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL chameleon_guix.sl | sed "s#Submitted batch job ##"` + if [[ -n "$JOB_ID" ]] + then + JOB_LIST="$JOB_LIST $JOB_ID" + NJOB=$[NJOB+1] + fi +done + +# Wait for completion of jobs +wait_completion + +echo "####################### End Chameleon benchmarks #######################" + +exit 0 diff --git a/tools/bench/plafrim/miriel/chameleon_guix.sh b/tools/bench/plafrim/miriel/chameleon_guix.sh deleted file mode 100755 index a31d935ee83b87dbe533306febd66a62b3f0c4aa..0000000000000000000000000000000000000000 --- a/tools/bench/plafrim/miriel/chameleon_guix.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -x - -# Configure and Build Chameleon -echo $VERSION -cd ../../../../build-$VERSION -CHAMELEON_DIR=`pwd` -cmake $BUILD_OPTIONS .. -make -j5 -cd - - -# Define where to find the build directory for jube -sed 's@{{CHAMELEON_DIR}}@'"${CHAMELEON_DIR}"'@g' -i ../../jube/paths.xml - -# Execute jube benchmarks -jube run chameleon.xml --tag gemm potrf geqrf -# jube analysis -jube analyse results/$VERSION/ -# jube report -jube result results/$VERSION/ -i last > chameleon.csv - -# send results to the elasticsearch server -export PYTHONPATH=$GUIX_ENVIRONMENT/lib/python3.7/site-packages -python3 ../../jube/add_result.py -e https://elasticsearch.bordeaux.inria.fr -t hiepacs -p "chameleon" -h $VERSION chameleon.csv diff --git a/tools/bench/plafrim/miriel/chameleon_guix.sl b/tools/bench/plafrim/miriel/chameleon_guix.sl deleted file mode 100644 index 8c3dd9d6e85e5238d4fcd10ad329fd10bd33ef2b..0000000000000000000000000000000000000000 --- a/tools/bench/plafrim/miriel/chameleon_guix.sl +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash -#SBATCH --exclusive -#SBATCH --ntasks-per-node=1 -#SBATCH --threads-per-core=1 - -echo "######################### Chameleon benchmarks #########################" -echo "HOSTNAME $HOSTNAME" -echo "USERNAME $USERNAME" -echo "GIT REPO $CI_REPOSITORY_URL" -echo "GIT BRANCH $CI_COMMIT_REF_NAME" -echo "GIT COMMIT $CI_COMMIT_SHA" - -# to avoid a lock during fetching chameleon branch in parallel -export XDG_CACHE_HOME=/tmp/guix-$$ - -# save guix commits -guix describe --format=json > guix.json - -# Submit jobs -exec guix environment --pure --preserve=SLURM --preserve=VERSION --preserve=BUILD_OPTIONS chameleon --with-input=openblas=mkl --ad-hoc slurm jube python python-click python-gitpython python-elasticsearch python-certifi sed coreutils grep gawk openssh perl hwloc openmpi starpu mkl -- /bin/bash --norc chameleon_guix.sh - -echo "####################### End Chameleon benchmarks #######################" - -# clean tmp -rm -rf /tmp/guix-$$ diff --git a/tools/bench/plafrim/miriel/chameleon.xml b/tools/bench/plafrim/miriel/chameleon_nmad.xml similarity index 75% rename from tools/bench/plafrim/miriel/chameleon.xml rename to tools/bench/plafrim/miriel/chameleon_nmad.xml index 98d369ff2fd038eea344e1a350d8ee6fa5994103..28be9ef0b711e22132deefe24d17f83e7909c0c8 100644 --- a/tools/bench/plafrim/miriel/chameleon.xml +++ b/tools/bench/plafrim/miriel/chameleon_nmad.xml @@ -15,9 +15,9 @@ <parameter name="ngpu" type="int" >0</parameter> <parameter name="b" type="int" >320</parameter> <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> - <parameter name="m" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> - <parameter name="k" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> - <parameter name="n" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> </parameterset> <parameterset name="param_potrf"> @@ -32,8 +32,8 @@ <parameter name="ngpu" type="int" >0</parameter> <parameter name="b" type="int" >320</parameter> <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> - <parameter name="m" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> - <parameter name="n" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> <parameter name="k" type="int" >1</parameter> </parameterset> @@ -49,26 +49,23 @@ <parameter name="ngpu" type="int" >0</parameter> <parameter name="b" type="int" >320</parameter> <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> - <parameter name="m" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> - <parameter name="n" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> <parameter name="k" type="int" >1</parameter> </parameterset> <!-- Operation --> <step name="run_gemm" tag="gemm"> <use>param_gemm</use> - <use from="../../jube/paths.xml">paths</use> - <do>STARPU_SILENT=1 mpiexec -np $nmpi $BIN_DIR/timing/time_${precision}${algorithm}_tile -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b --niter=3</do> + <do>mpiexec -DPIOM_DEDICATED=1 -DPIOM_DEDICATED_WAIT=1 -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> </step> <step name="run_potrf" tag="potrf"> <use>param_potrf</use> - <use from="../../jube/paths.xml">paths</use> - <do>STARPU_SILENT=1 mpiexec -np $nmpi $BIN_DIR/timing/time_${precision}${algorithm}_tile -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b --niter=3</do> + <do>mpiexec -DPIOM_DEDICATED=1 -DPIOM_DEDICATED_WAIT=1 -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> </step> <step name="run_geqrf_hqr" tag="geqrf"> <use>param_geqrf</use> - <use from="../../jube/paths.xml">paths</use> - <do>STARPU_SILENT=1 mpiexec -np $nmpi $BIN_DIR/timing/time_${precision}${algorithm}_tile -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b --niter=3</do> + <do>mpiexec -DPIOM_DEDICATED=1 -DPIOM_DEDICATED_WAIT=1 -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> </step> <!-- Analyse --> @@ -95,17 +92,16 @@ <column>hostname</column> <column>algorithm</column> <column>precision</column> - <column>NMPI</column> - <column>P</column> - <column>Q</column> - <column>NTHREAD</column> - <column>NGPU</column> - <column>M</column> - <column>N</column> - <column>K</column> - <column>CPUTIME</column> - <column>GFLOPS</column> - <column>STDDEV</column> + <column>nmpi</column> + <column>p</column> + <column>q</column> + <column>nthr</column> + <column>ngpu</column> + <column>m</column> + <column>n</column> + <column>k</column> + <column>cputime</column> + <column>gflops</column> </table> </result> </benchmark> diff --git a/tools/bench/plafrim/miriel/chameleon_openmpi.xml b/tools/bench/plafrim/miriel/chameleon_openmpi.xml new file mode 100644 index 0000000000000000000000000000000000000000..d6b70cc8417353a590740a17fae67ed07fd5a040 --- /dev/null +++ b/tools/bench/plafrim/miriel/chameleon_openmpi.xml @@ -0,0 +1,108 @@ +<?xml version="1.0" encoding="UTF-8"?> +<jube> + <benchmark name="miriel" outpath="results/miriel"> + <comment>benchmark chameleon on host plafrim miriel</comment> + + <parameterset name="param_gemm"> + <parameter name="hostname" type="string">miriel</parameter> + <parameter name="algorithm" type="string">gemm</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >22</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + </parameterset> + + <parameterset name="param_potrf"> + <parameter name="hostname" type="string">miriel</parameter> + <parameter name="algorithm" type="string">potrf</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >22</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" type="int" >1</parameter> + </parameterset> + + <parameterset name="param_geqrf"> + <parameter name="hostname" type="string">miriel</parameter> + <parameter name="algorithm" type="string">geqrf_hqr</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1, 2</parameter> + <parameter name="p" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> + <parameter name="nthr" type="int" >22</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >320</parameter> + <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" type="int" >1</parameter> + </parameterset> + + <!-- Operation --> + <step name="run_gemm" tag="gemm"> + <use>param_gemm</use> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + <step name="run_potrf" tag="potrf"> + <use>param_potrf</use> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + <step name="run_geqrf_hqr" tag="geqrf"> + <use>param_geqrf</use> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> + </step> + + <!-- Analyse --> + <analyser name="analyse"> + <!-- use a pattern set --> + <use from="../../jube/patterns.xml">chameleon</use> + <analyse step="run_gemm" tag="gemm"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + <analyse step="run_potrf" tag="potrf"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + <analyse step="run_geqrf_hqr" tag="geqrf"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + </analyser> + + + <!-- Create result table --> + <result> + <use>analyse</use> <!-- use existing analyser --> + <!--<table name="result" style="csv" sort="number">--> + <table name="result" style="csv"> + <column>hostname</column> + <column>algorithm</column> + <column>precision</column> + <column>nmpi</column> + <column>p</column> + <column>q</column> + <column>nthr</column> + <column>ngpu</column> + <column>m</column> + <column>n</column> + <column>k</column> + <column>cputime</column> + <column>gflops</column> + </table> + </result> + </benchmark> +</jube> diff --git a/tools/bench/plafrim/miriel/run.sh b/tools/bench/plafrim/miriel/run.sh index c5e21926e3aa1a00f5389fd017e9f4814645a97d..d37974b0381fb086db07d620188eefe9c3e99fd5 100755 --- a/tools/bench/plafrim/miriel/run.sh +++ b/tools/bench/plafrim/miriel/run.sh @@ -6,26 +6,21 @@ echo "USERNAME $USERNAME" echo "GIT REPO $CI_REPOSITORY_URL" echo "GIT BRANCH $CI_COMMIT_REF_NAME" echo "GIT COMMIT $CI_COMMIT_SHA" +echo "PROJECT DIR $CI_PROJECT_DIR" -# Parameters of the Slurm jobs -TIME=01:00:00 -PART=court -CONS=MirielIB -EXCL= -NP=9 -JOBSLIM=1 +set -x function wait_completion { # Wait for completion of jobs echo "JOB_LIST $JOB_LIST" - while [ "$ITER" -ge "$JOBSLIM" ] + while [ "$NJOB" -gt 0 ] do for JOB in $JOB_LIST do IS_JOB_IN_QUEUE=`squeue |grep "$JOB"` if [[ -z "$IS_JOB_IN_QUEUE" ]] then - ITER=$[ITER-1] + NJOB=$[NJOB-1] JOB_LIST=`echo $JOB_LIST | sed "s#$JOB##"` echo "JOB $JOB finished" else @@ -36,22 +31,40 @@ function wait_completion { done } +# Parameters for scripts +export PLATFORM=plafrim +export NODE=miriel +export BUILD_OPTIONS="-DCHAMELEON_USE_MPI=ON -DCMAKE_BUILD_TYPE=Release" +export STARPU_SILENT=1 +export STARPU_LIMIT_CPU_MEM=120000 +export STARPU_LIMIT_MAX_SUBMITTED_TASKS=16000 +export STARPU_LIMIT_MIN_SUBMITTED_TASKS=15000 + +# Parameters of the Slurm jobs +TIME=01:00:00 +PART=routage +CONS="miriel,omnipath" +EXCL= +NP=9 # Submit jobs -ITER=0 -JOB_ID=`JOB_NAME=chameleon_bench\_$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL chameleon_guix.sl | sed "s#Submitted batch job ##"` -if [[ -n "$JOB_ID" ]] -then - JOB_LIST="$JOB_LIST $JOB_ID" - ITER=$[ITER+1] -fi +NJOB=0 +MPI_LIST="openmpi nmad" +#MPI_LIST="nmad" +for MPI in $MPI_LIST +do + JOB_ID=`JOB_NAME=chameleon\_$MPI\_$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL $CI_PROJECT_DIR/tools/bench/chameleon_guix\_$MPI.sl | sed "s#Submitted batch job ##"` + #JOB_ID=`JOB_NAME=chameleon\_$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL chameleon_guix.sl | sed "s#Submitted batch job ##"` + if [[ -n "$JOB_ID" ]] + then + JOB_LIST="$JOB_LIST $JOB_ID" + NJOB=$[NJOB+1] + fi +done # Wait for completion of jobs wait_completion -# Print results -cat chameleon_bench\_$NP.out - echo "####################### End Chameleon benchmarks #######################" exit 0 diff --git a/tools/bench/plafrim/sirocco/chameleon.xml b/tools/bench/plafrim/sirocco/chameleon.xml index 341d035fb0dbdecfd4dcf8a3460cb43eb013ff28..fda2c21c7db4a7c204dc1ab1da99003b4194181d 100644 --- a/tools/bench/plafrim/sirocco/chameleon.xml +++ b/tools/bench/plafrim/sirocco/chameleon.xml @@ -15,9 +15,9 @@ <parameter name="ngpu" type="int" >2</parameter> <parameter name="b" type="int" >1600</parameter> <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> - <parameter name="m" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> - <parameter name="k" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> - <parameter name="n" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="k" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> </parameterset> <parameterset name="param_potrf"> @@ -32,8 +32,8 @@ <parameter name="ngpu" type="int" >2</parameter> <parameter name="b" type="int" >1600</parameter> <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> - <parameter name="m" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> - <parameter name="n" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> <parameter name="k" type="int" >1</parameter> </parameterset> @@ -46,29 +46,26 @@ <parameter name="q" mode="python" type="int" >[1, 2, 3][$i_pq]</parameter> <parameter name="nmpi" mode="python" type="int" >[1, 4, 9][$i_pq]</parameter> <parameter name="nthr" type="int" >29</parameter> - <parameter name="ngpu" type="int" >0</parameter> + <parameter name="ngpu" type="int" >2</parameter> <parameter name="b" type="int" >1600</parameter> <parameter name="i_mn" type="int" >0, 1, 2, 3, 4</parameter> - <parameter name="m" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> - <parameter name="n" mode="python" type="int" >[${p}*${b}, ${p}*5*${b}, ${p}*10*${b}, ${p}*20*${b}, ${p}*50*${b}][$i_mn]</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*5*${b}, ${nmpi}*10*${b}, ${nmpi}*20*${b}, ${nmpi}*40*${b}][$i_mn]</parameter> <parameter name="k" type="int" >1</parameter> </parameterset> <!-- Operation --> <step name="run_gemm" tag="gemm"> <use>param_gemm</use> - <use from="../../jube/paths.xml">paths</use> - <do>STARPU_SILENT=1 mpiexec -np $nmpi $BIN_DIR/timing/time_${precision}${algorithm}_tile -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b --niter=3</do> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> </step> <step name="run_potrf" tag="potrf"> <use>param_potrf</use> - <use from="../../jube/paths.xml">paths</use> - <do>STARPU_SILENT=1 mpiexec -np $nmpi $BIN_DIR/timing/time_${precision}${algorithm}_tile -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b --niter=3</do> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> </step> <step name="run_geqrf_hqr" tag="geqrf"> <use>param_geqrf</use> - <use from="../../jube/paths.xml">paths</use> - <do>STARPU_SILENT=1 mpiexec -np $nmpi $BIN_DIR/timing/time_${precision}${algorithm}_tile -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b --niter=3</do> + <do>mpiexec -np $nmpi $CHAMELEON_BUILD/new-testing/${precision}new-testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</do> </step> <!-- Analyse --> @@ -95,17 +92,16 @@ <column>hostname</column> <column>algorithm</column> <column>precision</column> - <column>NMPI</column> - <column>P</column> - <column>Q</column> - <column>NTHREAD</column> - <column>NGPU</column> - <column>M</column> - <column>N</column> - <column>K</column> - <column>CPUTIME</column> - <column>GFLOPS</column> - <column>STDDEV</column> + <column>nmpi</column> + <column>p</column> + <column>q</column> + <column>nthr</column> + <column>ngpu</column> + <column>m</column> + <column>n</column> + <column>k</column> + <column>cputime</column> + <column>gflops</column> </table> </result> </benchmark> diff --git a/tools/bench/plafrim/sirocco/chameleon_guix.sh b/tools/bench/plafrim/sirocco/chameleon_guix.sh deleted file mode 100755 index a31d935ee83b87dbe533306febd66a62b3f0c4aa..0000000000000000000000000000000000000000 --- a/tools/bench/plafrim/sirocco/chameleon_guix.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -x - -# Configure and Build Chameleon -echo $VERSION -cd ../../../../build-$VERSION -CHAMELEON_DIR=`pwd` -cmake $BUILD_OPTIONS .. -make -j5 -cd - - -# Define where to find the build directory for jube -sed 's@{{CHAMELEON_DIR}}@'"${CHAMELEON_DIR}"'@g' -i ../../jube/paths.xml - -# Execute jube benchmarks -jube run chameleon.xml --tag gemm potrf geqrf -# jube analysis -jube analyse results/$VERSION/ -# jube report -jube result results/$VERSION/ -i last > chameleon.csv - -# send results to the elasticsearch server -export PYTHONPATH=$GUIX_ENVIRONMENT/lib/python3.7/site-packages -python3 ../../jube/add_result.py -e https://elasticsearch.bordeaux.inria.fr -t hiepacs -p "chameleon" -h $VERSION chameleon.csv diff --git a/tools/bench/plafrim/sirocco/chameleon_guix.sl b/tools/bench/plafrim/sirocco/chameleon_guix.sl deleted file mode 100644 index 8c3dd9d6e85e5238d4fcd10ad329fd10bd33ef2b..0000000000000000000000000000000000000000 --- a/tools/bench/plafrim/sirocco/chameleon_guix.sl +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash -#SBATCH --exclusive -#SBATCH --ntasks-per-node=1 -#SBATCH --threads-per-core=1 - -echo "######################### Chameleon benchmarks #########################" -echo "HOSTNAME $HOSTNAME" -echo "USERNAME $USERNAME" -echo "GIT REPO $CI_REPOSITORY_URL" -echo "GIT BRANCH $CI_COMMIT_REF_NAME" -echo "GIT COMMIT $CI_COMMIT_SHA" - -# to avoid a lock during fetching chameleon branch in parallel -export XDG_CACHE_HOME=/tmp/guix-$$ - -# save guix commits -guix describe --format=json > guix.json - -# Submit jobs -exec guix environment --pure --preserve=SLURM --preserve=VERSION --preserve=BUILD_OPTIONS chameleon --with-input=openblas=mkl --ad-hoc slurm jube python python-click python-gitpython python-elasticsearch python-certifi sed coreutils grep gawk openssh perl hwloc openmpi starpu mkl -- /bin/bash --norc chameleon_guix.sh - -echo "####################### End Chameleon benchmarks #######################" - -# clean tmp -rm -rf /tmp/guix-$$ diff --git a/tools/bench/plafrim/sirocco/run.sh b/tools/bench/plafrim/sirocco/run.sh index 5c3f5e3c2de3fb4e019ad99c237f66ba64991e3a..e082b5a7958626374e66f5826c796fbe156554ec 100755 --- a/tools/bench/plafrim/sirocco/run.sh +++ b/tools/bench/plafrim/sirocco/run.sh @@ -6,52 +6,65 @@ echo "USERNAME $USERNAME" echo "GIT REPO $CI_REPOSITORY_URL" echo "GIT BRANCH $CI_COMMIT_REF_NAME" echo "GIT COMMIT $CI_COMMIT_SHA" +echo "PROJECT DIR $CI_PROJECT_DIR" -# Parameters of the Slurm jobs -TIME=01:00:00 -PART=court_sirocco -CONS=Skylake -EXCL= -NP=1 -JOBSLIM=1 +set -x function wait_completion { # Wait for completion of jobs echo "JOB_LIST $JOB_LIST" - while [ "$ITER" -ge "$JOBSLIM" ] + while [ "$NJOB" -gt 0 ] do - for JOB in $JOB_LIST - do - IS_JOB_IN_QUEUE=`squeue |grep "$JOB"` - if [[ -z "$IS_JOB_IN_QUEUE" ]] - then - ITER=$[ITER-1] - JOB_LIST=`echo $JOB_LIST | sed "s#$JOB##"` - echo "JOB $JOB finished" - else - echo "$IS_JOB_IN_QUEUE" - fi - done - sleep 30 + for JOB in $JOB_LIST + do + IS_JOB_IN_QUEUE=`squeue |grep "$JOB"` + if [[ -z "$IS_JOB_IN_QUEUE" ]] + then + NJOB=$[NJOB-1] + JOB_LIST=`echo $JOB_LIST | sed "s#$JOB##"` + echo "JOB $JOB finished" + else + echo "$IS_JOB_IN_QUEUE" + fi + done + sleep 30 done } +# Parameters for scripts +export PLATFORM=plafrim +export NODE=v100 +export BUILD_OPTIONS="-DCHAMELEON_USE_CUDA=ON -DCHAMELEON_USE_MPI=ON -DCMAKE_BUILD_TYPE=Release" +export STARPU_SILENT=1 +export STARPU_LIMIT_CPU_MEM=370000 +export STARPU_LIMIT_MAX_SUBMITTED_TASKS=16000 +export STARPU_LIMIT_MIN_SUBMITTED_TASKS=15000 + +# Parameters of the Slurm jobs +TIME=01:00:00 +PART=routage +CONS=v100 +EXCL= +NP=1 # Submit jobs -ITER=0 -JOB_ID=`JOB_NAME=chameleon_bench\_$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS chameleon_guix.sl | sed "s#Submitted batch job ##"` -if [[ -n "$JOB_ID" ]] -then - JOB_LIST="$JOB_LIST $JOB_ID" - ITER=$[ITER+1] -fi +NJOB=0 +#MPI_LIST="openmpi nmad" +MPI_LIST="openmpi" +for MPI in $MPI_LIST +do + JOB_ID=`JOB_NAME=chameleon\_$MPI\_$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL $CI_PROJECT_DIR/tools/bench/chameleon_guix\_$MPI\_cuda.sl | sed "s#Submitted batch job ##"` + #JOB_ID=`JOB_NAME=chameleon\_$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL chameleon_guix.sl | sed "s#Submitted batch job ##"` + if [[ -n "$JOB_ID" ]] + then + JOB_LIST="$JOB_LIST $JOB_ID" + NJOB=$[NJOB+1] + fi +done # Wait for completion of jobs wait_completion -# Print results -cat chameleon_bench\_$NP.out - echo "####################### End Chameleon benchmarks #######################" exit 0