From dff21c1ae52fd5377d7f8154e269ce0de619a596 Mon Sep 17 00:00:00 2001 From: Florent Pruvost <florent.pruvost@inria.fr> Date: Tue, 30 Apr 2024 11:23:08 +0200 Subject: [PATCH] Update benchmarks: * add regression test on the gflops * add -test jube files to test on minimal cases * improve error handling --- .gitlab/bench_plafrim.yml | 5 +- tools/bench/chameleon_guix.sh | 33 +-- tools/bench/guix-channels.scm | 21 +- tools/bench/jube/add_result.py | 230 ++++++++++++++++-- tools/bench/plafrim/chameleon-test.xml | 56 +++++ .../parameters/bora/parameters-test.xml | 54 ++++ .../parameters/sirocco/parameters-test.xml | 54 ++++ tools/bench/plafrim/run.sh | 41 ++-- tools/bench/plafrim/slurm.sh | 41 +--- 9 files changed, 430 insertions(+), 105 deletions(-) create mode 100644 tools/bench/plafrim/chameleon-test.xml create mode 100644 tools/bench/plafrim/parameters/bora/parameters-test.xml create mode 100644 tools/bench/plafrim/parameters/sirocco/parameters-test.xml diff --git a/.gitlab/bench_plafrim.yml b/.gitlab/bench_plafrim.yml index d47728d70..7e532985a 100644 --- a/.gitlab/bench_plafrim.yml +++ b/.gitlab/bench_plafrim.yml @@ -9,14 +9,15 @@ - git submodule update --init --recursive script: - ./tools/bench/plafrim/run.sh + allow_failure: true artifacts: name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" expire_in: 1 week + when: always paths: - "chameleon-$NODE-$MPI-$SLURM_NP.err" - "chameleon-$NODE-$MPI-$SLURM_NP.out" - - "tools/bench/plafrim/chameleon.csv" - - "tools/bench/plafrim/results/$JUBE_ID" + - "chameleon-$NODE-$MPI.csv" variables: PLATFORM: plafrim diff --git a/tools/bench/chameleon_guix.sh b/tools/bench/chameleon_guix.sh index d5750de59..1f4ee1235 100755 --- a/tools/bench/chameleon_guix.sh +++ b/tools/bench/chameleon_guix.sh @@ -10,27 +10,30 @@ # @date 2022-02-22 # ### -set -x +set -ex # Configure and Build Chameleon -mkdir -p $CI_PROJECT_DIR/build-$NODE-$MPI -cd $CI_PROJECT_DIR/build-$NODE-$MPI -rm CMake* -rf -cmake $CHAMELEON_BUILD_OPTIONS .. 
-make -j20 VERBOSE=1 -export CHAMELEON_BUILD=$PWD +if [ -d build-$NODE-$MPI ]; then + rm build-$NODE-$MPI -r +fi +cmake -B build-$NODE-$MPI $CHAMELEON_BUILD_OPTIONS +cmake --build build-$NODE-$MPI -j20 --verbose +export CHAMELEON_BUILD=$PWD/build-$NODE-$MPI # clean old benchmarks -cd $CI_PROJECT_DIR/tools/bench/$PLATFORM/results -jube remove --force --id $JUBE_ID +if [ -d tools/bench/$PLATFORM/results ]; then + rm tools/bench/$PLATFORM/results -r +fi # Execute jube benchmarks -cd $CI_PROJECT_DIR/tools/bench/$PLATFORM/ -jube run chameleon.xml --tag gemm potrf geqrf --include-path parameters/$NODE --id $JUBE_ID +jube run tools/bench/$PLATFORM/chameleon.xml --tag gemm potrf geqrf --include-path tools/bench/$PLATFORM/parameters/$NODE --id $JUBE_ID +#jube run tools/bench/$PLATFORM/chameleon-test.xml --tag gemm potrf geqrf --include-path tools/bench/$PLATFORM/parameters/$NODE --id $JUBE_ID # jube analysis -jube analyse results --id $JUBE_ID +jube analyse tools/bench/$PLATFORM/results --id $JUBE_ID # jube report -jube result results --id $JUBE_ID > chameleon.csv +jube result tools/bench/$PLATFORM/results --id $JUBE_ID > chameleon\-$NODE\-$MPI.csv +cat chameleon\-$NODE\-$MPI.csv # send results to the elasticsearch server -cp $CI_PROJECT_DIR/guix.json . 
-python3 $CI_PROJECT_DIR/tools/bench/jube/add_result.py -e https://elasticsearch.bordeaux.inria.fr -t hiepacs -p "chameleon" -h $NODE -m $MPI chameleon.csv +#ls guix.json +python3 tools/bench/jube/add_result.py -e https://elasticsearch.bordeaux.inria.fr -t hiepacs -p "chameleon" -m $MPI chameleon\-$NODE\-$MPI.csv +#python3 tools/bench/jube/add_result.py -e https://elasticsearch.bordeaux.inria.fr -t hiepacs -p "chameleon-test" -m $MPI chameleon\-$NODE\-$MPI.csv diff --git a/tools/bench/guix-channels.scm b/tools/bench/guix-channels.scm index 25eb5c93a..a75ba45a7 100644 --- a/tools/bench/guix-channels.scm +++ b/tools/bench/guix-channels.scm @@ -3,7 +3,7 @@ (url "https://git.savannah.gnu.org/git/guix.git") (branch "master") (commit - "842e491e2e798298246f3449b39d1e3f36fe7d1c") + "8d29f416a9378d30f63c2a95f1bd1a420d9ccab4") (introduction (make-channel-introduction "9edb3f66fd807b096b48283debdcddccfea34bad" @@ -14,13 +14,13 @@ (url "https://gitlab.inria.fr/guix-hpc/guix-hpc-non-free.git") (branch "master") (commit - "c91f254c22f6ae467a7eda65a660d0ae1f0b690a")) + "23d5f240e10f6431e8b6feb57bf20b4def78baa2")) (channel (name 'guix-science-nonfree) (url "https://github.com/guix-science/guix-science-nonfree.git") (branch "master") (commit - "9a3f3824d8ed289832d706679410edadac1202ae") + "417f86b4819bb45df671c3276216e57d2a427156") (introduction (make-channel-introduction "58661b110325fd5d9b40e6f0177cc486a615817e" @@ -31,15 +31,26 @@ (url "https://gitlab.inria.fr/guix-hpc/guix-past") (branch "master") (commit - "1e25b23faa6b1716deaf7e1782becb5da6855942") + "921f845dc0dec9f052dcda479a15e787f9fd5b0a") (introduction (make-channel-introduction "0c119db2ea86a389769f4d2b9c6f5c41c027e336" (openpgp-fingerprint "3CE4 6455 8A84 FDC6 9DB4 0CFB 090B 1199 3D9A EBB5")))) + (channel + (name 'guix-science) + (url "https://github.com/guix-science/guix-science.git") + (branch "master") + (commit + "f85279b8aeac3cc3e6e2aec866841c722c5663fe") + (introduction + (make-channel-introduction + 
"b1fe5aaff3ab48e798a4cce02f0212bc91f423dc" + (openpgp-fingerprint + "CA4F 8CF4 37D7 478F DA05 5FD4 4213 7701 1A37 8446")))) (channel (name 'guix-hpc) (url "https://gitlab.inria.fr/guix-hpc/guix-hpc.git") (branch "master") (commit - "674bc362dea0b233eb84d79d36e25460907afd7a"))) + "2a223f9c125bd3325511d2079c7215f8da0958a7"))) diff --git a/tools/bench/jube/add_result.py b/tools/bench/jube/add_result.py index 95bdb2709..ad8fdbad6 100755 --- a/tools/bench/jube/add_result.py +++ b/tools/bench/jube/add_result.py @@ -9,15 +9,15 @@ # @author Florent Pruvost # @date 2022-02-22 # -from typing import Any, Dict, List, Union -from copy import deepcopy -import json import click import csv +import json +import math +import sys import time -from git import Repo from elasticsearch import Elasticsearch - +from git import Repo +from typing import Any, Dict, List, Union Row = Dict[str, Union[str, float]] @@ -41,7 +41,7 @@ def open_csv(filename: str) -> List[Dict[str, str]]: return csv_rows -def format_entry(row: Row, mpivendor: str, commit_chameleon: Repo, commit_guix: str, commit_guix_hpc: str, commit_guix_hpcnonfree: str) -> Dict[str, Any]: +def format_entry(row: Row, mpivendor: str, commit_chameleon: Repo, guix_commits: Dict) -> Dict[str, Any]: """"format a result""" commit_date_chameleon = str(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(commit_chameleon.committed_date))) commit_sha_chameleon = str(commit_chameleon.hexsha) @@ -61,9 +61,9 @@ def format_entry(row: Row, mpivendor: str, commit_chameleon: Repo, commit_guix: result = { "Commit_date_chameleon": commit_date_chameleon, "Commit_sha_chameleon": commit_sha_chameleon, - "Commit_sha_guix": commit_guix, - "Commit_sha_guix_hpc": commit_guix_hpc, - "Commit_sha_guix_hpcnonfree": commit_guix_hpcnonfree, + "Commit_sha_guix": guix_commits["guix"], + "Commit_sha_guix_hpc": guix_commits["guix-hpc"], + "Commit_sha_guix_hpcnonfree": guix_commits["guix-hpc-non-free"], "Hostname": hostname, "MPIvendor": mpivendor, "Algorithm": algorithm, 
@@ -81,13 +81,127 @@ def format_entry(row: Row, mpivendor: str, commit_chameleon: Repo, commit_guix: } return result +def format_entry_stats(row: Row, mpivendor: str, commit_chameleon: Repo, commit_stats: str, + guix_commits: Dict, es: Elasticsearch, es_index: str): + """"format a result and compute stats: mean and stdev of gflop""" + err = 0 + + # format measures entry + result = format_entry(row, mpivendor, commit_chameleon, guix_commits) + + # prepare default gflops stats entry if not existing + result_stats = { + "Commit_date_chameleon": result['Commit_date_chameleon'], + "Commit_sha_chameleon": result['Commit_sha_chameleon'], + "Commit_sha_guix": result['Commit_sha_guix'], + "Commit_sha_guix_hpc": result['Commit_sha_guix_hpc'], + "Commit_sha_guix_hpcnonfree": result['Commit_sha_guix_hpcnonfree'], + "Hostname": result['Hostname'], + "MPIvendor": result['MPIvendor'], + "Algorithm": result['Algorithm'], + "Precision": result['Precision'], + "Nmpi": result['Nmpi'], + "P": result['P'], + "Q": result['Q'], + "Nthread": result['Nthread'], + "Ngpu": result['Ngpu'], + "M": result['M'], + "N": result['N'], + "K": result['K'], + "mean": format(result['Gflops'], '.1f'), + "stdev": format(result['Gflops']*0.1, '.1f') + } + + if commit_stats != 'null': + # search stats data for this commit and given input parameters + search_param = { + "query": { + "bool": { + "must": [ + { "match": { "Commit_sha_chameleon": commit_stats }}, + { "match": { "Hostname" : result['Hostname'] }}, + { "match": { "MPIvendor" : result['MPIvendor'] }}, + { "match": { "Algorithm" : result['Algorithm'] }}, + { "match": { "Precision" : result['Precision'] }}, + { "match": { "Nmpi" : result['Nmpi'] }}, + { "match": { "P" : result['P'] }}, + { "match": { "Q" : result['Q'] }}, + { "match": { "Nthread" : result['Nthread'] }}, + { "match": { "Ngpu" : result['Ngpu'] }}, + { "match": { "M" : result['M'] }}, + { "match": { "N" : result['N'] }}, + { "match": { "K" : result['K'] }} + ] + } + }, + "size": 1, + 
"_source": ["mean", "stdev"], + } + response = es.search(index=es_index, body=search_param) + elastic_docs2 = response["hits"]["hits"] + + if len(elastic_docs2) > 0: + last_stats_data = elastic_docs2[0]["_source"] + #print("last_stats_data ", last_stats_data) + + # compute formula from https://public.kitware.com/Wiki/CDash:Design#Test_Timing + alpha = 0.3 + multiplier = 3 + + previousMean = float(last_stats_data['mean']) + previousSD = float(last_stats_data['stdev']) + #print("previousMean ", previousMean) + + currentV = result['Gflops'] + # just to test: apply a perturbation + #pert = random.uniform(-previousSD, previousSD) + #currentV = result['Gflops'] + pert + #print("currentV ", currentV) + + newMean = (1-alpha)*previousMean + alpha*currentV + newSD = math.sqrt((1-alpha)*previousSD*previousSD + alpha*(currentV-newMean)*(currentV-newMean)) + + # prepare stats data to put in database newMean and newSD + result_stats = { + "Commit_date_chameleon": result['Commit_date_chameleon'], + "Commit_sha_chameleon": result['Commit_sha_chameleon'], + "Commit_sha_guix": result['Commit_sha_guix'], + "Commit_sha_guix_hpc": result['Commit_sha_guix_hpc'], + "Commit_sha_guix_hpcnonfree": result['Commit_sha_guix_hpcnonfree'], + "Hostname": result['Hostname'], + "MPIvendor": result['MPIvendor'], + "Algorithm": result['Algorithm'], + "Precision": result['Precision'], + "Nmpi": result['Nmpi'], + "P": result['P'], + "Q": result['Q'], + "Nthread": result['Nthread'], + "Ngpu": result['Ngpu'], + "M": result['M'], + "N": result['N'], + "K": result['K'], + "mean": format(newMean, '.1f'), + "stdev": format(newSD, '.1f') + } + + # check for regression + thresholdSD = 0.1*previousMean + if previousSD < thresholdSD: + previousSD = thresholdSD + maxAcceptableDiff = multiplier*previousSD + diff = abs(currentV-previousMean) + if diff > maxAcceptableDiff: + print("Regression: inputs %(Hostname)s, %(MPIvendor)s, %(Algorithm)s, %(Precision)s, %(Nmpi)s, %(P)s, %(Q)s, %(Nthread)s, %(Ngpu)s, %(M)s, 
%(N)s, %(K)s " % result) + print("Regression: outputs Gflops={0}, previousMean={1}, diff={2}, maxAcceptableDiff={3}".format(currentV, previousMean, diff, maxAcceptableDiff)) + err = 1 + + return [result_stats, err] @click.command() @click.option("-d", "--directory", default=".", help="git working directory") @click.option("-e", "--elastic-url", default="http://localhost:9200", help="elasticsearch instance url") @click.option("-t", "--team", required=True, help="team name") @click.option("-p", "--project", required=True, help="project name") -@click.option("-h", "--host", required=True, help="host name") @click.option("-m", "--mpi", required=True, help="MPI vendor (openmpi, nmad)") @click.argument("csv-files", nargs=-1) def main( @@ -95,16 +209,31 @@ def main( elastic_url: str, team: str, project: str, - host: str, mpi: str, csv_files: str, ): """Add a result to an elasticsearch database.""" + + repo = Repo(directory, search_parent_directories=True) + commit_chameleon = repo.head.commit + + guix_commits = {"guix" : "", + "guix-hpc": "", + "guix-hpc-non-free" : ""} + # collect guix commits info + with open('guix.json') as f: + guix_describe = json.load(f) + for index_guix in guix_describe: + if index_guix["name"] in guix_commits.keys() : + guix_commits[ index_guix["name"] ] = index_guix["commit"] + + # measures in term of cputimes and gflops es = Elasticsearch(elastic_url) es_index = team + "-" + project + "_" + "perf" if not es.indices.exists(es_index): es.indices.create(es_index) + # call this if mapping must be changed (e.g. 
add a new field) mapping_input = { "properties": { "Commit_date_chameleon": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, @@ -127,35 +256,82 @@ def main( "Cputime": {"type": "float"}, "Gflops": {"type": "float"} } - } es.indices.put_mapping(index=es_index, body=mapping_input) - repo = Repo(directory, search_parent_directories=True) - commit_chameleon = repo.head.commit - - # collect guix commits info - with open('guix.json') as f: - guix_describe = json.load(f) - for index_guix in guix_describe: - if index_guix["name"] == "guix": - commit_guix = index_guix["commit"] - elif index_guix["name"] == "guix-hpc": - commit_guix_hpc = index_guix["commit"] - elif index_guix["name"] == "guix-hpc-non-free": - commit_guix_hpcnonfree = index_guix["commit"] - + # load data from csv file requests = [ request for file in csv_files for request in map( - lambda row: format_entry(row, mpi, commit_chameleon, commit_guix, commit_guix_hpc, commit_guix_hpcnonfree), + lambda row: format_entry(row, mpi, commit_chameleon, guix_commits), open_csv(file) ) ] + + # insert measures in database for request in requests: es.index(index=es_index.lower(), body=request) + # compute stats: mean and stdev of gflops measured + # database for stats + es_index_stats = team + "-" + project + "_" + "stats" + if not es.indices.exists(es_index_stats): + es.indices.create(es_index_stats) + + # call this if mapping must be changed (e.g. 
add a new field) + mapping_input_stats = { + "properties": { + "Commit_date_chameleon": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, + "Commit_sha_chameleon": {"type": "keyword"}, + "Commit_sha_guix": {"type": "keyword"}, + "Commit_sha_guix_hpc": {"type": "keyword"}, + "Commit_sha_guix_hpcnonfree": {"type": "keyword"}, + "Hostname": {"type": "keyword"}, + "MPIvendor": {"type": "keyword"}, + "Algorithm": {"type": "keyword"}, + "Precision": {"type": "keyword"}, + "Nmpi": {"type": "integer"}, + "P": {"type": "integer"}, + "Q": {"type": "integer"}, + "Nthread": {"type": "integer"}, + "Ngpu": {"type": "integer"}, + "M": {"type": "integer"}, + "N": {"type": "integer"}, + "K": {"type": "integer"}, + "mean": {"type": "float"}, + "stdev": {"type": "float"} + } + } + es.indices.put_mapping(index=es_index_stats, body=mapping_input_stats) + + # search last commit of database stats + search_param = { + "sort": [{"Commit_date_chameleon": {"order": "desc"}}], + "size": 1, + "_source": ["Commit_sha_chameleon"], + } + response = es.search(index=es_index_stats, body=search_param) + elastic_docs = response["hits"]["hits"] + last_stats_commit = 'null' + if len(elastic_docs) > 0: + # search last records commit + last_stats_commit = elastic_docs[0]["_source"]['Commit_sha_chameleon'] + print("Regression: mean and stdev taken from commit ", last_stats_commit) + + for file in csv_files: + csvfile = open_csv(file) + err = 0 + for row in csvfile: + #print("row ", row) + [entry, err2] = format_entry_stats(row, mpi, commit_chameleon, last_stats_commit, + guix_commits, es, es_index_stats) + err = max(err, err2) + # insert updated mean and stdev in database + #print("entry ", entry) + es.index(index=es_index_stats.lower(), body=entry) + if err == 1: + sys.exit(1) if __name__ == "__main__": main() diff --git a/tools/bench/plafrim/chameleon-test.xml b/tools/bench/plafrim/chameleon-test.xml new file mode 100644 index 000000000..ef6c113ae --- /dev/null +++ 
b/tools/bench/plafrim/chameleon-test.xml @@ -0,0 +1,56 @@ +<?xml version="1.0" encoding="UTF-8"?> +<jube> + <benchmark name="plafrim" outpath="results"> + <comment>benchmark chameleon on plafrim</comment> + + <!-- Operation --> + <step name="run_gemm" tag="gemm"> + <use from= "parameters-test.xml">param_gemm</use> + <do>$command</do> + </step> + <step name="run_potrf" tag="potrf"> + <use from= "parameters-test.xml">param_potrf</use> + <do>$command</do> + </step> + <step name="run_geqrf_hqr" tag="geqrf"> + <use from= "parameters-test.xml">param_geqrf</use> + <do>$command</do> + </step> + + <!-- Analyse --> + <analyser name="analyse"> + <!-- use a pattern set --> + <use from="../jube/patterns.xml">chameleon</use> + <analyse step="run_gemm" tag="gemm"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + <analyse step="run_potrf" tag="potrf"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + <analyse step="run_geqrf_hqr" tag="geqrf"> + <file>stdout</file> <!-- file which should be scanned --> + </analyse> + </analyser> + + <!-- Create result table --> + <result> + <use>analyse</use> <!-- use existing analyser --> + <!--<table name="result" style="csv" sort="number">--> + <table name="result" style="csv"> + <column>hostname</column> + <column>algorithm</column> + <column>precision</column> + <column>nmpi</column> + <column>p</column> + <column>q</column> + <column>nthr</column> + <column>ngpu</column> + <column>m</column> + <column>n</column> + <column>k</column> + <column>cputime</column> + <column>gflops</column> + </table> + </result> + </benchmark> +</jube> diff --git a/tools/bench/plafrim/parameters/bora/parameters-test.xml b/tools/bench/plafrim/parameters/bora/parameters-test.xml new file mode 100644 index 000000000..1ea11e008 --- /dev/null +++ b/tools/bench/plafrim/parameters/bora/parameters-test.xml @@ -0,0 +1,54 @@ +<?xml version="1.0" encoding="UTF-8"?> +<jube> + <parameterset name="param_gemm"> + <parameter 
name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">gemm</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1</parameter> + <parameter name="p" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 1][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >280</parameter> + <parameter name="i_mn" type="int" >0, 1, 2</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*2*${b}, ${nmpi}*4*${b}][$i_mn]</parameter> + <parameter name="k" mode="python" type="int" >${m}</parameter> + <parameter name="n" mode="python" type="int" >${m}</parameter> + <parameter name="command" type="string">mpiexec -np $nmpi $MPI_OPTIONS $CHAMELEON_BUILD/testing/chameleon_${precision}testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</parameter> + </parameterset> + <parameterset name="param_potrf"> + <parameter name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">potrf</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1</parameter> + <parameter name="p" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 1][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >280</parameter> + <parameter name="i_mn" type="int" >0, 1, 2</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*2*${b}, ${nmpi}*4*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" 
>${m}</parameter> + <parameter name="k" type="int" >1</parameter> + <parameter name="command" type="string">mpiexec -np $nmpi $MPI_OPTIONS $CHAMELEON_BUILD/testing/chameleon_${precision}testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</parameter> + </parameterset> + <parameterset name="param_geqrf"> + <parameter name="hostname" type="string">bora</parameter> + <parameter name="algorithm" type="string">geqrf_hqr</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0, 1</parameter> + <parameter name="p" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 1][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="nthr" type="int" >34</parameter> + <parameter name="ngpu" type="int" >0</parameter> + <parameter name="b" type="int" >280</parameter> + <parameter name="i_mn" type="int" >0, 1, 2</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*2*${b}, ${nmpi}*4*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >${m}</parameter> + <parameter name="k" type="int" >1</parameter> + <parameter name="command" type="string">mpiexec -np $nmpi $MPI_OPTIONS $CHAMELEON_BUILD/testing/chameleon_${precision}testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</parameter> + </parameterset> +</jube> diff --git a/tools/bench/plafrim/parameters/sirocco/parameters-test.xml b/tools/bench/plafrim/parameters/sirocco/parameters-test.xml new file mode 100644 index 000000000..a998ec66b --- /dev/null +++ b/tools/bench/plafrim/parameters/sirocco/parameters-test.xml @@ -0,0 +1,54 @@ +<?xml version="1.0" encoding="UTF-8"?> +<jube> + <parameterset name="param_gemm"> + <parameter name="hostname" type="string">sirocco</parameter> + <parameter name="algorithm" type="string">gemm</parameter> + <parameter name="precision" type="string">s, 
d</parameter> + <parameter name="i_pq" type="int" >0</parameter> + <parameter name="p" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 1][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="nthr" type="int" >29</parameter> + <parameter name="ngpu" type="int" >2</parameter> + <parameter name="b" type="int" >1600</parameter> + <parameter name="i_mn" type="int" >0, 1, 2</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*2*${b}, ${nmpi}*4*${b}][$i_mn]</parameter> + <parameter name="k" mode="python" type="int" >${m}</parameter> + <parameter name="n" mode="python" type="int" >${m}</parameter> + <parameter name="command" type="string">mpiexec -np $nmpi $MPI_OPTIONS $CHAMELEON_BUILD/testing/chameleon_${precision}testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</parameter> + </parameterset> + <parameterset name="param_potrf"> + <parameter name="hostname" type="string">sirocco</parameter> + <parameter name="algorithm" type="string">potrf</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0</parameter> + <parameter name="p" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 1][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="nthr" type="int" >29</parameter> + <parameter name="ngpu" type="int" >2</parameter> + <parameter name="b" type="int" >1600</parameter> + <parameter name="i_mn" type="int" >0, 1, 2</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*2*${b}, ${nmpi}*4*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >${m}</parameter> + <parameter name="k" type="int" >1</parameter> + <parameter name="command" type="string">mpiexec -np $nmpi $MPI_OPTIONS 
$CHAMELEON_BUILD/testing/chameleon_${precision}testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</parameter> + </parameterset> + <parameterset name="param_geqrf"> + <parameter name="hostname" type="string">sirocco</parameter> + <parameter name="algorithm" type="string">geqrf_hqr</parameter> + <parameter name="precision" type="string">s, d</parameter> + <parameter name="i_pq" type="int" >0</parameter> + <parameter name="p" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="q" mode="python" type="int" >[1, 1][$i_pq]</parameter> + <parameter name="nmpi" mode="python" type="int" >[1, 2][$i_pq]</parameter> + <parameter name="nthr" type="int" >29</parameter> + <parameter name="ngpu" type="int" >2</parameter> + <parameter name="b" type="int" >1600</parameter> + <parameter name="i_mn" type="int" >0, 1, 2</parameter> + <parameter name="m" mode="python" type="int" >[${nmpi}*${b}, ${nmpi}*2*${b}, ${nmpi}*4*${b}][$i_mn]</parameter> + <parameter name="n" mode="python" type="int" >${m}</parameter> + <parameter name="k" type="int" >1</parameter> + <parameter name="command" type="string">mpiexec -np $nmpi $MPI_OPTIONS $CHAMELEON_BUILD/testing/chameleon_${precision}testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b</parameter> + </parameterset> +</jube> diff --git a/tools/bench/plafrim/run.sh b/tools/bench/plafrim/run.sh index 335538b61..7efccc647 100755 --- a/tools/bench/plafrim/run.sh +++ b/tools/bench/plafrim/run.sh @@ -9,8 +9,6 @@ # @author Florent Pruvost # @date 2022-02-22 # -echo "######################### Chameleon benchmarks #########################" - set -x # Unset the binding environment of the CI for this specific case @@ -21,57 +19,54 @@ unset STARPU_WORKERS_NOBIND export XDG_CACHE_HOME=/tmp/guix-$$ # save guix commits +#guix describe --format=json > guix.json guix time-machine -C ./tools/bench/guix-channels.scm -- describe --format=json > guix.json # define env var depending on the node type -if [ $NODE 
= "bora" ] -then +if [[ "$NODE" == "bora" ]]; then export SLURM_CONSTRAINTS="bora,omnipath" export CHAMELEON_BUILD_OPTIONS="-DCHAMELEON_USE_MPI=ON -DCMAKE_BUILD_TYPE=Release" export STARPU_HOSTNAME="bora" -elif [ $NODE = "sirocco" ] -then - export SLURM_CONSTRAINTS="sirocco,omnipath,v100" +elif [[ "$NODE" == "sirocco" ]]; then + export SLURM_CONSTRAINTS="sirocco,v100" export CHAMELEON_BUILD_OPTIONS="-DCHAMELEON_USE_MPI=ON -DCHAMELEON_USE_CUDA=ON -DCMAKE_BUILD_TYPE=Release" export STARPU_HOSTNAME="sirocco" export LD_PRELOAD="/usr/lib64/libcuda.so" else - echo "$0: Please set the NODE environnement variable to bora or sirocco." + echo "$0: Please set the NODE environment variable to bora or sirocco." exit -1 fi # define env var and guix rule to use depending on the mpi vendor GUIX_ENV="chameleon" -if [ $NODE = "sirocco" ] -then - GUIX_ENV="chameleon-cuda" -fi export MPI_OPTIONS="" -if [ $MPI = "openmpi" ] -then +if [[ "$MPI" == "openmpi" ]]; then export MPI_OPTIONS="--bind-to board" GUIX_ENV_MPI="" - GUIX_ADHOC_MPI="openssh openmpi" -elif [ $MPI = "nmad" ] -then + GUIX_ADHOC_MPI="" + if [[ "$NODE" == "sirocco" ]]; then + GUIX_ENV="chameleon-cuda" + fi +elif [[ "$MPI" == "nmad" ]]; then export MPI_OPTIONS="-DPIOM_DEDICATED=1 -DPIOM_DEDICATED_WAIT=1 hwloc-bind --cpubind machine:0" GUIX_ENV_MPI="--with-input=openmpi=nmad" - GUIX_ADHOC_MPI="which gzip zlib tar inetutils util-linux procps openssh nmad" + GUIX_ADHOC_MPI="which gzip zlib tar inetutils util-linux procps nmad" else echo "$0: Please set the MPI environnement variable to openmpi or nmad." 
exit -1 fi -GUIX_ADHOC="coreutils gawk grep hwloc jube perl python python-click python-certifi python-elasticsearch python-gitpython python-matplotlib python-pandas python-seaborn r-ggplot2 r-plyr r-reshape2 sed slurm mkl@2019" +GUIX_ADHOC="coreutils gawk grep hwloc jube nss-certs openssh perl python python-click python-certifi python-elasticsearch python-gitpython python-matplotlib python-pandas python-seaborn r-ggplot2 r-plyr r-reshape2 sed slurm mkl@2019" GUIX_RULE="-D $GUIX_ENV $GUIX_ENV_MPI $GUIX_ADHOC $GUIX_ADHOC_MPI" # Submit jobs -# OpenMPI version +#exec guix shell --pure \ exec guix time-machine -C ./tools/bench/guix-channels.scm -- shell --pure \ --preserve=PLATFORM \ --preserve=NODE \ --preserve=LD_PRELOAD \ --preserve=^CI \ + --preserve=proxy$ \ --preserve=^SLURM \ --preserve=^JUBE \ --preserve=^MPI \ @@ -79,8 +74,10 @@ exec guix time-machine -C ./tools/bench/guix-channels.scm -- shell --pure \ --preserve=^CHAMELEON \ $GUIX_RULE \ -- /bin/bash --norc ./tools/bench/plafrim/slurm.sh - -echo "####################### End Chameleon benchmarks #######################" +err=$? 
# clean tmp rm -rf /tmp/guix-$$ + +# exit with error code from the guix command +exit $err diff --git a/tools/bench/plafrim/slurm.sh b/tools/bench/plafrim/slurm.sh index f45e06d8b..4c153a665 100755 --- a/tools/bench/plafrim/slurm.sh +++ b/tools/bench/plafrim/slurm.sh @@ -9,7 +9,6 @@ # @author Florent Pruvost # @date 2022-02-22 # -echo "######################### Chameleon benchmarks #########################" # Check the environment echo $PLATFORM @@ -23,27 +22,6 @@ env |grep ^CHAMELEON set -x -function wait_completion { - # Wait for completion of jobs - echo "JOB_LIST $JOB_LIST" - while [ "$NJOB" -gt 0 ] - do - for JOB in $JOB_LIST - do - IS_JOB_IN_QUEUE=`squeue |grep "$JOB"` - if [[ -z "$IS_JOB_IN_QUEUE" ]] - then - NJOB=$[NJOB-1] - JOB_LIST=`echo $JOB_LIST | sed "s#$JOB##"` - echo "JOB $JOB finished" - else - echo "$IS_JOB_IN_QUEUE" - fi - done - sleep 30 - done -} - # Parameters of the Slurm jobs TIME=02:00:00 PART=routage @@ -52,17 +30,12 @@ CONS=$SLURM_CONSTRAINTS EXCL= # Submit jobs -NJOB=0 -JOB_ID=`JOB_NAME=chameleon\-$NODE\-$MPI\-$NP && sbatch --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL --exclusive --ntasks-per-node=1 --threads-per-core=1 $CI_PROJECT_DIR/tools/bench/chameleon_guix.sh | sed "s#Submitted batch job ##"` -if [[ -n "$JOB_ID" ]] -then - JOB_LIST="$JOB_LIST $JOB_ID" - NJOB=$[NJOB+1] -fi - -# Wait for completion of jobs -wait_completion +export JOB_NAME=chameleon\-$NODE\-$MPI\-$NP +sbatch --wait --job-name="$JOB_NAME" --output="$JOB_NAME.out" --error="$JOB_NAME.err" --nodes=$NP --time=$TIME --partition=$PART --constraint=$CONS --exclude=$EXCL --exclusive --ntasks-per-node=1 --threads-per-core=1 ./tools/bench/chameleon_guix.sh +err=$? 
-echo "####################### End Chameleon benchmarks #######################" +cat chameleon\-$NODE\-$MPI\-$NP.err +cat chameleon\-$NODE\-$MPI\-$NP.out -exit 0 +# exit with error code from the sbatch command +exit $err -- GitLab