From 61e32c9eeb1937797e52c091efaea5e0dd5c2917 Mon Sep 17 00:00:00 2001 From: Nathan Paulin <nathan.paulin@inria.fr> Date: Wed, 26 Mar 2025 13:45:33 +0100 Subject: [PATCH] [sophia][esterel3] Add kwollect metrics --- .../sophia/clusters/esterel3/esterel3.json | 27 +++++++++++++++++++ .../clusters/esterel3/esterel3_metrics.yaml | 24 +++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/data/grid5000/sites/sophia/clusters/esterel3/esterel3.json b/data/grid5000/sites/sophia/clusters/esterel3/esterel3.json index 147ee11b544..d71d423203b 100644 --- a/data/grid5000/sites/sophia/clusters/esterel3/esterel3.json +++ b/data/grid5000/sites/sophia/clusters/esterel3/esterel3.json @@ -49,6 +49,33 @@ "port": 9100, "protocol": "prometheus" } + }, + { + "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "DCGM_FI_DEV_SM_CLOCK", + "DCGM_FI_DEV_MEM_CLOCK", + "DCGM_FI_DEV_GPU_TEMP", + "DCGM_FI_DEV_POWER_USAGE", + "DCGM_FI_DEV_MEM_COPY_UTIL" + ], + "port": 9400, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9400, + "protocol": "prometheus" + } } ], "model": "Dell PowerEdge T630", diff --git a/input/grid5000/sites/sophia/clusters/esterel3/esterel3_metrics.yaml b/input/grid5000/sites/sophia/clusters/esterel3/esterel3_metrics.yaml index 729e9beb582..bbcc255a649 100644 --- a/input/grid5000/sites/sophia/clusters/esterel3/esterel3_metrics.yaml +++ b/input/grid5000/sites/sophia/clusters/esterel3/esterel3_metrics.yaml @@ -1,5 +1,6 @@ --- metrics: + - name: prom_default_metrics description: Default subset of metrics from Prometheus Node Exporter period: 0 @@ -31,6 +32,7 @@ metrics: - node_procs_blocked - node_procs_running - kwollect_custom + - name: prom_all_metrics description: All metrics from Prometheus Node Exporter period: 0 @@ -38,3 +40,25 @@ metrics: source: protocol: prometheus port: 9100 + + - name: prom_nvgpu_default_metrics + description: Default subset of metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 + id: + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_MEM_CLOCK + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_MEM_COPY_UTIL + + - name: prom_nvgpu_all_metrics + description: All metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 \ No newline at end of file -- GitLab