From 61e32c9eeb1937797e52c091efaea5e0dd5c2917 Mon Sep 17 00:00:00 2001
From: Nathan Paulin <nathan.paulin@inria.fr>
Date: Wed, 26 Mar 2025 13:45:33 +0100
Subject: [PATCH] [sophia][esterel3] Add kwollect metrics

---
 .../sophia/clusters/esterel3/esterel3.json    | 27 +++++++++++++++++++
 .../clusters/esterel3/esterel3_metrics.yaml   | 24 +++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/data/grid5000/sites/sophia/clusters/esterel3/esterel3.json b/data/grid5000/sites/sophia/clusters/esterel3/esterel3.json
index 147ee11b544..d71d423203b 100644
--- a/data/grid5000/sites/sophia/clusters/esterel3/esterel3.json
+++ b/data/grid5000/sites/sophia/clusters/esterel3/esterel3.json
@@ -49,6 +49,33 @@
         "port": 9100,
         "protocol": "prometheus"
       }
+    },
+    {
+      "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter",
+      "name": "prom_nvgpu_default_metrics",
+      "optional_period": 15000,
+      "period": 0,
+      "source": {
+        "id": [
+          "DCGM_FI_DEV_SM_CLOCK",
+          "DCGM_FI_DEV_MEM_CLOCK",
+          "DCGM_FI_DEV_GPU_TEMP",
+          "DCGM_FI_DEV_POWER_USAGE",
+          "DCGM_FI_DEV_MEM_COPY_UTIL"
+        ],
+        "port": 9400,
+        "protocol": "prometheus"
+      }
+    },
+    {
+      "description": "All metrics from Prometheus Nvidia DCGM Exporter",
+      "name": "prom_nvgpu_all_metrics",
+      "optional_period": 15000,
+      "period": 0,
+      "source": {
+        "port": 9400,
+        "protocol": "prometheus"
+      }
     }
   ],
   "model": "Dell PowerEdge T630",
diff --git a/input/grid5000/sites/sophia/clusters/esterel3/esterel3_metrics.yaml b/input/grid5000/sites/sophia/clusters/esterel3/esterel3_metrics.yaml
index 729e9beb582..bbcc255a649 100644
--- a/input/grid5000/sites/sophia/clusters/esterel3/esterel3_metrics.yaml
+++ b/input/grid5000/sites/sophia/clusters/esterel3/esterel3_metrics.yaml
@@ -1,5 +1,6 @@
 ---
 metrics:
+
   - name: prom_default_metrics
     description: Default subset of metrics from Prometheus Node Exporter
     period: 0
@@ -31,6 +32,7 @@ metrics:
         - node_procs_blocked
         - node_procs_running
         - kwollect_custom
+
   - name: prom_all_metrics
     description: All metrics from Prometheus Node Exporter
     period: 0
@@ -38,3 +40,25 @@ metrics:
     source:
       protocol: prometheus
       port: 9100
+
+  - name: prom_nvgpu_default_metrics
+    description: Default subset of metrics from Prometheus Nvidia DCGM Exporter
+    period: 0
+    optional_period: 15000
+    source:
+      protocol: prometheus
+      port: 9400
+      id:
+        - DCGM_FI_DEV_SM_CLOCK
+        - DCGM_FI_DEV_MEM_CLOCK
+        - DCGM_FI_DEV_GPU_TEMP
+        - DCGM_FI_DEV_POWER_USAGE
+        - DCGM_FI_DEV_MEM_COPY_UTIL
+
+  - name: prom_nvgpu_all_metrics
+    description: All metrics from Prometheus Nvidia DCGM Exporter
+    period: 0
+    optional_period: 15000
+    source:
+      protocol: prometheus
+      port: 9400
\ No newline at end of file
-- 
GitLab