From e0b40e51f80eaea7ebe0e108ead88c2a78842978 Mon Sep 17 00:00:00 2001
From: Julien Lelaurain <julien.lelaurain@inria.fr>
Date: Tue, 1 Apr 2025 09:18:43 +0200
Subject: [PATCH] [sophia][esterel32] Add kwollect metrics

---
 .../sophia/clusters/esterel32/esterel32.json  | 27 +++++++++++++++++++
 .../clusters/esterel32/esterel32_metrics.yaml | 20 ++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/data/grid5000/sites/sophia/clusters/esterel32/esterel32.json b/data/grid5000/sites/sophia/clusters/esterel32/esterel32.json
index a30d4954ac5..04b3bec8be7 100644
--- a/data/grid5000/sites/sophia/clusters/esterel32/esterel32.json
+++ b/data/grid5000/sites/sophia/clusters/esterel32/esterel32.json
@@ -49,6 +49,33 @@
         "port": 9100,
         "protocol": "prometheus"
       }
+    },
+    {
+      "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter",
+      "name": "prom_nvgpu_default_metrics",
+      "optional_period": 15000,
+      "period": 0,
+      "source": {
+        "id": [
+          "DCGM_FI_DEV_SM_CLOCK",
+          "DCGM_FI_DEV_MEM_CLOCK",
+          "DCGM_FI_DEV_GPU_TEMP",
+          "DCGM_FI_DEV_POWER_USAGE",
+          "DCGM_FI_DEV_MEM_COPY_UTIL"
+        ],
+        "port": 9400,
+        "protocol": "prometheus"
+      }
+    },
+    {
+      "description": "All metrics from Prometheus Nvidia DCGM Exporter",
+      "name": "prom_nvgpu_all_metrics",
+      "optional_period": 15000,
+      "period": 0,
+      "source": {
+        "port": 9400,
+        "protocol": "prometheus"
+      }
     }
   ],
   "model": "Dell PowerEdge T640",
diff --git a/input/grid5000/sites/sophia/clusters/esterel32/esterel32_metrics.yaml b/input/grid5000/sites/sophia/clusters/esterel32/esterel32_metrics.yaml
index 729e9beb582..e2714b591b6 100644
--- a/input/grid5000/sites/sophia/clusters/esterel32/esterel32_metrics.yaml
+++ b/input/grid5000/sites/sophia/clusters/esterel32/esterel32_metrics.yaml
@@ -38,3 +38,23 @@ metrics:
     source:
       protocol: prometheus
       port: 9100
+  - name: prom_nvgpu_default_metrics
+    description: Default subset of metrics from Prometheus Nvidia DCGM Exporter
+    period: 0
+    optional_period: 15000
+    source:
+      protocol: prometheus
+      port: 9400
+      id:
+        - DCGM_FI_DEV_SM_CLOCK
+        - DCGM_FI_DEV_MEM_CLOCK
+        - DCGM_FI_DEV_GPU_TEMP
+        - DCGM_FI_DEV_POWER_USAGE
+        - DCGM_FI_DEV_MEM_COPY_UTIL
+  - name: prom_nvgpu_all_metrics
+    description: All metrics from Prometheus Nvidia DCGM Exporter
+    period: 0
+    optional_period: 15000
+    source:
+      protocol: prometheus
+      port: 9400
-- 
GitLab