From fae9cc7dd1f6ed07d2c52bd0df3bd958a539da52 Mon Sep 17 00:00:00 2001
From: Hugo Dominois <hugo.dominois@inria.fr>
Date: Tue, 25 Mar 2025 15:42:48 +0100
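Subject: [PATCH 1/2] [sophia][esterel26] Add GPU device

Declare the four Quadro RTX 8000 GPUs (nvidia0 to nvidia3) of esterel26-1
in the node input YAML and add the matching gpu_devices entries to the
node JSON.

The cluster JSON is updated accordingly: nodes_description now mentions
the GPUs, and priority changes from 202010 to 202110.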
---
 .../sophia/clusters/esterel26/esterel26.json  |  4 +-
 .../clusters/esterel26/nodes/esterel26-1.json | 70 +++++++++++++++++++
 .../clusters/esterel26/nodes/esterel26-1.yaml | 33 +++++++++
 3 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json b/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json
index 733f18697a2..13491d7a144 100644
--- a/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json
+++ b/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json
@@ -53,8 +53,8 @@
   ],
   "model": "Dell PowerEdge T640",
   "nodes_count": 1,
-  "nodes_description": "2 CPUs Intel Xeon Silver 4216, 16 cores/CPU, 384GB RAM, 558GB HDD, 3575GB HDD, 1 x 1Gb Ethernet, 1 x 40Gb InfiniBand",
-  "priority": 202010,
+  "nodes_description": "2 CPUs Intel Xeon Silver 4216, 16 cores/CPU, 4 GPUs Quadro RTX 8000, 384GB RAM, 558GB HDD, 3575GB HDD, 1 x 1Gb Ethernet, 1 x 40Gb InfiniBand",
+  "priority": 202110,
   "queues": [
     "admin",
     "testing"
diff --git a/data/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.json b/data/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.json
index fd4f22045de..3bf570c4722 100644
--- a/data/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.json
+++ b/data/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.json
@@ -20,6 +20,76 @@
     "warranty_end": "2027-10-31"
   },
   "exotic": false,
+  "gpu_devices": {
+    "nvidia0": {
+      "compute_capability": "7.5",
+      "cores": 4608,
+      "cpu_affinity": 0,
+      "device": "/dev/nvidia0",
+      "memory": 48318382080,
+      "microarchitecture": "Turing",
+      "model": "Quadro RTX 8000",
+      "performance": {
+        "fp-16": 32620000000000,
+        "fp-32": 16310000000000,
+        "fp-64": 510000000000
+      },
+      "power_default_limit": "250.00 W",
+      "vbios_version": "90.02.4E.00.03",
+      "vendor": "Nvidia"
+    },
+    "nvidia1": {
+      "compute_capability": "7.5",
+      "cores": 4608,
+      "cpu_affinity": 0,
+      "device": "/dev/nvidia1",
+      "memory": 48318382080,
+      "microarchitecture": "Turing",
+      "model": "Quadro RTX 8000",
+      "performance": {
+        "fp-16": 32620000000000,
+        "fp-32": 16310000000000,
+        "fp-64": 510000000000
+      },
+      "power_default_limit": "250.00 W",
+      "vbios_version": "90.02.4E.00.03",
+      "vendor": "Nvidia"
+    },
+    "nvidia2": {
+      "compute_capability": "7.5",
+      "cores": 4608,
+      "cpu_affinity": 1,
+      "device": "/dev/nvidia2",
+      "memory": 48318382080,
+      "microarchitecture": "Turing",
+      "model": "Quadro RTX 8000",
+      "performance": {
+        "fp-16": 32620000000000,
+        "fp-32": 16310000000000,
+        "fp-64": 510000000000
+      },
+      "power_default_limit": "250.00 W",
+      "vbios_version": "90.02.4E.00.03",
+      "vendor": "Nvidia"
+    },
+    "nvidia3": {
+      "compute_capability": "7.5",
+      "cores": 4608,
+      "cpu_affinity": 1,
+      "device": "/dev/nvidia3",
+      "memory": 48318382080,
+      "microarchitecture": "Turing",
+      "model": "Quadro RTX 8000",
+      "performance": {
+        "fp-16": 32620000000000,
+        "fp-32": 16310000000000,
+        "fp-64": 510000000000
+      },
+      "power_default_limit": "250.00 W",
+      "vbios_version": "90.02.4E.00.03",
+      "vendor": "Nvidia"
+    }
+  },
   "main_memory": {
     "ram_size": 412316860416
   },
diff --git a/input/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.yaml b/input/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.yaml
index 0d7d8daefab..c34ba13547a 100644
--- a/input/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.yaml
+++ b/input/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.yaml
@@ -16,6 +16,39 @@ esterel26-1:
     manufacturer: Dell Inc.
     name: PowerEdge T640
     serial: 61GCH73
+  gpu_devices:
+    nvidia0:
+      cpu_affinity: 0
+      device: "/dev/nvidia0"
+      memory: 48318382080
+      model: Quadro RTX 8000
+      power_default_limit: 250.00 W
+      vbios_version: 90.02.4E.00.03
+      vendor: Nvidia
+    nvidia1:
+      cpu_affinity: 0
+      device: "/dev/nvidia1"
+      memory: 48318382080
+      model: Quadro RTX 8000
+      power_default_limit: 250.00 W
+      vbios_version: 90.02.4E.00.03
+      vendor: Nvidia
+    nvidia2:
+      cpu_affinity: 1
+      device: "/dev/nvidia2"
+      memory: 48318382080
+      model: Quadro RTX 8000
+      power_default_limit: 250.00 W
+      vbios_version: 90.02.4E.00.03
+      vendor: Nvidia
+    nvidia3:
+      cpu_affinity: 1
+      device: "/dev/nvidia3"
+      memory: 48318382080
+      model: Quadro RTX 8000
+      power_default_limit: 250.00 W
+      vbios_version: 90.02.4E.00.03
+      vendor: Nvidia
   main_memory:
     ram_size: 412316860416
   memory_devices:
-- 
GitLab


From 89cff71acddc3991aff550df0a456b8da45ea5f7 Mon Sep 17 00:00:00 2001
From: Hugo Dominois <hugo.dominois@inria.fr>
Date: Tue, 25 Mar 2025 15:44:01 +0100
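Subject: [PATCH 2/2] [sophia][esterel26] Add GPU metrics

Declare two metrics served by the Prometheus Nvidia DCGM Exporter on
port 9400, both with period 0 and optional_period 15000:

- prom_nvgpu_default_metrics: SM clock, memory clock, GPU temperature,
  power usage and memory copy utilization
- prom_nvgpu_all_metrics: all metrics from the exporter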
---
 .../sophia/clusters/esterel26/esterel26.json  | 27 +++++++++++++++++++
 .../clusters/esterel26/esterel26_metrics.yaml | 21 +++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json b/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json
index 13491d7a144..552a9c5d699 100644
--- a/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json
+++ b/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json
@@ -49,6 +49,33 @@
         "port": 9100,
         "protocol": "prometheus"
       }
+    },
+    {
+      "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter",
+      "name": "prom_nvgpu_default_metrics",
+      "optional_period": 15000,
+      "period": 0,
+      "source": {
+        "id": [
+          "DCGM_FI_DEV_SM_CLOCK",
+          "DCGM_FI_DEV_MEM_CLOCK",
+          "DCGM_FI_DEV_GPU_TEMP",
+          "DCGM_FI_DEV_POWER_USAGE",
+          "DCGM_FI_DEV_MEM_COPY_UTIL"
+        ],
+        "port": 9400,
+        "protocol": "prometheus"
+      }
+    },
+    {
+      "description": "All metrics from Prometheus Nvidia DCGM Exporter",
+      "name": "prom_nvgpu_all_metrics",
+      "optional_period": 15000,
+      "period": 0,
+      "source": {
+        "port": 9400,
+        "protocol": "prometheus"
+      }
     }
   ],
   "model": "Dell PowerEdge T640",
diff --git a/input/grid5000/sites/sophia/clusters/esterel26/esterel26_metrics.yaml b/input/grid5000/sites/sophia/clusters/esterel26/esterel26_metrics.yaml
index 729e9beb582..120c97fe096 100644
--- a/input/grid5000/sites/sophia/clusters/esterel26/esterel26_metrics.yaml
+++ b/input/grid5000/sites/sophia/clusters/esterel26/esterel26_metrics.yaml
@@ -38,3 +38,24 @@ metrics:
     source:
       protocol: prometheus
       port: 9100
+
+  - name: prom_nvgpu_default_metrics
+    description: Default subset of metrics from Prometheus Nvidia DCGM Exporter
+    period: 0
+    optional_period: 15000
+    source:
+      protocol: prometheus
+      port: 9400
+      id:
+        - DCGM_FI_DEV_SM_CLOCK
+        - DCGM_FI_DEV_MEM_CLOCK
+        - DCGM_FI_DEV_GPU_TEMP
+        - DCGM_FI_DEV_POWER_USAGE
+        - DCGM_FI_DEV_MEM_COPY_UTIL
+  - name: prom_nvgpu_all_metrics
+    description: All metrics from Prometheus Nvidia DCGM Exporter
+    period: 0
+    optional_period: 15000
+    source:
+      protocol: prometheus
+      port: 9400
-- 
GitLab