diff --git a/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json b/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json index 733f18697a2e565b59dcb53b26b20d9c73da9774..552a9c5d69971f07efac14e00077bb9d35a260c8 100644 --- a/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json +++ b/data/grid5000/sites/sophia/clusters/esterel26/esterel26.json @@ -49,12 +49,39 @@ "port": 9100, "protocol": "prometheus" } + }, + { + "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "DCGM_FI_DEV_SM_CLOCK", + "DCGM_FI_DEV_MEM_CLOCK", + "DCGM_FI_DEV_GPU_TEMP", + "DCGM_FI_DEV_POWER_USAGE", + "DCGM_FI_DEV_MEM_COPY_UTIL" + ], + "port": 9400, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9400, + "protocol": "prometheus" + } } ], "model": "Dell PowerEdge T640", "nodes_count": 1, - "nodes_description": "2 CPUs Intel Xeon Silver 4216, 16 cores/CPU, 384GB RAM, 558GB HDD, 3575GB HDD, 1 x 1Gb Ethernet, 1 x 40Gb InfiniBand", - "priority": 202010, + "nodes_description": "2 CPUs Intel Xeon Silver 4216, 16 cores/CPU, 4 GPUs Quadro RTX 8000, 384GB RAM, 558GB HDD, 3575GB HDD, 1 x 1Gb Ethernet, 1 x 40Gb InfiniBand", + "priority": 202110, "queues": [ "admin", "testing" diff --git a/data/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.json b/data/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.json index fd4f22045deeee98b5caa64ac81bce59b547daa1..3bf570c4722864c1d8c7c4fa2bc57e886aa7b86b 100644 --- a/data/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.json +++ b/data/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.json @@ -20,6 +20,76 @@ "warranty_end": "2027-10-31" }, "exotic": false, + "gpu_devices": { + "nvidia0": { + "compute_capability": "7.5", + "cores": 4608, + "cpu_affinity": 0, + "device": "/dev/nvidia0", + "memory": 48318382080, + "microarchitecture": "Turing", + "model": "Quadro RTX 8000", + "performance": { + "fp-16": 32620000000000, + "fp-32": 16310000000000, + "fp-64": 510000000000 + }, + "power_default_limit": "250.00 W", + "vbios_version": "90.02.4E.00.03", + "vendor": "Nvidia" + }, + "nvidia1": { + "compute_capability": "7.5", + "cores": 4608, + "cpu_affinity": 0, + "device": "/dev/nvidia1", + "memory": 48318382080, + "microarchitecture": "Turing", + "model": "Quadro RTX 8000", + "performance": { + "fp-16": 32620000000000, + "fp-32": 16310000000000, + "fp-64": 510000000000 + }, + "power_default_limit": "250.00 W", + "vbios_version": "90.02.4E.00.03", + "vendor": "Nvidia" + }, + "nvidia2": { + "compute_capability": "7.5", + "cores": 4608, + "cpu_affinity": 1, + "device": "/dev/nvidia2", + "memory": 48318382080, + "microarchitecture": "Turing", + "model": "Quadro RTX 8000", + "performance": { + "fp-16": 32620000000000, + "fp-32": 16310000000000, + "fp-64": 510000000000 + }, + "power_default_limit": "250.00 W", + "vbios_version": "90.02.4E.00.03", + "vendor": "Nvidia" + }, + "nvidia3": { + "compute_capability": "7.5", + "cores": 4608, + "cpu_affinity": 1, + "device": "/dev/nvidia3", + "memory": 48318382080, + "microarchitecture": "Turing", + "model": "Quadro RTX 8000", + "performance": { + "fp-16": 32620000000000, + "fp-32": 16310000000000, + "fp-64": 510000000000 + }, + "power_default_limit": "250.00 W", + "vbios_version": "90.02.4E.00.03", + "vendor": "Nvidia" + } + }, "main_memory": { "ram_size": 412316860416 }, diff --git a/input/grid5000/sites/sophia/clusters/esterel26/esterel26_metrics.yaml b/input/grid5000/sites/sophia/clusters/esterel26/esterel26_metrics.yaml index 729e9beb582834908e55a7537bd5ace8abbd1f3d..120c97fe0968e891e06d06aa334841de965daffa 100644 --- a/input/grid5000/sites/sophia/clusters/esterel26/esterel26_metrics.yaml +++ b/input/grid5000/sites/sophia/clusters/esterel26/esterel26_metrics.yaml @@ -38,3 +38,24 @@ metrics: source: protocol: prometheus port: 9100 + + - name: prom_nvgpu_default_metrics + description: Default subset of metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 + id: + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_MEM_CLOCK + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_MEM_COPY_UTIL + - name: prom_nvgpu_all_metrics + description: All metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 diff --git a/input/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.yaml b/input/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.yaml index 0d7d8daefabffc9cd01df2fee32a9765fc31f522..c34ba13547a92c6aacc6595e7aa480648ea1d183 100644 --- a/input/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.yaml +++ b/input/grid5000/sites/sophia/clusters/esterel26/nodes/esterel26-1.yaml @@ -16,6 +16,39 @@ esterel26-1: manufacturer: Dell Inc. name: PowerEdge T640 serial: 61GCH73 + gpu_devices: + nvidia0: + cpu_affinity: 0 + device: "/dev/nvidia0" + memory: 48318382080 + model: Quadro RTX 8000 + power_default_limit: 250.00 W + vbios_version: 90.02.4E.00.03 + vendor: Nvidia + nvidia1: + cpu_affinity: 0 + device: "/dev/nvidia1" + memory: 48318382080 + model: Quadro RTX 8000 + power_default_limit: 250.00 W + vbios_version: 90.02.4E.00.03 + vendor: Nvidia + nvidia2: + cpu_affinity: 1 + device: "/dev/nvidia2" + memory: 48318382080 + model: Quadro RTX 8000 + power_default_limit: 250.00 W + vbios_version: 90.02.4E.00.03 + vendor: Nvidia + nvidia3: + cpu_affinity: 1 + device: "/dev/nvidia3" + memory: 48318382080 + model: Quadro RTX 8000 + power_default_limit: 250.00 W + vbios_version: 90.02.4E.00.03 + vendor: Nvidia main_memory: ram_size: 412316860416 memory_devices: