diff --git a/data/grid5000/sites/lyon/clusters/hydra/hydra.json b/data/grid5000/sites/lyon/clusters/hydra/hydra.json index fc534df7bfbdd2f9251eda96b56de8e836c53c66..21f842ce3422c0cd8471455fa9b5d1797208bb91 100644 --- a/data/grid5000/sites/lyon/clusters/hydra/hydra.json +++ b/data/grid5000/sites/lyon/clusters/hydra/hydra.json @@ -2,10 +2,223 @@ "boot_type": "uefi", "created_at": "Thu, 15 Feb 2024 00:00:00 GMT", "exotic": true, - "kavlan": false, + "kavlan": true, "manufactured_at": "2024-02-15", "metrics": [ - + { + "description": "Temperature of Inlet_0 reported by BMC, in celsius", + "labels": { + "id": "inlet_0" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 1, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of Inlet_1 reported by BMC, in celsius", + "labels": { + "id": "inlet_1" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 2, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of MB_0 reported by BMC, in celsius", + "labels": { + "id": "mb_0" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 3, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of MB_1 reported by BMC, in celsius", + "labels": { + "id": "mb_1" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 4, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of outlet reported by BMC, in celsius", + "labels": { + "id": "outlet" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 5, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of SCM reported by BMC, in celsius", + "labels": { + "id": "scm" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 6, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of Psu 0 reported by BMC, in celsius", + "labels": { + "psu": "0" + }, + "name": "bmc_psu_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 7, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of Psu 1 reported by BMC, in celsius", + "labels": { + "psu": "1" + }, + "name": "bmc_psu_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 8, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of P12V_STB reported by BMC, in volt", + "labels": { + "id": "p12v_stb" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 29, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of P5V_STB reported by BMC, in volt", + "labels": { + "id": "p5v_stb" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 30, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of P3V_BAT reported by BMC, in volt", + "labels": { + "id": "p3v_bat" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 34, + "protocol": "ipmisensor" + } + }, + { + "description": "Usage of BMC_CPU_Util reported by BMC, in percent", + "labels": { + "id": "bmc_cpu_util" + }, + "name": "bmc_other_usage_percent", + "optional_period": 5000, + "period": 0, + "source": { + "id": 78, + "protocol": "ipmisensor" + } + }, + { + "description": "Usage of BMC_MEM_Util reported by BMC, in percent", + "labels": { + "id": "bmc_mem_util" + }, + "name": "bmc_other_usage_percent", + "optional_period": 5000, + "period": 0, + "source": { + "id": 79, + "protocol": "ipmisensor" + } + }, + { + "description": "Default subset of metrics from Prometheus Node Exporter", + "name": "prom_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "node_boot_time_seconds", + "node_cpu_scaling_frequency_hertz", + "node_cpu_seconds_total", + "node_filesystem_free_bytes", + "node_filesystem_size_bytes", + "node_load1", + "node_load15", + "node_load5", + "node_memory_Buffers_bytes", + "node_memory_Cached_bytes", + "node_memory_MemAvailable_bytes", + "node_memory_MemFree_bytes", + "node_memory_MemTotal_bytes", + "node_memory_Shmem_bytes", + "node_memory_SwapFree_bytes", + "node_memory_SwapTotal_bytes", + "node_network_receive_bytes_total", + "node_network_receive_packets_total", + "node_network_transmit_bytes_total", + "node_network_transmit_packets_total", + "node_procs_blocked", + "node_procs_running", + "kwollect_custom" + ], + "port": 9100, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Node Exporter", + "name": "prom_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9100, + "protocol": "prometheus" + } + } ], "model": "NVIDIA Grace Hopper", "priority": 202402, diff --git a/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-1.json b/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-1.json index b0d5f89b4602898f8669f91d1029aef037a616d1..997baa5068ed0c31f0c44dfcf22d12da3116f1e2 100644 --- a/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-1.json +++ b/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-1.json @@ -20,6 +20,50 @@ "warranty_end": "2027-02-15" }, "exotic": true, + "kavlan": { + "eth0": { + "kavlan-1": "192.168.201.1", + "kavlan-11": "10.7.214.1", + "kavlan-12": "10.11.214.1", + "kavlan-13": "10.15.214.1", + "kavlan-14": "10.19.214.1", + "kavlan-16": "10.27.214.1", + "kavlan-17": "10.31.214.1", + "kavlan-18": "10.35.214.1", + "kavlan-2": "192.168.217.1", + "kavlan-20": "10.43.214.1", + "kavlan-21": "10.47.214.1", + "kavlan-3": "192.168.233.1", + "kavlan-4": "10.12.9.1", + "kavlan-5": "10.12.73.1", + "kavlan-6": "10.12.137.1", + "kavlan-7": "10.12.201.1", + "kavlan-8": "10.13.9.1", + "kavlan-9": "10.13.73.1" + } + }, + "kavlan6": { + "eth0": { + "kavlan-1": "2001:660:4406:480:a::1", + "kavlan-11": "2001:660:4406:1a0:40a::1", + "kavlan-12": "2001:660:4406:2a0:40a::1", + "kavlan-13": "2001:660:4406:4a0:40a::1", + "kavlan-14": "2001:660:4406:5a0:40a::1", + "kavlan-16": "2001:660:4406:7a0:40a::1", + "kavlan-17": "2001:660:4406:9a0:40a::1", + "kavlan-18": "2001:660:4406:8a0:40a::1", + "kavlan-2": "2001:660:4406:481:a::1", + "kavlan-20": "2001:660:4406:3a0:40a::1", + "kavlan-21": "2001:660:4406:6a0:40a::1", + "kavlan-3": "2001:660:4406:482:a::1", + "kavlan-4": "2001:660:4406:490:a::1", + "kavlan-5": "2001:660:4406:491:a::1", + "kavlan-6": "2001:660:4406:492:a::1", + "kavlan-7": "2001:660:4406:493:a::1", + "kavlan-8": "2001:660:4406:494:a::1", + "kavlan-9": "2001:660:4406:495:a::1" + } + }, "main_memory": { "ram_size": 515396075520 }, @@ -45,7 +89,7 @@ "interface": "Ethernet", "ip": "172.16.57.1", "ip6": "2001:660:4406:400:a::1", - "kavlan": false, + "kavlan": true, "mac": "a0:88:c2:98:fc:82", "management": false, "model": "MT2910 Family [ConnectX-7]", diff --git a/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-2.json b/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-2.json index 083c47ea6f6c577cb9286e351626de1eb42281e1..53d2a401b199b99f0a4ca91a5e65603cec92cf33 100644 --- a/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-2.json +++ b/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-2.json @@ -20,6 +20,50 @@ "warranty_end": "2027-02-15" }, "exotic": true, + "kavlan": { + "eth0": { + "kavlan-1": "192.168.201.2", + "kavlan-11": "10.7.214.2", + "kavlan-12": "10.11.214.2", + "kavlan-13": "10.15.214.2", + "kavlan-14": "10.19.214.2", + "kavlan-16": "10.27.214.2", + "kavlan-17": "10.31.214.2", + "kavlan-18": "10.35.214.2", + "kavlan-2": "192.168.217.2", + "kavlan-20": "10.43.214.2", + "kavlan-21": "10.47.214.2", + "kavlan-3": "192.168.233.2", + "kavlan-4": "10.12.9.2", + "kavlan-5": "10.12.73.2", + "kavlan-6": "10.12.137.2", + "kavlan-7": "10.12.201.2", + "kavlan-8": "10.13.9.2", + "kavlan-9": "10.13.73.2" + } + }, + "kavlan6": { + "eth0": { + "kavlan-1": "2001:660:4406:480:a::2", + "kavlan-11": "2001:660:4406:1a0:40a::2", + "kavlan-12": "2001:660:4406:2a0:40a::2", + "kavlan-13": "2001:660:4406:4a0:40a::2", + "kavlan-14": "2001:660:4406:5a0:40a::2", + "kavlan-16": "2001:660:4406:7a0:40a::2", + "kavlan-17": "2001:660:4406:9a0:40a::2", + "kavlan-18": "2001:660:4406:8a0:40a::2", + "kavlan-2": "2001:660:4406:481:a::2", + "kavlan-20": "2001:660:4406:3a0:40a::2", + "kavlan-21": "2001:660:4406:6a0:40a::2", + "kavlan-3": "2001:660:4406:482:a::2", + "kavlan-4": "2001:660:4406:490:a::2", + "kavlan-5": "2001:660:4406:491:a::2", + "kavlan-6": "2001:660:4406:492:a::2", + "kavlan-7": "2001:660:4406:493:a::2", + "kavlan-8": "2001:660:4406:494:a::2", + "kavlan-9": "2001:660:4406:495:a::2" + } + }, "main_memory": { "ram_size": 515396075520 }, @@ -45,7 +89,7 @@ "interface": "Ethernet", "ip": "172.16.57.2", "ip6": "2001:660:4406:400:a::2", - "kavlan": false, + "kavlan": true, "mac": "a0:88:c2:98:6c:a0", "management": false, "model": "MT2910 Family [ConnectX-7]", diff --git a/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-3.json b/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-3.json index 251ff78f60b26e0fc339d9dd154e054fc1acdd27..ee3c98fac53a5a1bdec3d314a1e0a5181fcc15c9 100644 --- a/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-3.json +++ b/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-3.json @@ -20,6 +20,50 @@ "warranty_end": "2027-02-15" }, "exotic": true, + "kavlan": { + "eth0": { + "kavlan-1": "192.168.201.3", + "kavlan-11": "10.7.214.3", + "kavlan-12": "10.11.214.3", + "kavlan-13": "10.15.214.3", + "kavlan-14": "10.19.214.3", + "kavlan-16": "10.27.214.3", + "kavlan-17": "10.31.214.3", + "kavlan-18": "10.35.214.3", + "kavlan-2": "192.168.217.3", + "kavlan-20": "10.43.214.3", + "kavlan-21": "10.47.214.3", + "kavlan-3": "192.168.233.3", + "kavlan-4": "10.12.9.3", + "kavlan-5": "10.12.73.3", + "kavlan-6": "10.12.137.3", + "kavlan-7": "10.12.201.3", + "kavlan-8": "10.13.9.3", + "kavlan-9": "10.13.73.3" + } + }, + "kavlan6": { + "eth0": { + "kavlan-1": "2001:660:4406:480:a::3", + "kavlan-11": "2001:660:4406:1a0:40a::3", + "kavlan-12": "2001:660:4406:2a0:40a::3", + "kavlan-13": "2001:660:4406:4a0:40a::3", + "kavlan-14": "2001:660:4406:5a0:40a::3", + "kavlan-16": "2001:660:4406:7a0:40a::3", + "kavlan-17": "2001:660:4406:9a0:40a::3", + "kavlan-18": "2001:660:4406:8a0:40a::3", + "kavlan-2": "2001:660:4406:481:a::3", + "kavlan-20": "2001:660:4406:3a0:40a::3", + "kavlan-21": "2001:660:4406:6a0:40a::3", + "kavlan-3": "2001:660:4406:482:a::3", + "kavlan-4": "2001:660:4406:490:a::3", + "kavlan-5": "2001:660:4406:491:a::3", + "kavlan-6": "2001:660:4406:492:a::3", + "kavlan-7": "2001:660:4406:493:a::3", + "kavlan-8": "2001:660:4406:494:a::3", + "kavlan-9": "2001:660:4406:495:a::3" + } + }, "main_memory": { "ram_size": 515396075520 }, @@ -45,7 +89,7 @@ "interface": "Ethernet", "ip": "172.16.57.3", "ip6": "2001:660:4406:400:a::3", - "kavlan": false, + "kavlan": true, "mac": "a0:88:c2:99:18:42", "management": false, "model": "MT2910 Family [ConnectX-7]", diff --git a/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-4.json b/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-4.json index 8ccbb1812f6e1f2f19f0b915ce12cfaa448866ad..cdfd0ef391abc915b0b6bcb1ceffcdb80edff057 100644 --- a/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-4.json +++ b/data/grid5000/sites/lyon/clusters/hydra/nodes/hydra-4.json @@ -20,6 +20,50 @@ "warranty_end": "2027-02-15" }, "exotic": true, + "kavlan": { + "eth0": { + "kavlan-1": "192.168.201.4", + "kavlan-11": "10.7.214.4", + "kavlan-12": "10.11.214.4", + "kavlan-13": "10.15.214.4", + "kavlan-14": "10.19.214.4", + "kavlan-16": "10.27.214.4", + "kavlan-17": "10.31.214.4", + "kavlan-18": "10.35.214.4", + "kavlan-2": "192.168.217.4", + "kavlan-20": "10.43.214.4", + "kavlan-21": "10.47.214.4", + "kavlan-3": "192.168.233.4", + "kavlan-4": "10.12.9.4", + "kavlan-5": "10.12.73.4", + "kavlan-6": "10.12.137.4", + "kavlan-7": "10.12.201.4", + "kavlan-8": "10.13.9.4", + "kavlan-9": "10.13.73.4" + } + }, + "kavlan6": { + "eth0": { + "kavlan-1": "2001:660:4406:480:a::4", + "kavlan-11": "2001:660:4406:1a0:40a::4", + "kavlan-12": "2001:660:4406:2a0:40a::4", + "kavlan-13": "2001:660:4406:4a0:40a::4", + "kavlan-14": "2001:660:4406:5a0:40a::4", + "kavlan-16": "2001:660:4406:7a0:40a::4", + "kavlan-17": "2001:660:4406:9a0:40a::4", + "kavlan-18": "2001:660:4406:8a0:40a::4", + "kavlan-2": "2001:660:4406:481:a::4", + "kavlan-20": "2001:660:4406:3a0:40a::4", + "kavlan-21": "2001:660:4406:6a0:40a::4", + "kavlan-3": "2001:660:4406:482:a::4", + "kavlan-4": "2001:660:4406:490:a::4", + "kavlan-5": "2001:660:4406:491:a::4", + "kavlan-6": "2001:660:4406:492:a::4", + "kavlan-7": "2001:660:4406:493:a::4", + "kavlan-8": "2001:660:4406:494:a::4", + "kavlan-9": "2001:660:4406:495:a::4" + } + }, "main_memory": { "ram_size": 515396075520 }, @@ -45,7 +89,7 @@ "interface": "Ethernet", "ip": "172.16.57.4", "ip6": "2001:660:4406:400:a::4", - "kavlan": false, + "kavlan": true, "mac": "a0:88:c2:99:19:02", "management": false, "model": "MT2910 Family [ConnectX-7]", diff --git a/input/grid5000/sites/lyon/clusters/hydra/hydra.yaml b/input/grid5000/sites/lyon/clusters/hydra/hydra.yaml index 21bb04b150722ecbcab0de623ca7aff0c4690dd9..ecbbc633859a5cfeefffcd9d4ae1e60e278cac1e 100644 --- a/input/grid5000/sites/lyon/clusters/hydra/hydra.yaml +++ b/input/grid5000/sites/lyon/clusters/hydra/hydra.yaml @@ -1,7 +1,7 @@ --- model: NVIDIA Grace Hopper created_at: 2024-02-15 -kavlan: false +kavlan: true boot_type: uefi exotic: true queues: diff --git a/input/grid5000/sites/lyon/clusters/hydra/hydra_metrics.yaml b/input/grid5000/sites/lyon/clusters/hydra/hydra_metrics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b9217620efc42b0d64e3956c110e34043568817a --- /dev/null +++ b/input/grid5000/sites/lyon/clusters/hydra/hydra_metrics.yaml @@ -0,0 +1,159 @@ +--- +metrics: + + - name: bmc_other_temp_celsius + labels: {"id": "inlet_0"} + description: Temperature of Inlet_0 reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 1 #Temp_Inlet_0 + + - name: bmc_other_temp_celsius + labels: {"id": "inlet_1"} + description: Temperature of Inlet_1 reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 2 #Temp_Inlet_1 + + - name: bmc_other_temp_celsius + labels: {"id": "mb_0"} + description: Temperature of MB_0 reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 3 #Temp_MB_0 + + - name: bmc_other_temp_celsius + labels: {"id": "mb_1"} + description: Temperature of MB_1 reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 4 #Temp_MB_1 + + - name: bmc_other_temp_celsius + labels: {"id": "outlet"} + description: Temperature of outlet reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 5 #Temp_outlet + + - name: bmc_other_temp_celsius + labels: {"id": "scm"} + description: Temperature of SCM reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 6 #Temp_SCM + + - name: bmc_psu_temp_celsius + labels: {"psu": "0"} + description: Temperature of Psu 0 reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 7 #Temp_PSU0 + + - name: bmc_psu_temp_celsius + labels: {"psu": "1"} + description: Temperature of Psu 1 reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 8 #Temp_PSU1 + + - name: bmc_other_voltage_volt + labels: {"id": "p12v_stb"} + description: Voltage of P12V_STB reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 29 #Volt_P12V_STB + + - name: bmc_other_voltage_volt + labels: {"id": "p5v_stb"} + description: Voltage of P5V_STB reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 30 #Volt_P5V_STB + + - name: bmc_other_voltage_volt + labels: {"id": "p3v_bat"} + description: Voltage of P3V_BAT reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 34 #Volt_P3V_BAT + + - name: bmc_other_usage_percent + labels: {"id": "bmc_cpu_util"} + description: Usage of BMC_CPU_Util reported by BMC, in percent + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 78 #BMC_CPU_Util + + - name: bmc_other_usage_percent + labels: {"id": "bmc_mem_util"} + description: Usage of BMC_MEM_Util reported by BMC, in percent + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 79 #BMC_MEM_Util + + - name: prom_default_metrics + description: Default subset of metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + id: + - node_boot_time_seconds + - node_cpu_scaling_frequency_hertz + - node_cpu_seconds_total + - node_filesystem_free_bytes + - node_filesystem_size_bytes + - node_load1 + - node_load15 + - node_load5 + - node_memory_Buffers_bytes + - node_memory_Cached_bytes + - node_memory_MemAvailable_bytes + - node_memory_MemFree_bytes + - node_memory_MemTotal_bytes + - node_memory_Shmem_bytes + - node_memory_SwapFree_bytes + - node_memory_SwapTotal_bytes + - node_network_receive_bytes_total + - node_network_receive_packets_total + - node_network_transmit_bytes_total + - node_network_transmit_packets_total + - node_procs_blocked + - node_procs_running + - kwollect_custom + + - name: prom_all_metrics + description: All metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 diff --git a/input/grid5000/vlans.yaml b/input/grid5000/vlans.yaml index e477f33b80c535911860a1fef764e10044235c2b..9f6854517fb7a4bdaf3306e4494714c91e232267 100644 --- a/input/grid5000/vlans.yaml +++ b/input/grid5000/vlans.yaml @@ -90,6 +90,7 @@ vlans: local lyon pyxis eth0 0 0 6 0 local lyon neowise eth0 0 0 7 0 local lyon sirius eth0 0 0 8 0 + local lyon hydra eth0 0 0 9 0 local nancy gros eth0 0 0 0 0 local nancy gros eth1 0 0 1 0 local nancy gres eth0 0 0 2 0 @@ -149,6 +150,7 @@ vlans: global nancy gratouille eth0 0 0 19 14 global lyon neowise eth0 0 0 20 0 global lyon sirius eth0 0 0 21 0 + global lyon hydra eth0 0 0 22 0 global nancy grvingt eth0 0 0 24 0 global nancy gres eth0 0 0 25 0 global nancy grdix eth0 0 0 25 10 @@ -208,6 +210,7 @@ vlans: routed lyon pyxis eth0 0 12 6 0 routed lyon neowise eth0 0 12 7 0 routed lyon sirius eth0 0 12 8 0 + routed lyon hydra eth0 0 12 9 0 routed nancy gros eth0 0 16 0 0 routed nancy gros eth1 0 16 1 0 routed nancy grvingt eth0 0 16 3 0