From 324ed4e20811e464bea2aad46484e0194f0fa4d0 Mon Sep 17 00:00:00 2001 From: BERARD Benjamin <benjamin.berard@inria.fr> Date: Tue, 19 Oct 2021 19:09:41 +0200 Subject: [PATCH] [nancy/gruss] Add kwollect information --- .../sites/nancy/clusters/gruss/gruss.json | 663 +++++++++++++++++- .../nancy/network_equipments/sgruss.json | 74 ++ .../nancy/clusters/gruss/gruss_metrics.yaml | 430 ++++++++++++ .../grid5000/sites/nancy/networks/sgruss.yaml | 49 ++ 4 files changed, 1215 insertions(+), 1 deletion(-) create mode 100644 input/grid5000/sites/nancy/clusters/gruss/gruss_metrics.yaml diff --git a/data/grid5000/sites/nancy/clusters/gruss/gruss.json b/data/grid5000/sites/nancy/clusters/gruss/gruss.json index a75e3ae0ad6..961b37e8324 100644 --- a/data/grid5000/sites/nancy/clusters/gruss/gruss.json +++ b/data/grid5000/sites/nancy/clusters/gruss/gruss.json @@ -3,7 +3,668 @@ "exotic": false, "kavlan": true, "metrics": [ - + { + "description": "Temperature of Temp reported by BMC, in celsius", + "labels": { + "id": "temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 1, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of Temp reported by BMC, in celsius", + "labels": { + "id": "temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 2, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature reported by BMC, in celsius", + "name": "bmc_ambient_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 3, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan1A reported by BMC, in rpm", + "labels": { + "id": "fan1a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 4, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan2A reported by BMC, in rpm", + "labels": { + "id": "fan2a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 5, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan3A reported by BMC, in rpm", + "labels": { + "id": "fan3a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 6, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan4A reported by BMC, in rpm", + "labels": { + "id": "fan4a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 7, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan5A reported by BMC, in rpm", + "labels": { + "id": "fan5a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 8, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan6A reported by BMC, in rpm", + "labels": { + "id": "fan6a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 9, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan1B reported by BMC, in rpm", + "labels": { + "id": "fan1b" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 10, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan2B reported by BMC, in rpm", + "labels": { + "id": "fan2b" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 11, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan3B reported by BMC, in rpm", + "labels": { + "id": "fan3b" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 12, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan4B reported by BMC, in rpm", + "labels": { + "id": "fan4b" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 13, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan5B reported by BMC, in rpm", + "labels": { + "id": "fan5b" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 14, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan6B reported by BMC, in rpm", + "labels": { + "id": "fan6b" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 15, + "protocol": "ipmisensor" + } + }, + { + "description": "Current of Current 1 reported by BMC, in amp", + "labels": { + "id": "current1" + }, + "name": "bmc_other_current_amp", + "optional_period": 5000, + "period": 0, + "source": { + "id": 33, + "protocol": "ipmisensor" + } + }, + { + "description": "Current of Current 2 reported by BMC, in amp", + "labels": { + "id": "current2" + }, + "name": "bmc_other_current_amp", + "optional_period": 5000, + "period": 0, + "source": { + "id": 34, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of Voltage 1 reported by BMC, in volt", + "labels": { + "id": "voltage1" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 35, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of Voltage 2 reported by BMC, in volt", + "labels": { + "id": "voltage2" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 36, + "protocol": "ipmisensor" + } + }, + { + "description": "Power consumption of Pwr Consumption reported by BMC, in watt", + "labels": { + "id": "pwrconsumption" + }, + "name": "bmc_other_power_watt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 43, + "protocol": "ipmisensor" + } + }, + { + "description": "Usage of CPU Usage reported by BMC, in percent", + "labels": { + "id": "cpuusage" + }, + "name": "bmc_other_usage_percent", + "optional_period": 5000, + "period": 0, + "source": { + "id": 49, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of GPU1 Temp reported by BMC, in celsius", + "labels": { + "id": "gpu1temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 161, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of GPU2 Temp reported by BMC, in celsius", + "labels": { + "id": "gpu2temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 162, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of GPU3 Temp reported by BMC, in celsius", + "labels": { + "id": "gpu3temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 163, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of GPU4 Temp reported by BMC, in celsius", + "labels": { + "id": "gpu4temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 164, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of GPU5 Temp reported by BMC, in celsius", + "labels": { + "id": "gpu5temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 165, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of GPU6 Temp reported by BMC, in celsius", + "labels": { + "id": "gpu6temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 166, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of GPU7 Temp reported by BMC, in celsius", + "labels": { + "id": "gpu7temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 167, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature of GPU8 Temp reported by BMC, in celsius", + "labels": { + "id": "gpu8temp" + }, + "name": "bmc_other_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 168, + "protocol": "ipmisensor" + } + }, + { + "description": "Temperature reported by BMC, in celsius", + "name": "bmc_exhaust_temp_celsius", + "optional_period": 5000, + "period": 0, + "source": { + "id": 173, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of VCORE VR reported by BMC, in volt", + "labels": { + "id": "vcorevr" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 225, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of VCORE VR reported by BMC, in volt", + "labels": { + "id": "vcorevr" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 226, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of MEMABCD VR reported by BMC, in volt", + "labels": { + "id": "memabcdvr" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 227, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of MEMEFGH VR reported by BMC, in volt", + "labels": { + "id": "memefghvr" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 228, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of MEMABCD VR reported by BMC, in volt", + "labels": { + "id": "memabcdvr" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 229, + "protocol": "ipmisensor" + } + }, + { + "description": "Voltage of MEMEFGH VR reported by BMC, in volt", + "labels": { + "id": "memefghvr" + }, + "name": "bmc_other_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": 230, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan7A reported by BMC, in rpm", + "labels": { + "id": "fan7a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 256, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan8A reported by BMC, in rpm", + "labels": { + "id": "fan8a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 257, + "protocol": "ipmisensor" + } + }, + { + "description": "Speed of Fan9A reported by BMC, in rpm", + "labels": { + "id": "fan9a" + }, + "name": "bmc_other_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": 258, + "protocol": "ipmisensor" + } + }, + { + "description": "Default subset of metrics from Prometheus Node Exporter", + "name": "prom_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "node_boot_time_seconds", + "node_cpu_frequency_hertz", + "node_cpu_seconds_total", + "node_filesystem_free_bytes", + "node_filesystem_size_bytes", + "node_load1", + "node_load15", + "node_load5", + "node_memory_Buffers_bytes", + "node_memory_Cached_bytes", + "node_memory_MemAvailable_bytes", + "node_memory_MemFree_bytes", + "node_memory_MemTotal_bytes", + "node_memory_Shmem_bytes", + "node_memory_SwapFree_bytes", + "node_memory_SwapTotal_bytes", + "node_network_receive_bytes_total", + "node_network_receive_packets_total", + "node_network_transmit_bytes_total", + "node_network_transmit_packets_total", + "node_procs_blocked", + "node_procs_running", + "kwollect_custom" + ], + "port": 9100, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Node Exporter", + "name": "prom_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9100, + "protocol": "prometheus" + } + }, + { + "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "DCGM_FI_DEV_SM_CLOCK", + "DCGM_FI_DEV_MEM_CLOCK", + "DCGM_FI_DEV_GPU_TEMP", + "DCGM_FI_DEV_POWER_USAGE", + "DCGM_FI_DEV_GPU_UTIL", + "DCGM_FI_DEV_MEM_COPY_UTIL" + ], + "port": 9400, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9400, + "protocol": "prometheus" + } + }, + { + "description": "Input byte counter for the network device port", + "labels": { + "interface": "eth2" + }, + "name": "network_ifacein_bytes_total", + "period": 15000, + "source": { + "protocol": "network_equipment" + } + }, + { + "description": "Output byte counter for the network device port", + "labels": { + "interface": "eth2" + }, + "name": "network_ifaceout_bytes_total", + "period": 15000, + "source": { + "protocol": "network_equipment" + } + }, + { + "description": "Input packet counter for the network device port", + "labels": { + "interface": "eth2" + }, + "name": "network_ifacein_packets_total", + "period": 15000, + "source": { + "protocol": "network_equipment" + } + }, + { + "description": "Output packet counter for the network device port", + "labels": { + "interface": "eth2" + }, + "name": "network_ifaceout_packets_total", + "period": 15000, + "source": { + "protocol": "network_equipment" + } + }, + { + "description": "Input counter of discarded packets for the network device port", + "labels": { + "interface": "eth2" + }, + "name": "network_ifacein_packets_discard_total", + "period": 15000, + "source": { + "protocol": "network_equipment" + } + }, + { + "description": "Output counter of discarded packets for the network device port", + "labels": { + "interface": "eth2" + }, + "name": "network_ifaceout_packets_discard_total", + "period": 15000, + "source": { + "protocol": "network_equipment" + } + }, + { + "description": "Input counter of packet errors for the network device port", + "labels": { + "interface": "eth2" + }, + "name": "network_ifacein_packets_error_total", + "period": 15000, + "source": { + "protocol": "network_equipment" + } + }, + { + "description": "Output counter of packet errors for the network device port", + "labels": { + "interface": "eth2" + }, + "name": "network_ifaceout_packets_error_total", + "period": 15000, + "source": { + "protocol": "network_equipment" + } + } ], "model": "Dell PowerEdge R7525", "queues": [ diff --git a/data/grid5000/sites/nancy/network_equipments/sgruss.json b/data/grid5000/sites/nancy/network_equipments/sgruss.json index 03d3b42ebec..e9acf6fe0d5 100644 --- a/data/grid5000/sites/nancy/network_equipments/sgruss.json +++ b/data/grid5000/sites/nancy/network_equipments/sgruss.json @@ -100,6 +100,80 @@ "snmp_pattern": "ethernet1/%LINECARD%/%PORT%" } ], + "metrics": [ + { + "description": "Input byte counter for the network device port", + "name": "network_ifacein_bytes_total", + "period": 15000, + "source": { + "id": "1.3.6.1.2.1.2.2.1.10.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }}", + "protocol": "snmp" + } + }, + { + "description": "Output byte counter for the network device port", + "name": "network_ifaceout_bytes_total", + "period": 15000, + "source": { + "id": "1.3.6.1.2.1.2.2.1.16.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }}", + "protocol": "snmp" + } + }, + { + "description": "Input packet counter for the network device port", + "name": "network_ifacein_packets_total", + "period": 15000, + "source": { + "id": "1.3.6.1.2.1.2.2.1.11.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }}", + "protocol": "snmp" + } + }, + { + "description": "Output packet counter for the network device port", + "name": "network_ifaceout_packets_total", + "period": 15000, + "source": { + "id": "1.3.6.1.2.1.2.2.1.17.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }}", + "protocol": "snmp" + } + }, + { + "description": "Input counter of discarded packets for the network device port", + "name": "network_ifacein_packets_discard_total", + "period": 15000, + "source": { + "id": "1.3.6.1.2.1.2.2.1.13.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }}", + "protocol": "snmp" + } + }, + { + "description": "Output counter of discarded packets for the network device port", + "name": "network_ifaceout_packets_discard_total", + "period": 15000, + "source": { + "id": "1.3.6.1.2.1.2.2.1.19.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }}", + "protocol": "snmp" + } + }, + { + "description": "Input counter of packet errors for the network device port", + "name": "network_ifacein_packets_error_total", + "period": 15000, + "source": { + "id": "1.3.6.1.2.1.2.2.1.14.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }}", + "protocol": "snmp" + } + }, + { + "description": "Output counter of packet errors for the network device port", + "name": "network_ifaceout_packets_error_total", + "period": 15000, + "source": { + "id": "1.3.6.1.2.1.2.2.1.20.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }}", + "protocol": "snmp" + } + } + ], "model": "Dell S5224F-ON", "site": "nancy", "snmp_community": "public", diff --git a/input/grid5000/sites/nancy/clusters/gruss/gruss_metrics.yaml b/input/grid5000/sites/nancy/clusters/gruss/gruss_metrics.yaml new file mode 100644 index 00000000000..c0e6f07ad03 --- /dev/null +++ b/input/grid5000/sites/nancy/clusters/gruss/gruss_metrics.yaml @@ -0,0 +1,430 @@ +--- +metrics: + + # - name: bmc_ambient_temp_celsius + # description: Front node ambient temperature reported by BMC, in celsius + # period: 5000 + # source: + # protocol: ipmisensor + # id: FIXME #Inlet Temp + + #- name: bmc_node_power_watt + # description: Power consumption of node reported by BMC, in watt + # period: 5000 + # source: + # protocol: ipmisensor + # id: FIXME #Pwr Consumption + + - name: bmc_other_temp_celsius + labels: {"id": "temp"} + description: Temperature of Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 1 #Temp + + - name: bmc_other_temp_celsius + labels: {"id": "temp"} + description: Temperature of Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 2 #Temp + + - name: bmc_ambient_temp_celsius + description: Temperature reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 3 #Inlet Temp + + - name: bmc_other_speed_rpm + labels: {"id": "fan1a"} + description: Speed of Fan1A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 4 #Fan1A + + - name: bmc_other_speed_rpm + labels: {"id": "fan2a"} + description: Speed of Fan2A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 5 #Fan2A + + - name: bmc_other_speed_rpm + labels: {"id": "fan3a"} + description: Speed of Fan3A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 6 #Fan3A + + - name: bmc_other_speed_rpm + labels: {"id": "fan4a"} + description: Speed of Fan4A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 7 #Fan4A + + - name: bmc_other_speed_rpm + labels: {"id": "fan5a"} + description: Speed of Fan5A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 8 #Fan5A + + - name: bmc_other_speed_rpm + labels: {"id": "fan6a"} + description: Speed of Fan6A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 9 #Fan6A + + - name: bmc_other_speed_rpm + labels: {"id": "fan1b"} + description: Speed of Fan1B reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 10 #Fan1B + + - name: bmc_other_speed_rpm + labels: {"id": "fan2b"} + description: Speed of Fan2B reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 11 #Fan2B + + - name: bmc_other_speed_rpm + labels: {"id": "fan3b"} + description: Speed of Fan3B reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 12 #Fan3B + + - name: bmc_other_speed_rpm + labels: {"id": "fan4b"} + description: Speed of Fan4B reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 13 #Fan4B + + - name: bmc_other_speed_rpm + labels: {"id": "fan5b"} + description: Speed of Fan5B reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 14 #Fan5B + + - name: bmc_other_speed_rpm + labels: {"id": "fan6b"} + description: Speed of Fan6B reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 15 #Fan6B + + - name: bmc_other_current_amp + labels: {"id": "current1"} + description: Current of Current 1 reported by BMC, in amp + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 33 #Current 1 + + - name: bmc_other_current_amp + labels: {"id": "current2"} + description: Current of Current 2 reported by BMC, in amp + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 34 #Current 2 + + - name: bmc_other_voltage_volt + labels: {"id": "voltage1"} + description: Voltage of Voltage 1 reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 35 #Voltage 1 + + - name: bmc_other_voltage_volt + labels: {"id": "voltage2"} + description: Voltage of Voltage 2 reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 36 #Voltage 2 + + - name: bmc_other_power_watt + labels: {"id": "pwrconsumption"} + description: Power consumption of Pwr Consumption reported by BMC, in watt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 43 #Pwr Consumption + + - name: bmc_other_usage_percent + labels: {"id": "cpuusage"} + description: Usage of CPU Usage reported by BMC, in percent + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 49 #CPU Usage + + - name: bmc_other_temp_celsius + labels: {"id": "gpu1temp"} + description: Temperature of GPU1 Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 161 #GPU1 Temp + + - name: bmc_other_temp_celsius + labels: {"id": "gpu2temp"} + description: Temperature of GPU2 Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 162 #GPU2 Temp + + - name: bmc_other_temp_celsius + labels: {"id": "gpu3temp"} + description: Temperature of GPU3 Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 163 #GPU3 Temp + + - name: bmc_other_temp_celsius + labels: {"id": "gpu4temp"} + description: Temperature of GPU4 Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 164 #GPU4 Temp + + - name: bmc_other_temp_celsius + labels: {"id": "gpu5temp"} + description: Temperature of GPU5 Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 165 #GPU5 Temp + + - name: bmc_other_temp_celsius + labels: {"id": "gpu6temp"} + description: Temperature of GPU6 Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 166 #GPU6 Temp + + - name: bmc_other_temp_celsius + labels: {"id": "gpu7temp"} + description: Temperature of GPU7 Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 167 #GPU7 Temp + + - name: bmc_other_temp_celsius + labels: {"id": "gpu8temp"} + description: Temperature of GPU8 Temp reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 168 #GPU8 Temp + + - name: bmc_exhaust_temp_celsius + description: Temperature reported by BMC, in celsius + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 173 #Exhaust Temp + + - name: bmc_other_voltage_volt + labels: {"id": "vcorevr"} + description: Voltage of VCORE VR reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 225 #VCORE VR + + - name: bmc_other_voltage_volt + labels: {"id": "vcorevr"} + description: Voltage of VCORE VR reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 226 #VCORE VR + + - name: bmc_other_voltage_volt + labels: {"id": "memabcdvr"} + description: Voltage of MEMABCD VR reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 227 #MEMABCD VR + + - name: bmc_other_voltage_volt + labels: {"id": "memefghvr"} + description: Voltage of MEMEFGH VR reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 228 #MEMEFGH VR + + - name: bmc_other_voltage_volt + labels: {"id": "memabcdvr"} + description: Voltage of MEMABCD VR reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 229 #MEMABCD VR + + - name: bmc_other_voltage_volt + labels: {"id": "memefghvr"} + description: Voltage of MEMEFGH VR reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 230 #MEMEFGH VR + + - name: bmc_other_speed_rpm + labels: {"id": "fan7a"} + description: Speed of Fan7A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 256 #Fan7A + + - name: bmc_other_speed_rpm + labels: {"id": "fan8a"} + description: Speed of Fan8A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 257 #Fan8A + + - name: bmc_other_speed_rpm + labels: {"id": "fan9a"} + description: Speed of Fan9A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: ipmisensor + id: 258 #Fan9A + + - name: prom_default_metrics + description: Default subset of metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + id: + - node_boot_time_seconds + - node_cpu_frequency_hertz + - node_cpu_seconds_total + - node_filesystem_free_bytes + - node_filesystem_size_bytes + - node_load1 + - node_load15 + - node_load5 + - node_memory_Buffers_bytes + - node_memory_Cached_bytes + - node_memory_MemAvailable_bytes + - node_memory_MemFree_bytes + - node_memory_MemTotal_bytes + - node_memory_Shmem_bytes + - node_memory_SwapFree_bytes + - node_memory_SwapTotal_bytes + - node_network_receive_bytes_total + - node_network_receive_packets_total + - node_network_transmit_bytes_total + - node_network_transmit_packets_total + - node_procs_blocked + - node_procs_running + - kwollect_custom + + - name: prom_all_metrics + description: All metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + + - name: prom_nvgpu_default_metrics + description: Default subset of metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 + id: + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_MEM_CLOCK + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_MEM_COPY_UTIL + + - name: prom_nvgpu_all_metrics + description: All metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 + + diff --git a/input/grid5000/sites/nancy/networks/sgruss.yaml b/input/grid5000/sites/nancy/networks/sgruss.yaml index 3cb02bb79ca..2c1762061d5 100644 --- a/input/grid5000/sites/nancy/networks/sgruss.yaml +++ b/input/grid5000/sites/nancy/networks/sgruss.yaml @@ -5,6 +5,55 @@ sgruss: kind: switch site: nancy snmp_community: public + metrics: + - name: network_ifacein_bytes_total + description: Input byte counter for the network device port + period: 15000 + source: + protocol: snmp + id: 1.3.6.1.2.1.2.2.1.10.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }} + - name: network_ifaceout_bytes_total + description: Output byte counter for the network device port + period: 15000 + source: + protocol: snmp + id: 1.3.6.1.2.1.2.2.1.16.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }} + - name: network_ifacein_packets_total + description: Input packet counter for the network device port + period: 15000 + source: + protocol: snmp + id: 1.3.6.1.2.1.2.2.1.11.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }} + - name: network_ifaceout_packets_total + description: Output packet counter for the network device port + period: 15000 + source: + protocol: snmp + id: 1.3.6.1.2.1.2.2.1.17.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }} + - name: network_ifacein_packets_discard_total + description: Input counter of discarded packets for the network device port + period: 15000 + source: + protocol: snmp + id: 1.3.6.1.2.1.2.2.1.13.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }} + - name: network_ifaceout_packets_discard_total + description: Output counter of discarded packets for the network device port + period: 15000 + source: + protocol: snmp + id: 1.3.6.1.2.1.2.2.1.19.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }} + - name: network_ifacein_packets_error_total + description: Input counter of packet errors for the network device port + period: 15000 + source: + protocol: snmp + id: 1.3.6.1.2.1.2.2.1.14.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }} + - name: network_ifaceout_packets_error_total + description: Output counter of packet errors for the network device port + period: 15000 + source: + protocol: snmp + id: 1.3.6.1.2.1.2.2.1.20.{{ 1.3.6.1.2.1.2.2.1.2 == %SNMP_IFACE% }} ip: 172.17.79.212 backplane_bps: 2160000000000 linecards: -- GitLab