diff --git a/data/grid5000/sites/rennes/clusters/roazhon4/roazhon4.json b/data/grid5000/sites/rennes/clusters/roazhon4/roazhon4.json index e4a20683b9ef972766eca9e5f856d3732d74efb4..66fa6eedbb23d01cf980afcb2c7fb2e35213a603 100644 --- a/data/grid5000/sites/rennes/clusters/roazhon4/roazhon4.json +++ b/data/grid5000/sites/rennes/clusters/roazhon4/roazhon4.json @@ -4,7 +4,251 @@ "exotic": false, "kavlan": false, "metrics": [ - + { + "description": "Front node ambient temperature reported by BMC, in celsius", + "name": "bmc_ambient_temp_celsius", + "period": 5000, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3", + "protocol": "snmp" + } + }, + { + "description": "Power consumption of node reported by BMC, in watt", + "name": "bmc_node_power_watt", + "period": 5000, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 == System Board Pwr Consumption }}", + "protocol": "snmp" + } + }, + { + "description": "Voltage of PSU 1 reported by BMC, in volt", + "labels": { + "psu": "1" + }, + "name": "bmc_psu_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.12.1.16.1.1", + "protocol": "snmp" + } + }, + { + "description": "Voltage of PSU 2 reported by BMC, in volt", + "labels": { + "psu": "2" + }, + "name": "bmc_psu_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.12.1.16.1.2", + "protocol": "snmp" + } + }, + { + "description": "Current of PSU 1 reported by BMC, in amp", + "labels": { + "psu": "1" + }, + "name": "bmc_psu_current_amp", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 == PS1 Current 1 }}", + "protocol": "snmp" + } + }, + { + "description": "Current of PSU 2 reported by BMC, in amp", + "labels": { + "psu": "2" + }, + "name": "bmc_psu_current_amp", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 == PS2 Current 2 }}", + "protocol": "snmp" + } + }, + { + "description": "Cumulated power consumption of node reported by BMC, in watt-hour", + "name": "bmc_node_power_watthour_total", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.60.1.7.1.1", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 1A reported by BMC, in rpm", + "labels": { + "fan": "1a" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.1", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 2A reported by BMC, in rpm", + "labels": { + "fan": "2a" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.2", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 3A reported by BMC, in rpm", + "labels": { + "fan": "3a" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.3", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 4A reported by BMC, in rpm", + "labels": { + "fan": "4a" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.4", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 5A reported by BMC, in rpm", + "labels": { + "fan": "5a" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.5", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 6A reported by BMC, in rpm", + "labels": { + "fan": "6a" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.6", + "protocol": "snmp" + } + }, + { + "description": "Temperature of CPU 1 reported by BMC, in celsius", + "labels": { + "cpu": "1" + }, + "name": "bmc_cpu_temp_celsius", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1", + "protocol": "snmp" + } + }, + { + "description": "Temperature of CPU 2 reported by BMC, in celsius", + "labels": { + "cpu": "2" + }, + "name": "bmc_cpu_temp_celsius", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2", + "protocol": "snmp" + } + }, + { + "description": "Temperature of exhausted air reported by BMC, in celsius", + "name": "bmc_exhaust_temp_celsius", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4", + "protocol": "snmp" + } + }, + { + "description": "Default subset of metrics from Prometheus Node Exporter", + "name": "prom_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "node_boot_time_seconds", + "node_cpu_scaling_frequency_hertz", + "node_cpu_seconds_total", + "node_filesystem_free_bytes", + "node_filesystem_size_bytes", + "node_load1", + "node_load15", + "node_load5", + "node_memory_Buffers_bytes", + "node_memory_Cached_bytes", + "node_memory_MemAvailable_bytes", + "node_memory_MemFree_bytes", + "node_memory_MemTotal_bytes", + "node_memory_Shmem_bytes", + "node_memory_SwapFree_bytes", + "node_memory_SwapTotal_bytes", + "node_network_receive_bytes_total", + "node_network_receive_packets_total", + "node_network_transmit_bytes_total", + "node_network_transmit_packets_total", + "node_procs_blocked", + "node_procs_running", + "kwollect_custom" + ], + "port": 9100, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Node Exporter", + "name": "prom_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9100, + "protocol": "prometheus" + } + } ], "model": "Dell PowerEdge R7525", "queues": [ diff --git a/input/grid5000/sites/rennes/clusters/roazhon4/roazhon4_metrics.yaml b/input/grid5000/sites/rennes/clusters/roazhon4/roazhon4_metrics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..890bcec45188681cec3cf2c5158839b86532a937 --- /dev/null +++ b/input/grid5000/sites/rennes/clusters/roazhon4/roazhon4_metrics.yaml @@ -0,0 +1,219 @@ +--- +metrics: + + - name: bmc_ambient_temp_celsius + description: Front node ambient temperature reported by BMC, in celsius + period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3 + + - name: bmc_node_power_watt + description: Power consumption of node reported by BMC, in watt + period: 5000 + source: + protocol: snmp + id: >- + 1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ + 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 + == System Board Pwr Consumption }} + + - name: bmc_psu_voltage_volt + labels: {"psu": "1"} + description: Voltage of PSU 1 reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.600.12.1.16.1.1 + + - name: bmc_psu_voltage_volt + labels: {"psu": "2"} + description: Voltage of PSU 2 reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.600.12.1.16.1.2 + + - name: bmc_psu_current_amp + labels: {"psu": "1"} + description: Current of PSU 1 reported by BMC, in amp + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: >- + 1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ + 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 + == PS1 Current 1 }} + + - name: bmc_psu_current_amp + labels: {"psu": "2"} + description: Current of PSU 2 reported by BMC, in amp + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: >- + 1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ + 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 + == PS2 Current 2 }} + + - name: bmc_node_power_watthour_total + description: Cumulated power consumption of node reported by BMC, in watt-hour + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.600.60.1.7.1.1 + + - name: bmc_fan_speed_rpm + labels: {"fan": "1a"} + description: Speed of Fan 1A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.1 + + - name: bmc_fan_speed_rpm + labels: {"fan": "2a"} + description: Speed of Fan 2A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.2 + + - name: bmc_fan_speed_rpm + labels: {"fan": "3a"} + description: Speed of Fan 3A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.3 + + - name: bmc_fan_speed_rpm + labels: {"fan": "4a"} + description: Speed of Fan 4A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.4 + + - name: bmc_fan_speed_rpm + labels: {"fan": "5a"} + description: Speed of Fan 5A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.5 + + - name: bmc_fan_speed_rpm + labels: {"fan": "6a"} + description: Speed of Fan 6A reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.6 + + - name: bmc_cpu_temp_celsius + labels: {"cpu": "1"} + description: Temperature of CPU 1 reported by BMC, in celsius + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1 + + - name: bmc_cpu_temp_celsius + labels: {"cpu": "2"} + description: Temperature of CPU 2 reported by BMC, in celsius + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2 + + - name: bmc_exhaust_temp_celsius + description: Temperature of exhausted air reported by BMC, in celsius + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4 + + - name: prom_default_metrics + description: Default subset of metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + id: + - node_boot_time_seconds + - node_cpu_scaling_frequency_hertz + - node_cpu_seconds_total + - node_filesystem_free_bytes + - node_filesystem_size_bytes + - node_load1 + - node_load15 + - node_load5 + - node_memory_Buffers_bytes + - node_memory_Cached_bytes + - node_memory_MemAvailable_bytes + - node_memory_MemFree_bytes + - node_memory_MemTotal_bytes + - node_memory_Shmem_bytes + - node_memory_SwapFree_bytes + - node_memory_SwapTotal_bytes + - node_network_receive_bytes_total + - node_network_receive_packets_total + - node_network_transmit_bytes_total + - node_network_transmit_packets_total + - node_procs_blocked + - node_procs_running + - kwollect_custom + + - name: prom_all_metrics + description: All metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + + # Uncomment if cluster has GPU + # - name: prom_nvgpu_default_metrics + # description: Default subset of metrics from Prometheus Nvidia DCGM Exporter + # period: 0 + # optional_period: 15000 + # source: + # protocol: prometheus + # port: 9400 + # id: + # - DCGM_FI_DEV_SM_CLOCK + # - DCGM_FI_DEV_MEM_CLOCK + # - DCGM_FI_DEV_GPU_TEMP + # - DCGM_FI_DEV_POWER_USAGE + # - DCGM_FI_DEV_MEM_COPY_UTIL + + # - name: prom_nvgpu_all_metrics + # description: All metrics from Prometheus Nvidia DCGM Exporter + # period: 0 + # optional_period: 15000 + # source: + # protocol: prometheus + # port: 9400 +