diff --git a/data/grid5000/sites/nancy/clusters/gratouille/gratouille.json b/data/grid5000/sites/nancy/clusters/gratouille/gratouille.json index 0767286ec6b4cce28cf1bbcc883b090ff2983afc..1c805d713326e61141ba1f1929f1acb1f63f98bb 100644 --- a/data/grid5000/sites/nancy/clusters/gratouille/gratouille.json +++ b/data/grid5000/sites/nancy/clusters/gratouille/gratouille.json @@ -5,7 +5,278 @@ "kavlan": false, "manufactured_at": "2017-06-07", "metrics": [ - + { + "description": "Front node ambient temperature reported by BMC, in celsius", + "name": "bmc_ambient_temp_celsius", + "period": 5000, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1", + "protocol": "snmp" + } + }, + { + "description": "Power consumption of node reported by BMC, in watt", + "name": "bmc_node_power_watt", + "period": 5000, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 == System Board Pwr Consumption }}", + "protocol": "snmp" + } + }, + { + "description": "Voltage of PSU 1 reported by BMC, in volt", + "labels": { + "psu": "1" + }, + "name": "bmc_psu_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.12.1.16.1.1", + "protocol": "snmp" + } + }, + { + "description": "Voltage of PSU 2 reported by BMC, in volt", + "labels": { + "psu": "2" + }, + "name": "bmc_psu_voltage_volt", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.12.1.16.1.2", + "protocol": "snmp" + } + }, + { + "description": "Current of PSU 1 reported by BMC, in amp", + "labels": { + "psu": "1" + }, + "name": "bmc_psu_current_amp", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 == PS1 Current 1 }}", + "protocol": "snmp" + } + }, + { + "description": "Current of PSU 2 reported by BMC, in amp", + "labels": { + "psu": "2" + }, + "name": "bmc_psu_current_amp", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 == PS2 Current 2 }}", + "protocol": "snmp" + } + }, + { + "description": "Cumulated power consumption of node reported by BMC, in watt-hour", + "name": "bmc_node_power_watthour_total", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.600.60.1.7.1.1", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 1 reported by BMC, in rpm", + "labels": { + "fan": "1" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.1", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 2 reported by BMC, in rpm", + "labels": { + "fan": "2" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.2", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 3 reported by BMC, in rpm", + "labels": { + "fan": "3" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.3", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 4 reported by BMC, in rpm", + "labels": { + "fan": "4" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.4", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 5 reported by BMC, in rpm", + "labels": { + "fan": "5" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.5", + "protocol": "snmp" + } + }, + { + "description": "Speed of Fan 6 reported by BMC, in rpm", + "labels": { + "fan": "6" + }, + "name": "bmc_fan_speed_rpm", + "optional_period": 5000, + "period": 0, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.6", + "protocol": "snmp" + } + }, + { + "description": "Temperature of exhausted air reported by BMC, in celsius", + "name": "bmc_exhaust_temp_celsius", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2", + "protocol": "snmp" + } + }, + { + "description": "Temperature of CPU 1 reported by BMC, in celsius", + "labels": { + "cpu": "1" + }, + "name": "bmc_cpu_temp_celsius", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3", + "protocol": "snmp" + } + }, + { + "description": "Temperature of CPU 2 reported by BMC, in celsius", + "labels": { + "cpu": "2" + }, + "name": "bmc_cpu_temp_celsius", + "optional_period": 5000, + "period": 0, + "scale_factor": 0.1, + "source": { + "id": "1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4", + "protocol": "snmp" + } + }, + { + "description": "Default subset of metrics from Prometheus Node Exporter", + "name": "prom_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "node_boot_time_seconds", + "node_cpu_scaling_frequency_hertz", + "node_cpu_seconds_total", + "node_filesystem_free_bytes", + "node_filesystem_size_bytes", + "node_load1", + "node_load15", + "node_load5", + "node_memory_Buffers_bytes", + "node_memory_Cached_bytes", + "node_memory_MemAvailable_bytes", + "node_memory_MemFree_bytes", + "node_memory_MemTotal_bytes", + "node_memory_Shmem_bytes", + "node_memory_SwapFree_bytes", + "node_memory_SwapTotal_bytes", + "node_network_receive_bytes_total", + "node_network_receive_packets_total", + "node_network_transmit_bytes_total", + "node_network_transmit_packets_total", + "node_procs_blocked", + "node_procs_running", + "kwollect_custom" + ], + "port": 9100, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Node Exporter", + "name": "prom_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9100, + "protocol": "prometheus" + } + }, + { + "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "DCGM_FI_DEV_SM_CLOCK", + "DCGM_FI_DEV_MEM_CLOCK", + "DCGM_FI_DEV_GPU_TEMP", + "DCGM_FI_DEV_POWER_USAGE", + "DCGM_FI_DEV_MEM_COPY_UTIL" + ], + "port": 9400, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9400, + "protocol": "prometheus" + } + } ], "model": "Dell PowerEdge R730", "priority": 201806, diff --git a/input/grid5000/sites/nancy/clusters/gratouille/gratouille_metrics.yaml b/input/grid5000/sites/nancy/clusters/gratouille/gratouille_metrics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e2862a00e50a6e5101fdcdaea2d64f46a30925c --- /dev/null +++ b/input/grid5000/sites/nancy/clusters/gratouille/gratouille_metrics.yaml @@ -0,0 +1,215 @@ +--- +metrics: + + - name: bmc_ambient_temp_celsius + description: Front node ambient temperature reported by BMC, in celsius + period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1 + + - name: bmc_node_power_watt + description: Power consumption of node reported by BMC, in watt + period: 5000 + source: + protocol: snmp + id: >- + 1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ + 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 + == System Board Pwr Consumption }} + + - name: bmc_psu_voltage_volt + labels: {"psu": "1"} + description: Voltage of PSU 1 reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.600.12.1.16.1.1 + + - name: bmc_psu_voltage_volt + labels: {"psu": "2"} + description: Voltage of PSU 2 reported by BMC, in volt + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.600.12.1.16.1.2 + + - name: bmc_psu_current_amp + labels: {"psu": "1"} + description: Current of PSU 1 reported by BMC, in amp + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: >- + 1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ + 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 == PS1 Current 1 }} + + - name: bmc_psu_current_amp + labels: {"psu": "2"} + description: Current of PSU 2 reported by BMC, in amp + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: >- + 1.3.6.1.4.1.674.10892.5.4.600.30.1.6.1.{{ + 1.3.6.1.4.1.674.10892.5.4.600.30.1.8.1 == PS2 Current 2 }} + + - name: bmc_node_power_watthour_total + description: Cumulated power consumption of node reported by BMC, in watt-hour + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.600.60.1.7.1.1 + + - name: bmc_fan_speed_rpm + labels: {"fan": "1"} + description: Speed of Fan 1 reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.1 + + - name: bmc_fan_speed_rpm + labels: {"fan": "2"} + description: Speed of Fan 2 reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.2 + + - name: bmc_fan_speed_rpm + labels: {"fan": "3"} + description: Speed of Fan 3 reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.3 + + - name: bmc_fan_speed_rpm + labels: {"fan": "4"} + description: Speed of Fan 4 reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.4 + + - name: bmc_fan_speed_rpm + labels: {"fan": "5"} + description: Speed of Fan 5 reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.5 + + - name: bmc_fan_speed_rpm + labels: {"fan": "6"} + description: Speed of Fan 6 reported by BMC, in rpm + period: 0 + optional_period: 5000 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.6 + + - name: bmc_exhaust_temp_celsius + description: Temperature of exhausted air reported by BMC, in celsius + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2 + + - name: bmc_cpu_temp_celsius + labels: {"cpu": "1"} + description: Temperature of CPU 1 reported by BMC, in celsius + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3 + + - name: bmc_cpu_temp_celsius + labels: {"cpu": "2"} + description: Temperature of CPU 2 reported by BMC, in celsius + period: 0 + optional_period: 5000 + scale_factor: 0.1 + source: + protocol: snmp + id: 1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4 + + - name: prom_default_metrics + description: Default subset of metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + id: + - node_boot_time_seconds + - node_cpu_scaling_frequency_hertz + - node_cpu_seconds_total + - node_filesystem_free_bytes + - node_filesystem_size_bytes + - node_load1 + - node_load15 + - node_load5 + - node_memory_Buffers_bytes + - node_memory_Cached_bytes + - node_memory_MemAvailable_bytes + - node_memory_MemFree_bytes + - node_memory_MemTotal_bytes + - node_memory_Shmem_bytes + - node_memory_SwapFree_bytes + - node_memory_SwapTotal_bytes + - node_network_receive_bytes_total + - node_network_receive_packets_total + - node_network_transmit_bytes_total + - node_network_transmit_packets_total + - node_procs_blocked + - node_procs_running + - kwollect_custom + + - name: prom_all_metrics + description: All metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + + - name: prom_nvgpu_default_metrics + description: Default subset of metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 + id: + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_MEM_CLOCK + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_MEM_COPY_UTIL + + - name: prom_nvgpu_all_metrics + description: All metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400