diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/montcalm.json b/data/grid5000/sites/toulouse/clusters/montcalm/montcalm.json index ff54a74aaa5e8e1628e72861be30bdbfb92cc120..276547dc98189bf9fe8791a98594055a9fd85977 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/montcalm.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/montcalm.json @@ -4,11 +4,84 @@ "exotic": false, "kavlan": false, "metrics": [ - + { + "description": "Default subset of metrics from Prometheus Node Exporter", + "name": "prom_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "node_boot_time_seconds", + "node_cpu_scaling_frequency_hertz", + "node_cpu_seconds_total", + "node_filesystem_free_bytes", + "node_filesystem_size_bytes", + "node_load1", + "node_load15", + "node_load5", + "node_memory_Buffers_bytes", + "node_memory_Cached_bytes", + "node_memory_MemAvailable_bytes", + "node_memory_MemFree_bytes", + "node_memory_MemTotal_bytes", + "node_memory_Shmem_bytes", + "node_memory_SwapFree_bytes", + "node_memory_SwapTotal_bytes", + "node_network_receive_bytes_total", + "node_network_receive_packets_total", + "node_network_transmit_bytes_total", + "node_network_transmit_packets_total", + "node_procs_blocked", + "node_procs_running", + "kwollect_custom" + ], + "port": 9100, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Node Exporter", + "name": "prom_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9100, + "protocol": "prometheus" + } + }, + { + "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "DCGM_FI_DEV_SM_CLOCK", + "DCGM_FI_DEV_MEM_CLOCK", + "DCGM_FI_DEV_GPU_TEMP", + "DCGM_FI_DEV_POWER_USAGE", + "DCGM_FI_DEV_GPU_UTIL", + "DCGM_FI_DEV_MEM_COPY_UTIL" + ], + "port": 9400, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9400, + "protocol": "prometheus" + } + } ], "model": "HPE Proliant DL360 Gen10+", "queues": [ - "admin" + "admin", + "testing" ], "type": "cluster", "uid": "montcalm" diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-1.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-1.json index 1a07a3a40cf5b570a054cdd1b3f8b10f831cbdab..43895f08f43eea0c503851254f130a6d4c7c09a8 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-1.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-1.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-2.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-2.json index 392996647f0a49773728ebcdab39582c60b2114d..955973fbb3271585be2e712ba1c0d1d7d1e9e9d9 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-2.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-2.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-3.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-3.json index 6746cd26f8cfc187390023d275a19662fec81ff8..fa4f15d39b81fe45c7981ff7ca5cf48802ffbcc9 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-3.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-3.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-4.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-4.json index dcf0ee836a2f34ad9195adda84b7b89b447a7d81..f5e946568348d82ea7f0a83bbbeeca975f32cdd1 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-4.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-4.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-5.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-5.json index 13a47413886c38aefa403cd1c7f8b594c92f1c50..dd4675d06442ebe3a2b22f27fd8534941e8591b7 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-5.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-5.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-6.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-6.json index 59abbf52b5ba98c832ea5d417c4e941180d6ea47..fb7d90117313ab805d6b7febe72926511715a29b 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-6.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-6.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-7.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-7.json index b03fb35fb72f4c7e0f5fc30b80273ff7d03e54e6..43fcb67732391bdd688bd566319df0765811e027 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-7.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-7.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-8.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-8.json index ce3518cbc0879dc766b04dfbe5c24dd811e94091..21ce0e3e3e3c831500788676c373df221f5ff523 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-8.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-8.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-9.json b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-9.json index 423d1b4feba466a07d401f1183cdb58f07de3a6f..29c0bca59cca1942d808420b057e6f8c3de51f12 100644 --- a/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-9.json +++ b/data/grid5000/sites/toulouse/clusters/montcalm/nodes/montcalm-9.json @@ -174,7 +174,8 @@ "deploy": true, "max_walltime": 0, "queues": [ - "admin" + "admin", + "testing" ], "virtual": "ivt" }, diff --git a/data/grid5000/sites/toulouse/servers/srv-data.json b/data/grid5000/sites/toulouse/servers/srv-data.json index d43566891959e99809b3007cb0a678d94d5397e9..12d4a5587c81e57870b021b4e24146a84430c2a9 100644 --- a/data/grid5000/sites/toulouse/servers/srv-data.json +++ b/data/grid5000/sites/toulouse/servers/srv-data.json @@ -2,7 +2,8 @@ "alias": [ "nfs", "home-g5k", - "modules" + "modules", + "public" ], "kind": "physical", "network_adapters": { diff --git a/input/grid5000/sites/toulouse/clusters/montcalm/montcalm.yaml b/input/grid5000/sites/toulouse/clusters/montcalm/montcalm.yaml index be5de43a6a3102aab1e867d9e8e7feb7825b42de..d715dc3deeeac75c4163c59cc38da9b3a19f4559 100644 --- a/input/grid5000/sites/toulouse/clusters/montcalm/montcalm.yaml +++ b/input/grid5000/sites/toulouse/clusters/montcalm/montcalm.yaml @@ -5,6 +5,7 @@ boot_type: uefi exotic: false queues: - admin + - testing nodes: montcalm-[1-9]: supported_job_types: diff --git a/input/grid5000/sites/toulouse/clusters/montcalm/montcalm_metrics.yaml b/input/grid5000/sites/toulouse/clusters/montcalm/montcalm_metrics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ddd15665712b9f95932408c84bcffe1d1e242272 --- /dev/null +++ b/input/grid5000/sites/toulouse/clusters/montcalm/montcalm_metrics.yaml @@ -0,0 +1,65 @@ +--- +metrics: + - name: prom_default_metrics + description: Default subset of metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + id: + - node_boot_time_seconds + - node_cpu_scaling_frequency_hertz + - node_cpu_seconds_total + - node_filesystem_free_bytes + - node_filesystem_size_bytes + - node_load1 + - node_load15 + - node_load5 + - node_memory_Buffers_bytes + - node_memory_Cached_bytes + - node_memory_MemAvailable_bytes + - node_memory_MemFree_bytes + - node_memory_MemTotal_bytes + - node_memory_Shmem_bytes + - node_memory_SwapFree_bytes + - node_memory_SwapTotal_bytes + - node_network_receive_bytes_total + - node_network_receive_packets_total + - node_network_transmit_bytes_total + - node_network_transmit_packets_total + - node_procs_blocked + - node_procs_running + - kwollect_custom + + - name: prom_all_metrics + description: All metrics from Prometheus Node Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9100 + + - name: prom_nvgpu_default_metrics + description: Default subset of metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 + id: + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_MEM_CLOCK + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_MEM_COPY_UTIL + + - name: prom_nvgpu_all_metrics + description: All metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 + diff --git a/input/grid5000/sites/toulouse/servers/physical.yaml b/input/grid5000/sites/toulouse/servers/physical.yaml index 20867a2c7b5a35552965ca1813053adf31d9bec2..df9b54cba40baefd909829e17e973ef4e6693ed7 100644 --- a/input/grid5000/sites/toulouse/servers/physical.yaml +++ b/input/grid5000/sites/toulouse/servers/physical.yaml @@ -29,7 +29,7 @@ pve-2: srv-data: kind: physical serial: CZ22230FL8 - alias: [nfs, home-g5k, modules] + alias: [nfs, home-g5k, modules, public] network_adapters: bmc: ip: 172.17.126.3