diff --git a/data/grid5000/sites/sophia/clusters/esterel24/esterel24.json b/data/grid5000/sites/sophia/clusters/esterel24/esterel24.json index 0bf2ac9216c230ab3a2a500e2ab24121c513245a..354759704c5a7c28e8470496f832fb29db480fea 100644 --- a/data/grid5000/sites/sophia/clusters/esterel24/esterel24.json +++ b/data/grid5000/sites/sophia/clusters/esterel24/esterel24.json @@ -1,6 +1,6 @@ { "boot_type": "uefi", - "created_at": "Thu, 01 Jan 1970 00:00:00 GMT", + "created_at": "Thu, 27 Mar 2025 00:00:00 GMT", "exotic": false, "kavlan": false, "manufactured_at": "1970-01-01", @@ -49,11 +49,38 @@ "port": 9100, "protocol": "prometheus" } + }, + { + "description": "Default subset of metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_default_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "id": [ + "DCGM_FI_DEV_SM_CLOCK", + "DCGM_FI_DEV_MEM_CLOCK", + "DCGM_FI_DEV_GPU_TEMP", + "DCGM_FI_DEV_POWER_USAGE", + "DCGM_FI_DEV_MEM_COPY_UTIL" + ], + "port": 9400, + "protocol": "prometheus" + } + }, + { + "description": "All metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9400, + "protocol": "prometheus" + } } ], - "model": "Cluster Model", - "nodes_count": 2, - "nodes_description": "1 CPU Unknown vendor, 72 cores/CPU, 0GB RAM, 0GB SSD, 1 x 10Gb Ethernet", + "model": "Dell PowerEdge T640", + "nodes_count": 1, + "nodes_description": "1 CPU Intel Xeon Gold 6240R, 72 cores/CPU, 0GB RAM, 0GB SSD, 1 x 10Gb Ethernet", "priority": 197001, "queues": [ "admin", diff --git a/data/grid5000/sites/sophia/clusters/esterel24/nodes/esterel24-1.json b/data/grid5000/sites/sophia/clusters/esterel24/nodes/esterel24-1.json index a895b209d135b74a75bd6cafa2c5f482e981e3a0..45229384ccdfb59f59e9a08c98ad8c89a21214b0 100644 --- a/data/grid5000/sites/sophia/clusters/esterel24/nodes/esterel24-1.json +++ b/data/grid5000/sites/sophia/clusters/esterel24/nodes/esterel24-1.json @@ -23,7 
+23,7 @@ "ram_size": 8 }, "management_tools": { - "bmc_vendor_tool": "ipmitool", + "bmc_vendor_tool": "racadm", "ipmitool": { "retries": 5 } @@ -44,7 +44,7 @@ "ip": "172.16.131.30", "ip6": "2001:660:4406:800:4::1e", "kavlan": false, - "mac": "f4:02:70:9d:75:92", + "mac": "70:b5:e8:c9:ff:d2", "management": false, "mountable": true, "mounted": true, @@ -60,7 +60,7 @@ "interface": "Ethernet", "ip": "172.17.131.30", "kavlan": false, - "mac": "f4:02:70:9d:75:a2", + "mac": "70:b5:e8:c9:ff:e2", "management": true, "mountable": false, "mounted": false, @@ -77,8 +77,8 @@ "turboboost_enabled": true }, "performance": { - "core_flops": 128, - "node_flops": 9216 + "core_flops": 76800000000, + "node_flops": 5529600000000 }, "processor": { "cache_l1": null, @@ -86,15 +86,15 @@ "cache_l1i": 8, "cache_l2": 8, "cache_l3": 8, - "clock_speed": 8, + "clock_speed": 2400000000, "ht_capable": true, "instruction_set": "x86-64", - "microarchitecture": "Haswell", + "microarchitecture": "Cascade Lake-SP", "microcode": "0xd000001", - "model": "Unknown", - "other_description": "description", - "vendor": "vendor", - "version": "vendor" + "model": "Intel Xeon", + "other_description": "Intel(R) Xeon(R) Gold 6240R CPU @ 2.60GHz", + "vendor": "Intel", + "version": "Gold 6240R" }, "redfish": true, "software": { diff --git a/data/grid5000/sites/sophia/clusters/esterel24/nodes/esterel24-2.json b/data/grid5000/sites/sophia/clusters/esterel24/nodes/esterel24-2.json deleted file mode 100644 index c715dd57cde9cb2bb4d3d37dfd8eb3f7f0265662..0000000000000000000000000000000000000000 --- a/data/grid5000/sites/sophia/clusters/esterel24/nodes/esterel24-2.json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "architecture": { - "cpu_core_numbering": "contiguous", - "nb_cores": 72, - "nb_procs": 1, - "nb_threads": 72, - "platform_type": "x86_64" - }, - "bios": { - "release_date": "01/01/2000", - "vendor": "Unknown", - "version": 1 - }, - "bmc_version": "v1", - "chassis": { - "manufactured_at": "1970-01-01", - "manufacturer": 
"Unknown", - "name": "Unknown", - "warranty_end": "1970-01-01" - }, - "exotic": false, - "main_memory": { - "ram_size": 8 - }, - "management_tools": { - "bmc_vendor_tool": "ipmitool", - "ipmitool": { - "retries": 5 - } - }, - "memory_devices": [ - { - "device": "dimm_proc 1 dimm 1", - "size": 8, - "technology": "dram" - } - ], - "network_adapters": [ - { - "device": "eth0", - "driver": "mlx_core", - "enabled": true, - "interface": "Ethernet", - "ip": "172.16.131.31", - "ip6": "2001:660:4406:800:4::1f", - "kavlan": false, - "mac": "70:b5:e8:c9:ff:d2", - "management": false, - "mountable": true, - "mounted": true, - "name": "enp1s0f0np0", - "network_address": "esterel24-2.sophia.grid5000.fr", - "rate": 10000000000, - "switch": null, - "switch_port": null - }, - { - "device": "bmc", - "enabled": true, - "interface": "Ethernet", - "ip": "172.17.131.31", - "kavlan": false, - "mac": "70:b5:e8:c9:ff:e2", - "management": true, - "mountable": false, - "mounted": false, - "network_address": "esterel24-2-bmc.sophia.grid5000.fr" - } - ], - "nodeset": "esterel24", - "operating_system": { - "cstate_driver": "unknown", - "cstate_governor": "unknown", - "ht_enabled": true, - "pstate_driver": "unknwon", - "pstate_governor": "unknown", - "turboboost_enabled": true - }, - "performance": { - "core_flops": 128, - "node_flops": 9216 - }, - "processor": { - "cache_l1": null, - "cache_l1d": 8, - "cache_l1i": 8, - "cache_l2": 8, - "cache_l3": 8, - "clock_speed": 8, - "ht_capable": true, - "instruction_set": "x86-64", - "microarchitecture": "Haswell", - "microcode": "0xd000001", - "model": "Unknown", - "other_description": "description", - "vendor": "vendor", - "version": "vendor" - }, - "redfish": true, - "software": { - "forced-deployment-timestamp": 202007300948, - "postinstall-version": "1.2025032008", - "standard-environment": "debian11-x64-std" - }, - "storage_devices": [ - { - "by_id": "", - "by_path": "/dev/disk/by-path/dummy", - "id": "disk0", - "interface": "SAS", - "model": 
"unknown", - "size": 8, - "storage": "SSD", - "vendor": "Unknown" - } - ], - "supported_job_types": { - "besteffort": true, - "deploy": true, - "max_walltime": 0, - "queues": [ - "admin", - "testing" - ] - }, - "type": "node", - "uid": "esterel24-2" -} \ No newline at end of file diff --git a/input/grid5000/sites/sophia/clusters/esterel24/esterel24.yaml b/input/grid5000/sites/sophia/clusters/esterel24/esterel24.yaml index 09454b75df5688ca56836241bc5652ae8acf3850..40e03b7b002c1be54cf18d06c88b505a278845d5 100644 --- a/input/grid5000/sites/sophia/clusters/esterel24/esterel24.yaml +++ b/input/grid5000/sites/sophia/clusters/esterel24/esterel24.yaml @@ -1,24 +1,24 @@ --- -model: Cluster Model # TODO: change this value. -created_at: 1970-01-01 # TODO: change this value +model: Dell PowerEdge T640 +created_at: 2025-03-27 kavlan: false -boot_type: uefi # TODO: specify if 'uefi' (ideally) or 'bios' (legacy, if no other choice) -exotic: false # TODO: specify if 'true' or 'false' +boot_type: uefi +exotic: false queues: - admin - testing nodes: - esterel24-[1-2]: + esterel24-1: chassis: - manufactured_at: 1970-01-01 # TODO: if not Dell vendor, put date. if Dell, use rake gen:dell-product-data after g5k-checks import, and remove this line. - warranty_end: 1970-01-01 # TODO: if not Dell vendor, put date. if Dell, use rake gen:dell-product-data after g5k-checks import, and remove this line. + manufactured_at: 1970-01-01 + warranty_end: 1970-01-01 supported_job_types: deploy: true besteffort: true max_walltime: 0 processor: - microarchitecture: Haswell # TODO: replace with microarch name. - clock_speed: 8 # TODO: Replace with clock speed. + microarchitecture: Cascade Lake-SP + clock_speed: 2_400_000_000 network_adapters: bmc: interface: Ethernet @@ -35,7 +35,7 @@ nodes: interface: SAS by_path: "/dev/disk/by-path/dummy" # this path will have to change later. 
software: - standard-environment: debian11-x64-std # TODO: check that architecture is OK + standard-environment: debian11-x64-std management_tools: - bmc_vendor_tool: ipmitool # TODO: replace with bmc_vendor_tool (ipmitool, racadm) + bmc_vendor_tool: racadm nodeset: esterel24 diff --git a/input/grid5000/sites/sophia/clusters/esterel24/esterel24_metrics.yaml b/input/grid5000/sites/sophia/clusters/esterel24/esterel24_metrics.yaml index 729e9beb582834908e55a7537bd5ace8abbd1f3d..fc4d71df1e408960d315eb21daa66a5e6cfdae27 100644 --- a/input/grid5000/sites/sophia/clusters/esterel24/esterel24_metrics.yaml +++ b/input/grid5000/sites/sophia/clusters/esterel24/esterel24_metrics.yaml @@ -38,3 +38,25 @@ metrics: source: protocol: prometheus port: 9100 + + - name: prom_nvgpu_default_metrics + description: Default subset of metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 + id: + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_MEM_CLOCK + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_MEM_COPY_UTIL + + - name: prom_nvgpu_all_metrics + description: All metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 \ No newline at end of file diff --git a/input/grid5000/sites/sophia/clusters/esterel24/nodes.yaml.erb b/input/grid5000/sites/sophia/clusters/esterel24/nodes.yaml.erb index 983cbe9a325db21696ee8ebd28c7361f7118c4c5..7a1726c538cf4324e963bcd6fce145fd73e3260b 100644 --- a/input/grid5000/sites/sophia/clusters/esterel24/nodes.yaml.erb +++ b/input/grid5000/sites/sophia/clusters/esterel24/nodes.yaml.erb @@ -3,14 +3,12 @@ # If changes are needed, it might be better to edit the source data and regenerate using 'rake mass:create' cluster_name = "esterel24" - nodes_number = 2 # Size of the cluster (number of nodes) + nodes_number = 1 # Size of the cluster (number of nodes) # MAC addresses declaration mac_eth0_list = %w( 
-f4:02:70:9d:75:92 70:b5:e8:c9:ff:d2 ) mac_bmc_list = %w( -f4:02:70:9d:75:a2 70:b5:e8:c9:ff:e2 ) %> @@ -39,10 +37,10 @@ nodes: size: 8 # Fake size, will be replaced by g5k-checks technology: dram # Common memory technology, will be replaced by g5k-checks processor: - model: Unknown # Fake model name, will be replaced by g5k-checks - other_description: description # Fake description, will be replaced by g5k-checks - vendor: vendor # Fake vendor, will be replaced by g5k-checks - version: vendor # Fake version, will be replaced by g5k-checks + model: Intel Xeon # Provisional model name, will be replaced by g5k-checks + other_description: Intel(R) Xeon(R) Gold 6240R CPU @ 2.60GHz # Provisional description, will be replaced by g5k-checks + vendor: Intel # Provisional vendor, will be replaced by g5k-checks + version: Gold 6240R # Provisional version, will be replaced by g5k-checks cache_l1d: 8 # Fake cache, will be replaced by g5k-checks cache_l1i: 8 # Fake cache, will be replaced by g5k-checks cache_l2: 8 # Fake cache, will be replaced by g5k-checks