From c05142ebec7b79e3340fe23eba94fd0d0e886dc1 Mon Sep 17 00:00:00 2001 From: Pierre Jacquot <pierre.jacquot@inria.fr> Date: Thu, 19 Dec 2024 16:19:00 +0100 Subject: [PATCH 01/11] [rennes][abacus26] First description of abacus26 cluster --- .../rennes/clusters/abacus26/abacus26.json | 19 +++ .../clusters/abacus26/nodes/abacus26-1.json | 128 ++++++++++++++++++ .../rennes/clusters/abacus26/abacus26.yaml | 40 ++++++ .../rennes/clusters/abacus26/nodes.yaml.erb | 82 +++++++++++ 4 files changed, 269 insertions(+) create mode 100644 data/grid5000/sites/rennes/clusters/abacus26/abacus26.json create mode 100644 data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json create mode 100644 input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml create mode 100644 input/grid5000/sites/rennes/clusters/abacus26/nodes.yaml.erb diff --git a/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json b/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json new file mode 100644 index 00000000000..c4ae7d6eeef --- /dev/null +++ b/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json @@ -0,0 +1,19 @@ +{ + "boot_type": "uefi", + "created_at": "Thu, 01 Jan 1970 00:00:00 GMT", + "exotic": false, + "kavlan": false, + "manufactured_at": "1970-01-01", + "metrics": [ + + ], + "model": "HPE ProLiant DL380 Gen11", + "queues": [ + "admin", + "testing" + ], + "redfish": true, + "type": "cluster", + "uid": "abacus26", + "warranty_end": "1970-01-01" +} \ No newline at end of file diff --git a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json new file mode 100644 index 00000000000..f48ba7fcc2a --- /dev/null +++ b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json @@ -0,0 +1,128 @@ +{ + "architecture": { + "cpu_core_numbering": "contiguous", + "nb_cores": 72, + "nb_procs": 1, + "nb_threads": 72, + "platform_type": "x86_64" + }, + "bios": { + "release_date": "01/01/2000", + "vendor": "Unknown", + "version": 1 + }, + "bmc_version": "v1", + "chassis": { + "manufactured_at": "1970-01-01", + "manufacturer": "Unknown", + "name": "Unknown", + "warranty_end": "1970-01-01" + }, + "exotic": false, + "main_memory": { + "ram_size": 8 + }, + "management_tools": { + "bmc_vendor_tool": "ipmitool", + "ipmitool": { + "retries": 5 + } + }, + "memory_devices": [ + { + "device": "dimm_proc 1 dimm 1", + "size": 8, + "technology": "dram" + } + ], + "network_adapters": [ + { + "device": "eth0", + "driver": "mlx_core", + "enabled": true, + "interface": "Ethernet", + "ip": "172.16.109.29", + "ip6": "2001:660:4406:700:e::1d", + "kavlan": false, + "mac": "d4:04:e6:b7:7c:d6", + "management": false, + "mountable": true, + "mounted": true, + "name": "enp1s0f0np0", + "network_address": "abacus26-1.rennes.grid5000.fr", + "rate": 10000000000, + "switch": null, + "switch_port": null + }, + { + "device": "bmc", + "enabled": true, + "interface": "Ethernet", + "ip": "172.17.109.29", + "kavlan": false, + "mac": "5c:ed:8c:f6:a4:0c", + "management": true, + "mountable": false, + "mounted": false, + "network_address": "abacus26-1-bmc.rennes.grid5000.fr" + } + ], + "nodeset": "abacus-26", + "operating_system": { + "cstate_driver": "unknown", + "cstate_governor": "unknown", + "ht_enabled": true, + "pstate_driver": "unknwon", + "pstate_governor": "unknown", + "turboboost_enabled": true + }, + "performance": { + "core_flops": 128, + "node_flops": 9216 + }, + "processor": { + "cache_l1": null, + "cache_l1d": 8, + "cache_l1i": 8, + "cache_l2": 8, + "cache_l3": 8, + "clock_speed": 8, + "ht_capable": true, + "instruction_set": "x86-64", + "microarchitecture": "Haswell", + "microcode": "0xd000001", + "model": "Unknown", + "other_description": "description", + "vendor": "vendor", + "version": "vendor" + }, + "redfish": true, + "software": { + "forced-deployment-timestamp": 202007300948, + "postinstall-version": "1.2024101411", + "standard-environment": "debian11-x64-std" + }, + "storage_devices": [ + { + "by_id": "", + "by_path": "/dev/disk/by-path/dummy", + "id": "disk0", + "interface": "SAS", + "model": "unknown", + "size": 8, + "storage": "SSD", + "vendor": "Unknown" + } + ], + "supported_job_types": { + "besteffort": true, + "deploy": true, + "max_walltime": 0, + "queues": [ + "admin", + "testing" + ] + }, + "type": "node", + "uid": "abacus26-1" +} \ No newline at end of file diff --git a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml new file mode 100644 index 00000000000..26de8193208 --- /dev/null +++ b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml @@ -0,0 +1,40 @@ +--- +model: HPE ProLiant DL380 Gen11 +created_at: 1970-01-01 # TODO: change this value +kavlan: false +boot_type: uefi +exotic: false +queues: + - admin + - testing +nodes: + abacus26-[1-1]: + chassis: + manufactured_at: 1970-01-01 # TODO: Put date. + warranty_end: 1970-01-01 # TODO: Put date. + supported_job_types: + deploy: true + besteffort: true + max_walltime: 0 + processor: + microarchitecture: Haswell # TODO: replace with microarch name. + clock_speed: 8 # TODO: Replace with clock speed. + network_adapters: + bmc: + enabled: true + mountable: false + mounted: false + eth0: + enabled: true + mountable: true + mounted: true + storage_devices: + disk0: # This field will have to be renamed later. + id: disk0 + interface: SAS + by_path: "/dev/disk/by-path/dummy" # this path will have to change later. + software: + standard-environment: debian11-x64-std # TODO: check that architecture is OK + management_tools: + bmc_vendor_tool: ipmitool + nodeset: abacus-26 diff --git a/input/grid5000/sites/rennes/clusters/abacus26/nodes.yaml.erb b/input/grid5000/sites/rennes/clusters/abacus26/nodes.yaml.erb new file mode 100644 index 00000000000..a7b1ddf305d --- /dev/null +++ b/input/grid5000/sites/rennes/clusters/abacus26/nodes.yaml.erb @@ -0,0 +1,82 @@ +<% + CLUSTER_NAME = "abacus26" + NODES_NUMBER = 1 + SITE_IPV4_INDEX = 13 + # MAC addresses declaration + MAC_ETH0_LIST = %w( + d4:04:e6:b7:7c:d6 + ) + + MAC_BMC_LIST = %w( + 5c:ed:8c:f6:a4:0c + ) +%> +--- +nodes: +<% (1..NODES_NUMBER).each { |i| %> + <%= CLUSTER_NAME %>-<%= i %>: + architecture: + nb_procs: 1 + nb_cores: 72 + nb_threads: 72 + platform_type: x86_64 + cpu_core_numbering: contiguous + bios: + release_date: 01/01/2000 # Fake date, will be replaced by g5k-checks + vendor: Unknown # Fake vendor, will be replaced by g5k-checks + version: 1 # Fake version, will be replaced by g5k-checks + bmc_version: v1 # Fake version, will be replaced by g5k-checks + chassis: + manufacturer: Unknown # Fake manufacturer, will be replaced by g5k-checks + name: Unknown # Fake name, will be replaced by g5k-checks + main_memory: + ram_size: 8 # Fake size, will be replaced by g5k-checks + memory_devices: + dimm: + size: 8 # Fake size, will be replaced by g5k-checks + technology: dram # Common memory technology, will be replaced by g5k-checks + processor: + model: Unknown # Fake model name, will be replaced by g5k-checks + other_description: description # Fake description, will be replaced by g5k-checks + vendor: vendor # Fake vendor, will be replaced by g5k-checks + version: vendor # Fake version, will be replaced by g5k-checks + cache_l1d: 8 # Fake cache, will be replaced by g5k-checks + cache_l1i: 8 # Fake cache, will be replaced by g5k-checks + cache_l2: 8 # Fake cache, will be replaced by g5k-checks + cache_l3: 8 # Fake cache, will be replaced by g5k-checks + instruction_set: x86-64 # Common instruction set, will be replaced by g5k-checks + microcode: "0xd000001" # Fake microcode, will be replaced by g5k-checks + ht_capable: true # Default ht capable value, will be replaced by g5k-checks + main_memory: + ram_size: 8 # Fake ram size, will be replaced by g5k-checks + memory_devices: + dimm_proc 1 dimm 1: # Fake dimm name, will be replaced by g5k-checks + size: 8 # Fake dimm size, will be replaced by g5k-checks + technology: dram # Default dimm technology, will be replaced by g5k-checks + operating_system: + cstate_driver: unknown # Fake driver, will be replaced by g5k-checks + cstate_governor: unknown # Fake governor, will be replaced by g5k-checks + ht_enabled: true # common value for hyper threading, will be replaced by g5k-checks + pstate_driver: unknwon # Fake driver, will be replaced by g5k-checks + pstate_governor: unknown # Fake driver, will be replaced by g5k-checks + turboboost_enabled: true # Default value for turboboost, will be replaced by g5k-checks + network_adapters: + bmc: + interface: Ethernet + management: true + mac: <%= MAC_BMC_LIST[i - 1] %> + ip: 172.17.109.29 + eth0: + interface: Ethernet + management: false + driver: mlx_core + name: enp1s0f0np0 + rate: 10000000000 + mac: <%= MAC_ETH0_LIST[i - 1] %> + ip: 172.16.109.29 + storage_devices: + disk0: + storage: SSD # Common value, will be replaced by g5k-checks + model: unknown # fake value, + size: 8 # Fake value, will be replaced by g5k-checks +<% } %> -- GitLab From cc5ec740125da0024b055f12f9692e069ec3b65c Mon Sep 17 00:00:00 2001 From: Pierre Jacquot <pierre.jacquot@inria.fr> Date: Thu, 19 Dec 2024 16:16:48 +0100 Subject: [PATCH 02/11] [rennes][abacus26] Update disk PCI path --- .../sites/rennes/clusters/abacus26/nodes/abacus26-1.json | 2 +- input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json index f48ba7fcc2a..8fc128db384 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json @@ -105,7 +105,7 @@ "storage_devices": [ { "by_id": "", - "by_path": "/dev/disk/by-path/dummy", + "by_path": "/dev/disk/by-path/pci-0000:e1:00.0-nvme-1", "id": "disk0", "interface": "SAS", "model": "unknown", diff --git a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml index 26de8193208..e110a8f850a 100644 --- a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml +++ b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml @@ -32,7 +32,7 @@ nodes: disk0: # This field will have to be renamed later. id: disk0 interface: SAS - by_path: "/dev/disk/by-path/dummy" # this path will have to change later. + by_path: "/dev/disk/by-path/pci-0000:e1:00.0-nvme-1" software: standard-environment: debian11-x64-std # TODO: check that architecture is OK management_tools: -- GitLab From bd16bf0132231723dc3194c2004eb0e797b903f6 Mon Sep 17 00:00:00 2001 From: PENVEN Loris <loris.penven@irisa.fr> Date: Wed, 8 Jan 2025 15:06:13 +0100 Subject: [PATCH 03/11] [rennes][abacus26] Add predictable newtork interface mapping --- lib/refrepo/net_names_mapping.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/refrepo/net_names_mapping.yaml b/lib/refrepo/net_names_mapping.yaml index 04008ef2757..847e3df6f1d 100644 --- a/lib/refrepo/net_names_mapping.yaml +++ b/lib/refrepo/net_names_mapping.yaml @@ -74,6 +74,9 @@ abacus22: abacus25: ens10f0: eth0 ens10f1: eth1 +abacus26: + enp1s0f0np0: eth0 + ens15f1np1: eth1 abacus27: enp1s0f0np0: eth0 ens22f1np1: eth1 -- GitLab From ddc4c340793be54ae4622ed257734ea610c7cec2 Mon Sep 17 00:00:00 2001 From: PENVEN Loris <loris.penven@irisa.fr> Date: Wed, 8 Jan 2025 16:00:01 +0100 Subject: [PATCH 04/11] [rennes][abacus26] G5k-checks import --- .../clusters/abacus26/nodes/abacus26-1.yaml | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml diff --git a/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml b/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml new file mode 100644 index 00000000000..fc3e6a82dec --- /dev/null +++ b/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml @@ -0,0 +1,118 @@ +# Generated by g5k-checks (g5k-checks -m api) +--- +abacus26-1: + architecture: + cpu_core_numbering: contiguous + nb_cores: 48 + nb_procs: 2 + nb_threads: 96 + platform_type: x86_64 + bios: + release_date: 06/19/2024 + vendor: HPE + version: 2.22 + bmc_version: '1.59' + chassis: + manufacturer: HPE + name: ProLiant DL380 Gen11 + serial: CZ2D1P0F51 + gpu_devices: + nvidia0: + cpu_affinity: 0 + device: "/dev/nvidia0" + memory: 48305799168 + model: L40S + power_default_limit: 350.00 W + vbios_version: 95.02.66.00.11 + vendor: Nvidia + nvidia1: + cpu_affinity: 1 + device: "/dev/nvidia1" + memory: 48305799168 + model: L40S + power_default_limit: 350.00 W + vbios_version: 95.02.66.00.11 + vendor: Nvidia + main_memory: + ram_size: 274877906944 + memory_devices: + dimm_proc 1 dimm 10: + size: 34359738368 + technology: dram + dimm_proc 1 dimm 14: + size: 34359738368 + technology: dram + dimm_proc 1 dimm 3: + size: 34359738368 + technology: dram + dimm_proc 1 dimm 7: + size: 34359738368 + technology: dram + dimm_proc 2 dimm 10: + size: 34359738368 + technology: dram + dimm_proc 2 dimm 14: + size: 34359738368 + technology: dram + dimm_proc 2 dimm 3: + size: 34359738368 + technology: dram + dimm_proc 2 dimm 7: + size: 34359738368 + technology: dram + network_adapters: + bmc: + ip: 172.17.109.29 + mac: 5c:ed:8c:f6:a4:0c + management: true + eth0: + driver: bnxt_en + firmware_version: 228.0.116.0/pkg 228.1.111.0 + interface: Ethernet + ip: 172.16.109.29 + mac: d4:04:e6:b7:7c:d6 + management: false + model: BCM57414 NetXtreme-E 10Gb/25Gb RDMA Ethernet Controller + name: enp1s0f0np0 + rate: 10000000000 + sriov: false + sriov_totalvfs: 0 + vendor: Broadcom Inc. and subsidiaries + eth1: + driver: bnxt_en + firmware_version: 228.0.116.0/pkg 228.1.111.0 + interface: Ethernet + mac: d4:04:e6:b7:7c:d7 + management: false + model: BCM57414 NetXtreme-E 10Gb/25Gb RDMA Ethernet Controller + name: ens15f1np1 + sriov: false + sriov_totalvfs: 0 + vendor: Broadcom Inc. and subsidiaries + operating_system: + cstate_driver: intel_idle + cstate_governor: menu + ht_enabled: true + pstate_driver: intel_pstate + pstate_governor: performance + turboboost_enabled: true + processor: + cache_l1d: 49152 + cache_l1i: 32768 + cache_l2: 2097152 + cache_l3: 62914560 + ht_capable: true + instruction_set: x86-64 + microcode: '0x2b0005d2' + model: Intel Xeon + other_description: Intel(R) Xeon(R) Gold 6442Y + vendor: Intel + storage_devices: + pci-0000:e1:00.0-nvme-1: + by_id: "/dev/disk/by-id/nvme-eui.37305430584073080025384e00000002" + by_path: "/dev/disk/by-path/pci-0000:e1:00.0-nvme-1" + model: MO001600KYDMU + size: 1600321314816 + storage: SSD + supported_job_types: + virtual: ivt -- GitLab From c8f51b9486dafbf9a3cfb786924a74197f801710 Mon Sep 17 00:00:00 2001 From: PENVEN Loris <loris.penven@irisa.fr> Date: Wed, 8 Jan 2025 16:30:09 +0100 Subject: [PATCH 05/11] [rennes][abacus26] modified disk name in cluster yaml --- .../clusters/abacus26/nodes/abacus26-1.json | 162 ++++++++++++++---- .../rennes/clusters/abacus26/abacus26.yaml | 7 +- .../rennes/clusters/abacus26/nodes.yaml.erb | 82 --------- 3 files changed, 129 insertions(+), 122 deletions(-) delete mode 100644 input/grid5000/sites/rennes/clusters/abacus26/nodes.yaml.erb diff --git a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json index 8fc128db384..e3a2cec2b19 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json @@ -1,26 +1,63 @@ { "architecture": { "cpu_core_numbering": "contiguous", - "nb_cores": 72, - "nb_procs": 1, - "nb_threads": 72, + "nb_cores": 48, + "nb_procs": 2, + "nb_threads": 96, "platform_type": "x86_64" }, "bios": { - "release_date": "01/01/2000", - "vendor": "Unknown", - "version": 1 + "release_date": "06/19/2024", + "vendor": "HPE", + "version": 2.22 }, - "bmc_version": "v1", + "bmc_version": "1.59", "chassis": { "manufactured_at": "1970-01-01", - "manufacturer": "Unknown", - "name": "Unknown", + "manufacturer": "HPE", + "name": "ProLiant DL380 Gen11", + "serial": "CZ2D1P0F51", "warranty_end": "1970-01-01" }, "exotic": false, + "gpu_devices": { + "nvidia0": { + "compute_capability": "8.9", + "cores": 18176, + "cpu_affinity": 0, + "device": "/dev/nvidia0", + "memory": 48305799168, + "microarchitecture": "Ada Lovelace", + "model": "L40S", + "performance": { + "fp-16": 91610000000000, + "fp-32": 91610000000000, + "fp-64": 1431000000000 + }, + "power_default_limit": "350.00 W", + "vbios_version": "95.02.66.00.11", + "vendor": "Nvidia" + }, + "nvidia1": { + "compute_capability": "8.9", + "cores": 18176, + "cpu_affinity": 1, + "device": "/dev/nvidia1", + "memory": 48305799168, + "microarchitecture": "Ada Lovelace", + "model": "L40S", + "performance": { + "fp-16": 91610000000000, + "fp-32": 91610000000000, + "fp-64": 1431000000000 + }, + "power_default_limit": "350.00 W", + "vbios_version": "95.02.66.00.11", + "vendor": "Nvidia" + } + }, "main_memory": { - "ram_size": 8 + "ram_size": 274877906944 }, "management_tools": { "bmc_vendor_tool": "ipmitool", @@ -30,34 +67,87 @@ }, "memory_devices": [ { - "device": "dimm_proc 1 dimm 1", - "size": 8, + "device": "dimm_proc 1 dimm 10", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_proc 1 dimm 14", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_proc 1 dimm 3", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_proc 1 dimm 7", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_proc 2 dimm 10", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_proc 2 dimm 14", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_proc 2 dimm 3", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_proc 2 dimm 7", + "size": 34359738368, "technology": "dram" } ], "network_adapters": [ { "device": "eth0", - "driver": "mlx_core", + "driver": "bnxt_en", "enabled": true, + "firmware_version": "228.0.116.0/pkg 228.1.111.0", "interface": "Ethernet", "ip": "172.16.109.29", "ip6": "2001:660:4406:700:e::1d", "kavlan": false, "mac": "d4:04:e6:b7:7c:d6", "management": false, + "model": "BCM57414 NetXtreme-E 10Gb/25Gb RDMA Ethernet Controller", "mountable": true, "mounted": true, "name": "enp1s0f0np0", "network_address": "abacus26-1.rennes.grid5000.fr", "rate": 10000000000, + "sriov": false, + "sriov_totalvfs": 0, "switch": null, - "switch_port": null + "switch_port": null, + "vendor": "Broadcom Inc. and subsidiaries" + }, + { + "device": "eth1", + "driver": "bnxt_en", + "firmware_version": "228.0.116.0/pkg 228.1.111.0", + "interface": "Ethernet", + "kavlan": false, + "mac": "d4:04:e6:b7:7c:d7", + "management": false, + "model": "BCM57414 NetXtreme-E 10Gb/25Gb RDMA Ethernet Controller", + "name": "ens15f1np1", + "sriov": false, + "sriov_totalvfs": 0, + "vendor": "Broadcom Inc. and subsidiaries" }, { "device": "bmc", "enabled": true, - "interface": "Ethernet", "ip": "172.17.109.29", "kavlan": false, "mac": "5c:ed:8c:f6:a4:0c", @@ -69,49 +159,48 @@ ], "nodeset": "abacus-26", "operating_system": { - "cstate_driver": "unknown", - "cstate_governor": "unknown", + "cstate_driver": "intel_idle", + "cstate_governor": "menu", "ht_enabled": true, - "pstate_driver": "unknwon", - "pstate_governor": "unknown", + "pstate_driver": "intel_pstate", + "pstate_governor": "performance", "turboboost_enabled": true }, "performance": { "core_flops": 128, - "node_flops": 9216 + "node_flops": 6144 }, "processor": { "cache_l1": null, - "cache_l1d": 8, - "cache_l1i": 8, - "cache_l2": 8, - "cache_l3": 8, + "cache_l1d": 49152, + "cache_l1i": 32768, + "cache_l2": 2097152, + "cache_l3": 62914560, "clock_speed": 8, "ht_capable": true, "instruction_set": "x86-64", "microarchitecture": "Haswell", - "microcode": "0xd000001", - "model": "Unknown", - "other_description": "description", - "vendor": "vendor", - "version": "vendor" + "microcode": "0x2b0005d2", + "model": "Intel Xeon", + "other_description": "Intel(R) Xeon(R) Gold 6442Y", + "vendor": "Intel" }, "redfish": true, "software": { "forced-deployment-timestamp": 202007300948, - "postinstall-version": "1.2024101411", + "postinstall-version": "1.2024112508", "standard-environment": "debian11-x64-std" }, "storage_devices": [ { - "by_id": "", + "by_id": "/dev/disk/by-id/nvme-eui.37305430584073080025384e00000002", "by_path": "/dev/disk/by-path/pci-0000:e1:00.0-nvme-1", "id": "disk0", - "interface": "SAS", - "model": "unknown", - "size": 8, + "interface": "NVME", + "model": "MO001600KYDMU", + "size": 1600321314816, "storage": "SSD", - "vendor": "Unknown" + "vendor": "Samsung" } ], "supported_job_types": { @@ -121,7 +210,8 @@ "queues": [ "admin", "testing" - ] + ], + "virtual": "ivt" }, "type": "node", "uid": "abacus26-1" diff --git a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml index e110a8f850a..26505808195 100644 --- a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml +++ b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml @@ -29,10 +29,9 @@ nodes: mountable: true mounted: true storage_devices: - disk0: # This field will have to be renamed later. - id: disk0 - interface: SAS - by_path: "/dev/disk/by-path/pci-0000:e1:00.0-nvme-1" + pci-0000:e1:00.0-nvme-1: + id: disk0 + interface: NVME software: standard-environment: debian11-x64-std # TODO: check that architecture is OK management_tools: diff --git a/input/grid5000/sites/rennes/clusters/abacus26/nodes.yaml.erb b/input/grid5000/sites/rennes/clusters/abacus26/nodes.yaml.erb deleted file mode 100644 index a7b1ddf305d..00000000000 --- a/input/grid5000/sites/rennes/clusters/abacus26/nodes.yaml.erb +++ /dev/null @@ -1,82 +0,0 @@ -<% - CLUSTER_NAME = "abacus26" - NODES_NUMBER = 1 - SITE_IPV4_INDEX = 13 - # MAC addresses declaration - MAC_ETH0_LIST = %w( - d4:04:e6:b7:7c:d6 - ) - - MAC_BMC_LIST = %w( - 5c:ed:8c:f6:a4:0c - ) -%> ---- -nodes: -<% (1..NODES_NUMBER).each { |i| %> - <%= CLUSTER_NAME %>-<%= i %>: - architecture: - nb_procs: 1 - nb_cores: 72 - nb_threads: 72 - platform_type: x86_64 - cpu_core_numbering: contiguous - bios: - release_date: 01/01/2000 # Fake date, will be replaced by g5k-checks - vendor: Unknown # Fake vendor, will be replaced by g5k-checks - version: 1 # Fake version, will be replaced by g5k-checks - bmc_version: v1 # Fake version, will be replaced by g5k-checks - chassis: - manufacturer: Unknown # Fake manufacturer, will be replaced by g5k-checks - name: Unknown # Fake name, will be replaced by g5k-checks - main_memory: - ram_size: 8 # Fake size, will be replaced by g5k-checks - memory_devices: - dimm: - size: 8 # Fake size, will be replaced by g5k-checks - technology: dram # Common memory technology, will be replaced by g5k-checks - processor: - model: Unknown # Fake model name, will be replaced by g5k-checks - other_description: description # Fake description, will be replaced by g5k-checks - vendor: vendor # Fake vendor, will be replaced by g5k-checks - version: vendor # Fake version, will be replaced by g5k-checks - cache_l1d: 8 # Fake cache, will be replaced by g5k-checks - cache_l1i: 8 # Fake cache, will be replaced by g5k-checks - cache_l2: 8 # Fake cache, will be replaced by g5k-checks - cache_l3: 8 # Fake cache, will be replaced by g5k-checks - instruction_set: x86-64 # Common instruction set, will be replaced by g5k-checks - microcode: "0xd000001" # Fake microcode, will be replaced by g5k-checks - ht_capable: true # Default ht capable value, will be replaced by g5k-checks - main_memory: - ram_size: 8 # Fake ram size, will be replaced by g5k-checks - memory_devices: - dimm_proc 1 dimm 1: # Fake dimm name, will be replaced by g5k-checks - size: 8 # Fake dimm size, will be replaced by g5k-checks - technology: dram # Default dimm technology, will be replaced by g5k-checks - operating_system: - cstate_driver: unknown # Fake driver, will be replaced by g5k-checks - cstate_governor: unknown # Fake governor, will be replaced by g5k-checks - ht_enabled: true # common value for hyper threading, will be replaced by g5k-checks - pstate_driver: unknwon # Fake driver, will be replaced by g5k-checks - pstate_governor: unknown # Fake driver, will be replaced by g5k-checks - turboboost_enabled: true # Default value for turboboost, will be replaced by g5k-checks - network_adapters: - bmc: - interface: Ethernet - management: true - mac: <%= MAC_BMC_LIST[i - 1] %> - ip: 172.17.109.29 - eth0: - interface: Ethernet - management: false - driver: mlx_core - name: enp1s0f0np0 - rate: 10000000000 - mac: <%= MAC_ETH0_LIST[i - 1] %> - ip: 172.16.109.29 - storage_devices: - disk0: - storage: SSD # Common value, will be replaced by g5k-checks - model: unknown # fake value, - size: 8 # Fake value, will be replaced by g5k-checks -<% } %> -- GitLab From 0e16d40457bd5e624fa6ecce89f02d70134b01fe Mon Sep 17 00:00:00 2001 From: PENVEN Loris <loris.penven@irisa.fr> Date: Thu, 9 Jan 2025 11:06:00 +0100 Subject: [PATCH 06/11] [rennes][abacus26] Add missing fields to abacus26-1.yaml --- .../sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml b/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml index fc3e6a82dec..735e0327e11 100644 --- a/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml +++ b/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml @@ -65,6 +65,7 @@ abacus26-1: ip: 172.17.109.29 mac: 5c:ed:8c:f6:a4:0c management: true + interface: ipmitool eth0: driver: bnxt_en firmware_version: 228.0.116.0/pkg 228.1.111.0 @@ -89,6 +90,9 @@ abacus26-1: sriov: false sriov_totalvfs: 0 vendor: Broadcom Inc. and subsidiaries + enabled: false + mountable: false + mounted: false operating_system: cstate_driver: intel_idle cstate_governor: menu @@ -107,6 +111,7 @@ abacus26-1: model: Intel Xeon other_description: Intel(R) Xeon(R) Gold 6442Y vendor: Intel + version: 6442Y storage_devices: pci-0000:e1:00.0-nvme-1: by_id: "/dev/disk/by-id/nvme-eui.37305430584073080025384e00000002" -- GitLab From 87f244bd7447ebb3aa9c4d3a06276e3ea8392eeb Mon Sep 17 00:00:00 2001 From: PENVEN Loris <loris.penven@irisa.fr> Date: Thu, 9 Jan 2025 11:13:35 +0100 Subject: [PATCH 07/11] [rennes][abacus26] Fix bmc interface to Ethernet + Regenerate refapi --- .../sites/rennes/clusters/abacus26/nodes/abacus26-1.json | 7 ++++++- .../sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json index e3a2cec2b19..c21831fc5c1 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json @@ -134,12 +134,15 @@ { "device": "eth1", "driver": "bnxt_en", + "enabled": false, "firmware_version": "228.0.116.0/pkg 228.1.111.0", "interface": "Ethernet", "kavlan": false, "mac": "d4:04:e6:b7:7c:d7", "management": false, "model": "BCM57414 NetXtreme-E 10Gb/25Gb RDMA Ethernet Controller", + "mountable": false, + "mounted": false, "name": "ens15f1np1", "sriov": false, "sriov_totalvfs": 0, @@ -148,6 +151,7 @@ { "device": "bmc", "enabled": true, + "interface": "Ethernet", "ip": "172.17.109.29", "kavlan": false, "mac": "5c:ed:8c:f6:a4:0c", @@ -183,7 +187,8 @@ "microcode": "0x2b0005d2", "model": "Intel Xeon", "other_description": "Intel(R) Xeon(R) Gold 6442Y", - "vendor": "Intel" + "vendor": "Intel", + "version": "6442Y" }, "redfish": true, "software": { diff --git a/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml b/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml index 735e0327e11..a94ffc13d85 100644 --- a/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml +++ b/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml @@ -65,7 +65,7 @@ abacus26-1: ip: 172.17.109.29 mac: 5c:ed:8c:f6:a4:0c management: true - interface: ipmitool + interface: Ethernet eth0: driver: bnxt_en firmware_version: 228.0.116.0/pkg 228.1.111.0 -- GitLab From 0743709fbf1ad0250779a266d76453938e4945be Mon Sep 17 00:00:00 2001 From: PENVEN Loris <loris.penven@irisa.fr> Date: Fri, 17 Jan 2025 10:39:16 +0100 Subject: [PATCH 08/11] [rennes][abacus26] Add creation, warranty and manufacturing date --- data/grid5000/sites/rennes/clusters/abacus26/abacus26.json | 6 +++--- .../sites/rennes/clusters/abacus26/nodes/abacus26-1.json | 4 ++-- input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json b/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json index c4ae7d6eeef..5ff981574de 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json @@ -1,9 +1,9 @@ { "boot_type": "uefi", - "created_at": "Thu, 01 Jan 1970 00:00:00 GMT", + "created_at": "Fri, 17 Jan 2025 00:00:00 GMT", "exotic": false, "kavlan": false, - "manufactured_at": "1970-01-01", + "manufactured_at": "2024-10-08", "metrics": [ ], @@ -15,5 +15,5 @@ "redfish": true, "type": "cluster", "uid": "abacus26", - "warranty_end": "1970-01-01" + "warranty_end": "2031-10-14" } \ No newline at end of file diff --git a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json index c21831fc5c1..d65a5a82e97 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json @@ -13,11 +13,11 @@ }, "bmc_version": "1.59", "chassis": { - "manufactured_at": "1970-01-01", + "manufactured_at": "2024-10-08", "manufacturer": "HPE", "name": "ProLiant DL380 Gen11", "serial": "CZ2D1P0F51", - "warranty_end": "1970-01-01" + "warranty_end": "2031-10-14" }, "exotic": false, "gpu_devices": { diff --git a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml index 26505808195..5c3442d09fb 100644 --- a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml +++ b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml @@ -1,6 +1,6 @@ --- model: HPE ProLiant DL380 Gen11 -created_at: 1970-01-01 # TODO: change this value +created_at: 2025-01-17 # TODO: change this e kavlan: false boot_type: uefi exotic: false @@ -10,8 +10,8 @@ queues: nodes: abacus26-[1-1]: chassis: - manufactured_at: 1970-01-01 # TODO: Put date. - warranty_end: 1970-01-01 # TODO: Put date. + manufactured_at: 2024-10-08 + warranty_end: 2031-10-14 supported_job_types: deploy: true besteffort: true -- GitLab From 686b7b4387cd8cc6638bfe3967e74d49f8f21975 Mon Sep 17 00:00:00 2001 From: Laurent Pouilloux <laurent.pouilloux@inria.fr> Date: Fri, 17 Jan 2025 11:33:09 +0100 Subject: [PATCH 09/11] [rennes][abacus26] update JSON file for new PI and new priority --- data/grid5000/sites/rennes/clusters/abacus26/abacus26.json | 1 + .../sites/rennes/clusters/abacus26/nodes/abacus26-1.json | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json b/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json index 5ff981574de..899d0175734 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/abacus26.json @@ -8,6 +8,7 @@ ], "model": "HPE ProLiant DL380 Gen11", + "priority": 202510, "queues": [ "admin", "testing" diff --git a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json index d65a5a82e97..72ce28ac0d4 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json @@ -193,7 +193,7 @@ "redfish": true, "software": { "forced-deployment-timestamp": 202007300948, - "postinstall-version": "1.2024112508", + "postinstall-version": "1.2025011610", "standard-environment": "debian11-x64-std" }, "storage_devices": [ -- GitLab From ccf21355fd28fc34b48b2f1cc3726f5eb897a4b5 Mon Sep 17 00:00:00 2001 From: PENVEN Loris <loris.penven@irisa.fr> Date: Fri, 17 Jan 2025 15:27:53 +0100 Subject: [PATCH 10/11] [rennes][abacus26] Update g5k checks imports after firmware update --- .../rennes/clusters/abacus26/nodes/abacus26-1.json | 12 ++++++------ .../rennes/clusters/abacus26/nodes/abacus26-1.yaml | 14 +++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json index 72ce28ac0d4..eb7f2b0ea80 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json @@ -7,11 +7,11 @@ "platform_type": "x86_64" }, "bios": { - "release_date": "06/19/2024", + "release_date": "10/04/2024", "vendor": "HPE", - "version": 2.22 + "version": 2.34 }, - "bmc_version": "1.59", + "bmc_version": "1.65", "chassis": { "manufactured_at": "2024-10-08", "manufacturer": "HPE", @@ -112,7 +112,7 @@ "device": "eth0", "driver": "bnxt_en", "enabled": true, - "firmware_version": "228.0.116.0/pkg 228.1.111.0", + "firmware_version": "230.0.168.0/pkg 230.1.123.0", "interface": "Ethernet", "ip": "172.16.109.29", "ip6": "2001:660:4406:700:e::1d", @@ -135,7 +135,7 @@ "device": "eth1", "driver": "bnxt_en", "enabled": false, - "firmware_version": "228.0.116.0/pkg 228.1.111.0", + "firmware_version": "230.0.168.0/pkg 230.1.123.0", "interface": "Ethernet", "kavlan": false, "mac": "d4:04:e6:b7:7c:d7", @@ -184,7 +184,7 @@ "ht_capable": true, "instruction_set": "x86-64", "microarchitecture": "Haswell", - "microcode": "0x2b0005d2", + "microcode": "0x2b000603", "model": "Intel Xeon", "other_description": "Intel(R) Xeon(R) Gold 6442Y", "vendor": "Intel", diff --git a/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml b/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml index a94ffc13d85..3e4d1830c4b 100644 --- a/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml +++ b/input/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.yaml @@ -8,10 +8,10 @@ abacus26-1: nb_threads: 96 platform_type: x86_64 bios: - release_date: 06/19/2024 + release_date: 10/04/2024 vendor: HPE - version: 2.22 - bmc_version: '1.59' + version: 2.34 + bmc_version: '1.65' chassis: manufacturer: HPE name: ProLiant DL380 Gen11 @@ -65,10 +65,10 @@ abacus26-1: ip: 172.17.109.29 mac: 5c:ed:8c:f6:a4:0c management: true - interface: Ethernet + interface: Ethernet eth0: driver: bnxt_en - firmware_version: 228.0.116.0/pkg 228.1.111.0 + firmware_version: 230.0.168.0/pkg 230.1.123.0 interface: Ethernet ip: 172.16.109.29 mac: d4:04:e6:b7:7c:d6 @@ -81,7 +81,7 @@ abacus26-1: vendor: Broadcom Inc. and subsidiaries eth1: driver: bnxt_en - firmware_version: 228.0.116.0/pkg 228.1.111.0 + firmware_version: 230.0.168.0/pkg 230.1.123.0 interface: Ethernet mac: d4:04:e6:b7:7c:d7 management: false @@ -107,7 +107,7 @@ abacus26-1: cache_l3: 62914560 ht_capable: true instruction_set: x86-64 - microcode: '0x2b0005d2' + microcode: '0x2b000603' model: Intel Xeon other_description: Intel(R) Xeon(R) Gold 6442Y vendor: Intel -- GitLab From 1b8c7a596a6f8bf2085d76036fdebdf492004f05 Mon Sep 17 00:00:00 2001 From: PENVEN Loris <loris.penven@irisa.fr> Date: Mon, 20 Jan 2025 10:35:33 +0100 Subject: [PATCH 11/11] [rennes][abacus26] Add clock speed and microarchitecture to cluster description --- .../sites/rennes/clusters/abacus26/nodes/abacus26-1.json | 8 ++++---- .../grid5000/sites/rennes/clusters/abacus26/abacus26.yaml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json index eb7f2b0ea80..7ddee48ba1f 100644 --- a/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json +++ b/data/grid5000/sites/rennes/clusters/abacus26/nodes/abacus26-1.json @@ -171,8 +171,8 @@ "turboboost_enabled": true }, "performance": { - "core_flops": 128, - "node_flops": 6144 + "core_flops": 83200000000, + "node_flops": 3993600000000 }, "processor": { "cache_l1": null, @@ -180,10 +180,10 @@ "cache_l1i": 32768, "cache_l2": 2097152, "cache_l3": 62914560, - "clock_speed": 8, + "clock_speed": 2600000000, "ht_capable": true, "instruction_set": "x86-64", - "microarchitecture": "Haswell", + "microarchitecture": "Sapphire Rapids", "microcode": "0x2b000603", "model": "Intel Xeon", "other_description": "Intel(R) Xeon(R) Gold 6442Y", diff --git a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml index 5c3442d09fb..7c7b8d9bb55 100644 --- a/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml +++ b/input/grid5000/sites/rennes/clusters/abacus26/abacus26.yaml @@ -1,6 +1,6 @@ --- model: HPE ProLiant DL380 Gen11 -created_at: 2025-01-17 # TODO: change this e +created_at: 2025-01-17 kavlan: false boot_type: uefi exotic: false @@ -17,8 +17,8 @@ nodes: besteffort: true max_walltime: 0 processor: - microarchitecture: Haswell # TODO: replace with microarch name. - clock_speed: 8 # TODO: Replace with clock speed. + microarchitecture: Sapphire Rapids + clock_speed: 2600000000 network_adapters: bmc: enabled: true @@ -33,7 +33,7 @@ nodes: id: disk0 interface: NVME software: - standard-environment: debian11-x64-std # TODO: check that architecture is OK + standard-environment: debian11-x64-std management_tools: bmc_vendor_tool: ipmitool nodeset: abacus-26 -- GitLab