diff --git a/data/grid5000/sites/sophia/clusters/esterel22/esterel22.json b/data/grid5000/sites/sophia/clusters/esterel22/esterel22.json index 71411b8181566b1c40bb5787b1e488f4d1972dca..1dc838e957334c8a503593bc18be11349c405fcd 100644 --- a/data/grid5000/sites/sophia/clusters/esterel22/esterel22.json +++ b/data/grid5000/sites/sophia/clusters/esterel22/esterel22.json @@ -1,9 +1,9 @@ { "boot_type": "uefi", - "created_at": "Thu, 01 Jan 1970 00:00:00 GMT", + "created_at": "Wed, 09 Apr 2025 00:00:00 GMT", "exotic": false, "kavlan": false, - "manufactured_at": "1970-01-01", + "manufactured_at": "2019-10-07", "metrics": [ { "description": "Default subset of metrics from Prometheus Node Exporter", @@ -49,12 +49,22 @@ "port": 9100, "protocol": "prometheus" } + }, + { + "description": "All metrics from Prometheus Nvidia DCGM Exporter", + "name": "prom_nvgpu_all_metrics", + "optional_period": 15000, + "period": 0, + "source": { + "port": 9400, + "protocol": "prometheus" + } } ], - "model": "Cluster Model", + "model": "Dell PowerEdge T640", "nodes_count": 1, - "nodes_description": "1 CPU Unknown vendor, 72 cores/CPU, 0GB RAM, 2x0GB SSD, 1 x 10Gb Ethernet", - "priority": 197001, + "nodes_description": "2 CPUs Intel Xeon Gold 6240, 18 cores/CPU, 4 GPUs Quadro RTX 6000, 384GB RAM, 558GB HDD, 3575GB HDD, 1 x 1Gb Ethernet, 1 x 40Gb InfiniBand", + "priority": 202010, "queues": [ "admin", "testing" @@ -62,5 +72,5 @@ "redfish": true, "type": "cluster", "uid": "esterel22", - "warranty_end": "1970-01-01" -} \ No newline at end of file + "warranty_end": "2026-10-08" +} diff --git a/data/grid5000/sites/sophia/clusters/esterel22/nodes/esterel22-1.json b/data/grid5000/sites/sophia/clusters/esterel22/nodes/esterel22-1.json index 0b7d49b601cadc77fc4da58dc5e36707ddc59268..384e486944fcd6cd48c2c3f53001ab94a1ccee34 100644 --- a/data/grid5000/sites/sophia/clusters/esterel22/nodes/esterel22-1.json +++ b/data/grid5000/sites/sophia/clusters/esterel22/nodes/esterel22-1.json @@ -1,58 +1,226 @@ { "architecture": { - "cpu_core_numbering": "contiguous", - "nb_cores": 72, - "nb_procs": 1, + "cpu_core_numbering": "round-robin", + "nb_cores": 36, + "nb_procs": 2, "nb_threads": 72, "platform_type": "x86_64" }, "bios": { - "release_date": "01/01/2000", - "vendor": "Unknown", - "version": 1 + "release_date": "01/09/2025", + "vendor": "Dell Inc.", + "version": "2.23.0" }, - "bmc_version": "v1", + "bmc_version": "7.00.00.181", "chassis": { - "manufactured_at": "1970-01-01", - "manufacturer": "Unknown", - "name": "Unknown", - "warranty_end": "1970-01-01" + "manufactured_at": "2019-10-07", + "manufacturer": "Dell Inc.", + "name": "PowerEdge T640", + "serial": "1D58RZ2", + "warranty_end": "2026-10-08" }, "exotic": false, + "gpu_devices": { + "nvidia0": { + "compute_capability": "7.5", + "cores": 4608, + "cpu_affinity": 0, + "device": "/dev/nvidia0", + "memory": 24159191040, + "microarchitecture": "Turing", + "model": "Quadro RTX 6000", + "performance": { + "fp-16": 32620000000000, + "fp-32": 16310000000000, + "fp-64": 510000000000 + }, + "power_default_limit": "260.00 W", + "vbios_version": "90.02.15.00.04", + "vendor": "Nvidia" + }, + "nvidia1": { + "compute_capability": "7.5", + "cores": 4608, + "cpu_affinity": 0, + "device": "/dev/nvidia1", + "memory": 24159191040, + "microarchitecture": "Turing", + "model": "Quadro RTX 6000", + "performance": { + "fp-16": 32620000000000, + "fp-32": 16310000000000, + "fp-64": 510000000000 + }, + "power_default_limit": "260.00 W", + "vbios_version": "90.02.15.00.04", + "vendor": "Nvidia" + }, + "nvidia2": { + "compute_capability": "7.5", + "cores": 4608, + "cpu_affinity": 1, + "device": "/dev/nvidia2", + "memory": 24159191040, + "microarchitecture": "Turing", + "model": "Quadro RTX 6000", + "performance": { + "fp-16": 32620000000000, + "fp-32": 16310000000000, + "fp-64": 510000000000 + }, + "power_default_limit": "260.00 W", + "vbios_version": "90.02.15.00.04", + "vendor": "Nvidia" + }, + "nvidia3": { + "compute_capability": "7.5", + "cores": 4608, + "cpu_affinity": 1, + "device": "/dev/nvidia3", + "memory": 24159191040, + "microarchitecture": "Turing", + "model": "Quadro RTX 6000", + "performance": { + "fp-16": 32620000000000, + "fp-32": 16310000000000, + "fp-64": 510000000000 + }, + "power_default_limit": "260.00 W", + "vbios_version": "90.02.15.00.04", + "vendor": "Nvidia" + } + }, "main_memory": { - "ram_size": 8 + "ram_size": 412316860416 }, "management_tools": { - "bmc_vendor_tool": "ipmitool", + "bmc_vendor_tool": "racadm", "ipmitool": { "retries": 5 } }, "memory_devices": [ { - "device": "dimm_proc 1 dimm 1", - "size": 8, + "device": "dimm_a1", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_a2", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_a3", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_a4", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_a5", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_a6", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_b1", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_b2", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_b3", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_b4", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_b5", + "size": 34359738368, + "technology": "dram" + }, + { + "device": "dimm_b6", + "size": 34359738368, "technology": "dram" } ], "network_adapters": [ { "device": "eth0", - "driver": "mlx_core", + "driver": "bnxt_en", "enabled": true, + "firmware_version": "232.0.156.9/pkg 23.21.13.39", "interface": "Ethernet", "ip": "172.16.131.28", "ip6": "2001:660:4406:800:4::1c", "kavlan": false, "mac": "f4:02:70:9d:f4:d8", "management": false, + "model": "BCM57416 NetXtreme-E Dual-Media 10G RDMA Ethernet Controller", "mountable": true, "mounted": true, "name": "enp1s0f0np0", "network_address": "esterel22-1.sophia.grid5000.fr", - "rate": 10000000000, + "rate": 1000000000, + "sriov": false, + "sriov_totalvfs": 0, "switch": "sw-2", - "switch_port": "1/1/32" + "switch_port": "1/1/32", + "vendor": "Broadcom Inc. and subsidiaries" + }, + { + "device": "eth1", + "driver": "bnxt_en", + "enabled": false, + "firmware_version": "232.0.156.9/pkg 23.21.13.39", + "interface": "Ethernet", + "kavlan": false, + "mac": "f4:02:70:9d:f4:d9", + "management": false, + "model": "BCM57416 NetXtreme-E Dual-Media 10G RDMA Ethernet Controller", + "mountable": false, + "mounted": false, + "name": "eno2np1", + "sriov": false, + "sriov_totalvfs": 0, + "vendor": "Broadcom Inc. and subsidiaries" + }, + { + "device": "ib0", + "driver": "mlx4_core", + "enabled": true, + "firmware_version": "2.42.5000", + "guid": "0xb8599f0300a0d111", + "interface": "InfiniBand", + "ip": "172.18.131.28", + "kavlan": false, + "mac": "b8:59:9f:03:00:a0:d1:11", + "management": false, + "model": "MT27500 Family [ConnectX-3]", + "mountable": true, + "mounted": true, + "name": "ib0", + "netmask": "255.255.240.0", + "network_address": "esterel22-1-ib0.sophia.grid5000.fr", + "rate": 40000000000, + "vendor": "Mellanox Technologies" }, { "device": "bmc", @@ -69,32 +237,32 @@ ], "nodeset": "esterel22", "operating_system": { - "cstate_driver": "unknown", - "cstate_governor": "unknown", + "cstate_driver": "intel_idle", + "cstate_governor": "menu", "ht_enabled": true, - "pstate_driver": "unknwon", - "pstate_governor": "unknown", + "pstate_driver": "intel_pstate", + "pstate_governor": "performance", "turboboost_enabled": true }, "performance": { - "core_flops": 128, - "node_flops": 9216 + "core_flops": 83200000000, + "node_flops": 2995200000000 }, "processor": { "cache_l1": null, - "cache_l1d": 8, - "cache_l1i": 8, - "cache_l2": 8, - "cache_l3": 8, - "clock_speed": 8, + "cache_l1d": 32768, + "cache_l1i": 32768, + "cache_l2": 1048576, + "cache_l3": 25952256, + "clock_speed": 2600000000, "ht_capable": true, "instruction_set": "x86-64", - "microarchitecture": "Haswell", - "microcode": "0xd000001", - "model": "Unknown", - "other_description": "description", - "vendor": "vendor", - "version": "vendor" + "microarchitecture": "Cascade Lake-SP", + "microcode": "0x5003801", + "model": "Intel Xeon", + "other_description": "Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz", + "vendor": "Intel", + "version": "Gold 6240" }, "redfish": true, "software": { @@ -104,24 +272,26 @@ }, "storage_devices": [ { - "by_id": "", + "by_id": "/dev/disk/by-id/wwn-0x64cd98f094f41d002566cb7511b3990e", "by_path": "/dev/disk/by-path/pci-0000:19:00.0-scsi-0:2:0:0", + "firmware_version": 4.3, "id": "disk0", "interface": "SAS", - "model": "unknown", - "size": 8, - "storage": "SSD", - "vendor": "Unknown" + "model": "PERC H730P Adp", + "size": 599550590976, + "storage": "HDD", + "vendor": "Dell" }, { - "by_id": "", + "by_id": "/dev/disk/by-id/wwn-0x64cd98f094f41d002566d27306fac4a7", "by_path": "/dev/disk/by-path/pci-0000:19:00.0-scsi-0:2:1:0", + "firmware_version": 4.3, "id": "disk1", "interface": "SAS", - "model": "unknown", - "size": 8, - "storage": "SSD", - "vendor": "Unknown" + "model": "PERC H730P Adp", + "size": 3838627020800, + "storage": "HDD", + "vendor": "Dell" } ], "supported_job_types": { @@ -131,8 +301,9 @@ "queues": [ "admin", "testing" - ] + ], + "virtual": "ivt" }, "type": "node", "uid": "esterel22-1" -} \ No newline at end of file +} diff --git a/input/grid5000/dell-product-data.yaml b/input/grid5000/dell-product-data.yaml index 234b29ea30630d1d8a2635a8fcbf12c7ea4b89e9..83c2e2e57c14a51880d9c0ad8e63357c6af527d5 100644 --- a/input/grid5000/dell-product-data.yaml +++ b/input/grid5000/dell-product-data.yaml @@ -2278,6 +2278,12 @@ sites: chassis: manufactured_at: 2016-05-04 warranty_end: 2021-05-04 + esterel22: + nodes: + esterel22-1: + chassis: + manufactured_at: 2019-10-07 + warranty_end: 2026-10-08 esterel24: nodes: esterel24-1: @@ -2296,18 +2302,18 @@ sites: chassis: manufactured_at: 2016-06-06 warranty_end: 2021-06-06 - esterel4: - nodes: - esterel4-1: - chassis: - manufactured_at: 2016-06-08 - warranty_end: 2021-06-08 esterel32: nodes: esterel32-1: chassis: manufactured_at: 2020-08-09 warranty_end: 2027-08-10 + esterel4: + nodes: + esterel4-1: + chassis: + manufactured_at: 2016-06-08 + warranty_end: 2021-06-08 esterel5: nodes: esterel5-1: diff --git a/input/grid5000/ipv4.yaml b/input/grid5000/ipv4.yaml index ee9059ac6ad7ea499705b755672ae8332f4ae4e6..c22ce1dd20d4e2ae6be2d78806c087eb0e3a5fef 100644 --- a/input/grid5000/ipv4.yaml +++ b/input/grid5000/ipv4.yaml @@ -191,6 +191,7 @@ ipv4: sophia esterel20 eth0 0 0 3 24 sophia esterel21 eth0 0 0 3 26 sophia esterel22 eth0 0 0 3 27 + sophia esterel22 ib0 0 0 3 27 sophia esterel23 eth0 0 0 3 28 sophia esterel24 eth0 0 0 3 30 sophia esterel24 ib0 0 0 3 30 diff --git a/input/grid5000/sites/sophia/clusters/esterel22/esterel22.yaml b/input/grid5000/sites/sophia/clusters/esterel22/esterel22.yaml index 253bc917772f0ab361f93c8c55dc3934fd90d3a4..991cb6b58585b2b37382f9e4e3e95840d035c6eb 100644 --- a/input/grid5000/sites/sophia/clusters/esterel22/esterel22.yaml +++ b/input/grid5000/sites/sophia/clusters/esterel22/esterel22.yaml @@ -1,24 +1,21 @@ --- -model: Cluster Model # TODO: change this value. -created_at: 1970-01-01 # TODO: change this value +model: Dell PowerEdge T640 +created_at: 2025-04-09 kavlan: false -boot_type: uefi # TODO: specify if 'uefi' (ideally) or 'bios' (legacy, if no other choice) -exotic: false # TODO: specify if 'true' or 'false' +boot_type: uefi +exotic: false queues: - admin - testing nodes: esterel22-1: - chassis: - manufactured_at: 1970-01-01 # TODO: if not Dell vendor, put date. if Dell, use rake gen:dell-product-data after g5k-checks import, and remove this line. - warranty_end: 1970-01-01 # TODO: if not Dell vendor, put date. if Dell, use rake gen:dell-product-data after g5k-checks import, and remove this line. supported_job_types: deploy: true besteffort: true max_walltime: 0 processor: - microarchitecture: Haswell # TODO: replace with microarch name. - clock_speed: 8 # TODO: Replace with clock speed. + microarchitecture: Cascade Lake-SP + clock_speed: 2600000000 network_adapters: bmc: interface: Ethernet @@ -29,17 +26,24 @@ nodes: enabled: true mountable: true mounted: true + eth1: + enabled: false + mountable: false + mounted: false + ib0: + mounted: true + enabled: true + mountable: true + netmask: 255.255.240.0 storage_devices: - disk0: # This field will have to be renamed later. + pci-0000:19:00.0-scsi-0:2:0:0: id: disk0 interface: SAS - by_path: "/dev/disk/by-path/pci-0000:19:00.0-scsi-0:2:0:0" # this path will have to change later. - disk1: + pci-0000:19:00.0-scsi-0:2:1:0: id: disk1 interface: SAS - by_path: "/dev/disk/by-path/pci-0000:19:00.0-scsi-0:2:1:0" software: - standard-environment: debian11-x64-std # TODO: check that architecture is OK + standard-environment: debian11-x64-std management_tools: - bmc_vendor_tool: ipmitool # TODO: replace with bmc_vendor_tool (ipmitool, racadm) + bmc_vendor_tool: racadm nodeset: esterel22 diff --git a/input/grid5000/sites/sophia/clusters/esterel22/esterel22_metrics.yaml b/input/grid5000/sites/sophia/clusters/esterel22/esterel22_metrics.yaml index 729e9beb582834908e55a7537bd5ace8abbd1f3d..5fafdb330dbfa8c991ea84bcc0d598f872007c41 100644 --- a/input/grid5000/sites/sophia/clusters/esterel22/esterel22_metrics.yaml +++ b/input/grid5000/sites/sophia/clusters/esterel22/esterel22_metrics.yaml @@ -38,3 +38,10 @@ metrics: source: protocol: prometheus port: 9100 + - name: prom_nvgpu_all_metrics + description: All metrics from Prometheus Nvidia DCGM Exporter + period: 0 + optional_period: 15000 + source: + protocol: prometheus + port: 9400 diff --git a/input/grid5000/sites/sophia/clusters/esterel22/nodes/esterel22-1.yaml b/input/grid5000/sites/sophia/clusters/esterel22/nodes/esterel22-1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28471e0557112fedcb901ffcaac297c544ac5934 --- /dev/null +++ b/input/grid5000/sites/sophia/clusters/esterel22/nodes/esterel22-1.yaml @@ -0,0 +1,167 @@ +# Generated by g5k-checks (g5k-checks -m api) +--- +esterel22-1: + architecture: + cpu_core_numbering: round-robin + nb_cores: 36 + nb_procs: 2 + nb_threads: 72 + platform_type: x86_64 + bios: + release_date: 01/09/2025 + vendor: Dell Inc. + version: 2.23.0 + bmc_version: 7.00.00.181 + chassis: + manufacturer: Dell Inc. + name: PowerEdge T640 + serial: 1D58RZ2 + gpu_devices: + nvidia0: + cpu_affinity: 0 + device: "/dev/nvidia0" + memory: 24159191040 + model: Quadro RTX 6000 + power_default_limit: 260.00 W + vbios_version: 90.02.15.00.04 + vendor: Nvidia + nvidia1: + cpu_affinity: 0 + device: "/dev/nvidia1" + memory: 24159191040 + model: Quadro RTX 6000 + power_default_limit: 260.00 W + vbios_version: 90.02.15.00.04 + vendor: Nvidia + nvidia2: + cpu_affinity: 1 + device: "/dev/nvidia2" + memory: 24159191040 + model: Quadro RTX 6000 + power_default_limit: 260.00 W + vbios_version: 90.02.15.00.04 + vendor: Nvidia + nvidia3: + cpu_affinity: 1 + device: "/dev/nvidia3" + memory: 24159191040 + model: Quadro RTX 6000 + power_default_limit: 260.00 W + vbios_version: 90.02.15.00.04 + vendor: Nvidia + main_memory: + ram_size: 412316860416 + memory_devices: + dimm_a1: + size: 34359738368 + technology: dram + dimm_a2: + size: 34359738368 + technology: dram + dimm_a3: + size: 34359738368 + technology: dram + dimm_a4: + size: 34359738368 + technology: dram + dimm_a5: + size: 34359738368 + technology: dram + dimm_a6: + size: 34359738368 + technology: dram + dimm_b1: + size: 34359738368 + technology: dram + dimm_b2: + size: 34359738368 + technology: dram + dimm_b3: + size: 34359738368 + technology: dram + dimm_b4: + size: 34359738368 + technology: dram + dimm_b5: + size: 34359738368 + technology: dram + dimm_b6: + size: 34359738368 + technology: dram + network_adapters: + bmc: + ip: 172.17.131.28 + mac: f4:02:70:9d:f4:e8 + management: true + eth0: + driver: bnxt_en + firmware_version: 232.0.156.9/pkg 23.21.13.39 + interface: Ethernet + ip: 172.16.131.28 + mac: f4:02:70:9d:f4:d8 + management: false + model: BCM57416 NetXtreme-E Dual-Media 10G RDMA Ethernet Controller + name: enp1s0f0np0 + rate: 1000000000 + sriov: false + sriov_totalvfs: 0 + vendor: Broadcom Inc. and subsidiaries + eth1: + driver: bnxt_en + firmware_version: 232.0.156.9/pkg 23.21.13.39 + interface: Ethernet + mac: f4:02:70:9d:f4:d9 + management: false + model: BCM57416 NetXtreme-E Dual-Media 10G RDMA Ethernet Controller + name: eno2np1 + sriov: false + sriov_totalvfs: 0 + vendor: Broadcom Inc. and subsidiaries + ib0: + driver: mlx4_core + firmware_version: 2.42.5000 + guid: '0xb8599f0300a0d111' + interface: InfiniBand + ip: 172.18.131.28 + mac: b8:59:9f:03:00:a0:d1:11 + management: false + model: MT27500 Family [ConnectX-3] + name: ib0 + rate: 40000000000 + vendor: Mellanox Technologies + operating_system: + cstate_driver: intel_idle + cstate_governor: menu + ht_enabled: true + pstate_driver: intel_pstate + pstate_governor: performance + turboboost_enabled: true + processor: + cache_l1d: 32768 + cache_l1i: 32768 + cache_l2: 1048576 + cache_l3: 25952256 + ht_capable: true + instruction_set: x86-64 + microcode: '0x5003801' + model: Intel Xeon + other_description: Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz + vendor: Intel + version: Gold 6240 + storage_devices: + pci-0000:19:00.0-scsi-0:2:0:0: + by_id: "/dev/disk/by-id/wwn-0x64cd98f094f41d002566cb7511b3990e" + by_path: "/dev/disk/by-path/pci-0000:19:00.0-scsi-0:2:0:0" + firmware_version: 4.3 + model: PERC H730P Adp + size: 599550590976 + storage: HDD + pci-0000:19:00.0-scsi-0:2:1:0: + by_id: "/dev/disk/by-id/wwn-0x64cd98f094f41d002566d27306fac4a7" + by_path: "/dev/disk/by-path/pci-0000:19:00.0-scsi-0:2:1:0" + firmware_version: 4.3 + model: PERC H730P Adp + size: 3838627020800 + storage: HDD + supported_job_types: + virtual: ivt diff --git a/lib/refrepo/net_names_mapping.yaml b/lib/refrepo/net_names_mapping.yaml index d04d0b5fed722848bf968d4017297824e54b76fa..60022e9c773e295ba09daf989e18a04c238dae9e 100644 --- a/lib/refrepo/net_names_mapping.yaml +++ b/lib/refrepo/net_names_mapping.yaml @@ -451,6 +451,10 @@ esterel12: eno1: eth0 eno2: eth1 ibp130s0: ib0 +esterel22: + enp1s0f0np0: eth0 + eno2np1: eth1 + ibp137s0: ib0 esterel24: enp1s0f0np0: eth0 eno2np1: eth1