Mentions légales du service

Skip to content
Snippets Groups Projects

[valid/oar-properties] check GPU allocation on cores (#15085)

Merged Lucas Nussbaum requested to merge wrong-gpu-alloc into master
All threads resolved!
1 file
+ 20
0
Compare changes
  • Side-by-side
  • Inline
@@ -9,6 +9,11 @@ IGNORED_PROPERTIES=%w{}
# Properties that should be present.
# The list is sorted alphabetically, then any name listed in
# IGNORED_PROPERTIES is removed from it.
G5K_PROPERTIES=%w{api_timestamp available_upto besteffort chassis chunks cluster cluster_priority comment core core_count cpu cpuarch cpucore cpufreq cpuset cputype cpu_count deploy desktop_computing disk disk_reservation_count diskpath disktype drain eth_count eth_kavlan_count eth_rate exotic expiry_date finaud_decision gpu gpudevice gpu_model gpu_count grub host ib ib_count ib_rate id ip last_available_upto last_job_date links maintenance max_walltime memcore memcpu memnode mic myri myri_count myri_rate network_address next_finaud_decision next_state nodemodel production rconsole scheduler_priority slash_16 slash_17 slash_18 slash_19 slash_20 slash_21 slash_22 state state_num subnet_address subnet_prefix suspended_jobs switch thread_count type virtual vlan wattmeter opa_count opa_rate}.sort - IGNORED_PROPERTIES
# abacus15 was retired before the cpu_affinity of its GPUs could be fixed.
# As a result its OAR properties can no longer be regenerated, so these
# hosts must be ignored by the GPU allocation check during validation.
WRONG_GPU_EXCEPTIONS=%w{abacus15-1.rennes.grid5000.fr abacus15-2.rennes.grid5000.fr}.freeze
module RefRepo::Valid::OarProperties
def self.check(options)
@@ -121,6 +126,21 @@ module RefRepo::Valid::OarProperties
puts "id cpu core cpuset"
pp(host_resources.map { |e| [e['id'], e['cpu'], e['core'], e['cpuset'] ] })
end
# GPU consistency check: on a host where at least one resource carries a
# 'gpu' value, every resource of that host must carry one as well.
# Hosts listed in WRONG_GPU_EXCEPTIONS are filtered out here, so they end
# up with an empty resource list and never trigger the check.
host_resources = default_resources.select do |res|
  res['host'] == host && !WRONG_GPU_EXCEPTIONS.include?(res['host'])
end
if host_resources.any? { |res| res['gpu'] } && host_resources.any? { |res| res['gpu'].nil? }
  puts "ERROR: #{host} has GPU(s), but some resources have no GPUs affected. Reserving all GPUs should reserve the whole node, and reserving 1/N GPUs should reserve 1/N of the node."
  # Print one summary line per GPU value (nil is shown as NULL) so the
  # unassigned cpus/cores/resources are easy to spot.
  host_resources.group_by { |res| res['gpu'] }.each_pair do |gpu, members|
    cpus, cores, rids = %w[cpu core resource_id].map do |key|
      members.map { |res| res[key] }.uniq.sort.join(',')
    end
    puts "gpu=#{gpu || 'NULL'} for resources with: cpu=#{cpus} core=#{cores} resource_id=#{rids}"
  end
  # Mark the whole validation as failed.
  ret = false
end
end
end
return ret
Loading