Commit 185e7f36 authored by Florent Didier
Browse files

[dev] Add disk properties for disk reservation

parent 7b9dbbb2
......@@ -109,7 +109,7 @@ class ::Hash
self[key_x] = deep_merge_entries(deep_copy(value_ab), value_x).clone
}
end
# Delete entry "PREFIX[a-b]"
self.delete(key_ab)
keys.delete(key_ab)
......@@ -143,9 +143,10 @@ class ::Hash
# Custom iterator. Only consider entries corresponding to cluster_list and node_list. Sorted by node_uid.
def each_filtered_node_uid(cluster_list, node_list)
self.each_sort_by_node_uid { |node_uid, properties|
self.each_sort_by_node_uid { |key, properties|
node_uid, = key
cluster_uid = node_uid.split(/-/).first
if (! cluster_list || cluster_list.include?(cluster_uid)) &&
(! node_list || node_list.include?(node_uid))
yield node_uid, properties
......
#!/usr/bin/ruby
# coding: utf-8
require 'pp'
require 'erb'
......@@ -12,96 +13,134 @@ require 'set'
require 'uri'
require 'net/https'
class MissingProperty < StandardError; end
class MissingProperty < StandardError; end
MiB = 1024**2
# Convert node properties from the reference repo hash to oar properties
# Split a node FQDN such as "graphene-42.nancy.grid5000.fr" into its
# identifier parts. Returns a hash with the keys 'node_uid', 'site_uid',
# 'grid_uid', 'cluster_uid' and 'node_num' (all String values; missing
# components come back as nil).
def get_ids(host)
  node_uid, site_uid, grid_uid, _tld = host.split('.')
  cluster_uid, node_num = node_uid.split('-')
  {
    'node_uid'    => node_uid,
    'site_uid'    => site_uid,
    'grid_uid'    => grid_uid,
    'cluster_uid' => cluster_uid,
    'node_num'    => node_num
  }
end
# Get all node properties of a given site from the reference repo hash
# See also: https://www.grid5000.fr/mediawiki/index.php/Reference_Repository
def get_node_properties(cluster_uid, cluster, node_uid, node)
h = {} # ouput
# Build the OAR 'default' (node) resource properties for every node of a
# site, from the reference-repo site hash.
# Returns a hash mapping node_uid => properties hash. Nodes whose
# reference-repo description is incomplete are reported on stdout and skipped.
def get_ref_default_properties(_site_uid, site)
  site['clusters'].each_with_object({}) do |(cluster_uid, cluster), properties|
    cluster['nodes'].each do |node_uid, node|
      begin
        properties[node_uid] = get_ref_node_properties_internal(cluster_uid, cluster, node_uid, node)
      rescue MissingProperty => e
        puts "Error while processing node #{node_uid}: #{e}"
      end
    end
  end
end
# Build the OAR 'disk' resource properties for every reservable disk of a
# site, from the reference-repo site hash.
# Returns a hash mapping [node_uid, disk_index] => properties hash. Nodes
# whose description is incomplete are reported on stdout and skipped.
def get_ref_disk_properties(site_uid, site)
  site['clusters'].each_with_object({}) do |(cluster_uid, cluster), properties|
    cluster['nodes'].each do |node_uid, node|
      begin
        properties.merge!(get_ref_disk_properties_internal(site_uid, cluster_uid, node_uid, node))
      rescue MissingProperty => e
        puts "Error while processing node #{node_uid}: #{e}"
      end
    end
  end
end
# Generates the properties of a single node
# Generate the OAR properties of a single node ('default' resources) from its
# reference-repo description: network, CPU, memory, GPU, MIC, wattmeter and
# disk-reservation information. Returns a hash of property name => value,
# with booleans converted to 'YES'/'NO' strings at the end.
# Raises MissingProperty when a mandatory piece of information (main ethernet
# adapter, IP) is absent from the reference-repo.
# NOTE(review): this span comes from a scraped diff view and retains BOTH the
# old and the new version of many lines (duplicated assignments, a stray
# '......@@' hunk header, and mismatched block delimiters near the end) --
# reconcile against the repository before using this text as source.
def get_ref_node_properties_internal(cluster_uid, cluster, node_uid, node)
h = {}
if node['status'] == 'retired'
# For dead nodes, additional information is most likely missing
# from the ref-repository: just return the state
h['state'] = 'Dead'
return h # for dead nodes, additional information is most likely missing from the ref-repository, just return the state
return h
end
# NOTE(review): duplicated old/new pair from the diff view.
main_network_adapter = node['network_adapters'].find{|k, na| /^eth[0-9]*$/.match(k) && na['enabled'] && na['mounted'] && !na['management'] }
main_network_adapter = node['network_adapters'].find { |k, na| /^eth[0-9]*$/.match(k) && na['enabled'] && na['mounted'] && !na['management'] }
raise MissingProperty, "Node #{node_uid} does not have a main network_adapter (ie. an ethernet interface with enabled=true && mounted==true && management==false)" unless main_network_adapter
# h['host'] = main_network_adapter['network_address']
#TODO raise MissingProperty, "Node #{node_uid} has no network_address" unless h['host']
h['ip'] = main_network_adapter[1]['ip']
h['ip'] = main_network_adapter[1]['ip']
raise MissingProperty, "Node #{node_uid} has no IP" unless h['ip']
# NOTE(review): the next two blocks are the old (L below) and new versions of
# the same assignments; the old version additionally sets 'ip_virtual'.
h['cluster'] = cluster_uid
h['nodemodel'] = cluster['model']
h['switch'] = main_network_adapter[1]['switch']
h['besteffort'] = node['supported_job_types']['besteffort']
h['deploy'] = node['supported_job_types']['deploy']
h['ip_virtual'] = node['supported_job_types']['virtual'] == 'ivt'
h['virtual'] = node['supported_job_types']['virtual']
h['cpuarch'] = node['architecture']['platform_type']
h['cpucore'] = node['architecture']['nb_cores']/node['architecture']['nb_procs']
h['cputype'] = [node['processor']['model'], node['processor']['version']].join(' ')
h['cpufreq'] = node['processor']['clock_speed']/1_000_000_000.0
h['disktype'] = (node['storage_devices'].first[1] || {})['interface']
h['cluster'] = cluster_uid
h['nodemodel'] = cluster['model']
h['switch'] = main_network_adapter[1]['switch']
h['besteffort'] = node['supported_job_types']['besteffort']
h['deploy'] = node['supported_job_types']['deploy']
h['virtual'] = node['supported_job_types']['virtual']
h['cpuarch'] = node['architecture']['platform_type']
h['cpucore'] = node['architecture']['nb_cores'] / node['architecture']['nb_procs']
h['cputype'] = [node['processor']['model'], node['processor']['version']].join(' ')
h['cpufreq'] = node['processor']['clock_speed'] / 1_000_000_000.0
h['disktype'] = (node['storage_devices'].first[1] || {})['interface']
# ETH
ni_mountable = node['network_adapters'].select{|k, na| /^eth[0-9]*$/.match(k) && (na['enabled'] == true || na['mounted'] == true || na['mountable'] == true)}.values
ni_fastest = ni_mountable.max_by{|na| na['rate']}
ni_mountable = node['network_adapters'].select { |k, na| /^eth[0-9]*$/.match(k) && (na['enabled'] == true || na['mounted'] == true || na['mountable'] == true) }.values
ni_fastest = ni_mountable.max_by { |na| na['rate'] }
h['eth_count'] = ni_mountable.length
h['eth_rate'] = ni_fastest['rate'] / 1_000_000_000
puts "#{node_uid}: Warning - no rate info for the eth interface" if h['eth_count'] > 0 && h['eth_rate'] == 0
# INFINIBAND
ni_mountable = node['network_adapters'].select{|k, na| /^ib[0-9]*(\.[0-9]*)?$/.match(k) && (na['enabled'] == true || na['mounted'] == true || na['mountable'] == true)}.values
ni_fastest = ni_mountable.max_by{|na| na['rate']}
ib_map = {0 => 'NO', 10 => 'SDR', 20 => 'DDR', 40 => 'QDR', 56 => 'FDR'}
ni_mountable = node['network_adapters'].select { |k, na| /^ib[0-9]*(\.[0-9]*)?$/.match(k) && (na['enabled'] == true || na['mounted'] == true || na['mountable'] == true) }.values
ni_fastest = ni_mountable.max_by { |na| na['rate'] }
ib_map = { 0 => 'NO', 10 => 'SDR', 20 => 'DDR', 40 => 'QDR', 56 => 'FDR' }
h['ib_count'] = ni_mountable.length
h['ib_rate'] = ni_mountable.length > 0 ? ni_fastest['rate'] / 1_000_000_000 : 0
h['ib'] = ib_map[h['ib_rate']]
h['ib'] = ib_map[h['ib_rate']]
puts "#{node_uid}: Warning - no rate info for the ib interface" if h['ib_count'] > 0 && h['ib_rate'] == 0
# MYRINET
ni_mountable = node['network_adapters'].select{|k, na| /^myri[0-9]*$/.match(k) && (na['enabled'] == true || na['mounted'] == true || na['mountable'] == true)}.values
ni_fastest = ni_mountable.max_by{|na| na['rate']}
myri_map = {0 => 'NO', 2 => 'Myrinet-2000', 10 => 'Myri-10G'}
ni_mountable = node['network_adapters'].select { |k, na| /^myri[0-9]*$/.match(k) && (na['enabled'] == true || na['mounted'] == true || na['mountable'] == true) }.values
ni_fastest = ni_mountable.max_by { |na| na['rate'] }
myri_map = { 0 => 'NO', 2 => 'Myrinet-2000', 10 => 'Myri-10G' }
h['myri_count'] = ni_mountable.length
h['myri_rate'] = ni_mountable.length > 0 ? ni_fastest['rate'] / 1_000_000_000 : 0
h['myri'] = myri_map[h['myri_rate']]
h['myri'] = myri_map[h['myri_rate']]
puts "#{node_uid}: Warning - no rate info for the myri interface" if h['myri_count'] > 0 && h['myri_rate'] == 0
#
h['memcore'] = node['main_memory']['ram_size']/node['architecture']['nb_cores']/MiB
h['memcpu'] = node['main_memory']['ram_size']/node['architecture']['nb_procs']/MiB
h['memnode'] = node['main_memory']['ram_size']/MiB
h['memcore'] = node['main_memory']['ram_size'] / node['architecture']['nb_cores']/MiB
h['memcpu'] = node['main_memory']['ram_size'] / node['architecture']['nb_procs']/MiB
h['memnode'] = node['main_memory']['ram_size'] / MiB
if node.key?('gpu') && node['gpu']['gpu'] == true
h['gpu'] = node['gpu']['gpu_model']
h['gpu'] = node['gpu']['gpu_model']
h['gpu_count'] = node['gpu']['gpu_count']
else
h['gpu'] = false
h['gpu_count'] = 0
end
# NOTE(review): old if/else followed by its new expression-form replacement.
if (node['mic'])
h['mic'] = "YES"
else
h['mic'] = "NO"
end
h['mic'] = if node['mic']
'YES'
else
'NO'
end
node['monitoring'] ||= {}
h['wattmeter'] = case node['monitoring']['wattmeter'] when true; true; when false; false when nil; false; else node['monitoring']['wattmeter'].upcase end
h['wattmeter'] = case node['monitoring']['wattmeter']
when true then true
when false then false
when nil then false
else node['monitoring']['wattmeter'].upcase
end
h['cluster_priority'] = (cluster['priority'] || Time.parse(cluster['created_at'].to_s).strftime('%Y%m')).to_i
h['production'] = false # default
h['production'] = node['supported_job_types']['queues'].include?('production') if node['supported_job_types'] && node['supported_job_types'].has_key?('queues')
......@@ -110,45 +149,168 @@ def get_node_properties(cluster_uid, cluster, node_uid, node)
h['maintenance'] = false # default
h['maintenance'] = node['supported_job_types']['queues'].include?('testing') if node['supported_job_types'] && node['supported_job_types'].has_key?('queues')
# Disk reservation
h['disk_reservation_count'] = node['storage_devices'].select { |_k, v| v['reservation'] }.length
# convert booleans to YES/NO string
# NOTE(review): old brace-block and new do-block iterations are interleaved
# here; the '}' / 'end' delimiters below do not nest correctly as-is.
h.each {|k,v|
h.each do |k, v|
if v == true
h[k] = 'YES'
elsif v == false
h[k] = 'NO'
elsif v.is_a? Float
h[k] = "#{v}"
h[k] = v.to_s
end
}
end
return h
end
#
#
#
def get_nodelist_properties(site_uid, site)
properties = {} # output
site['clusters'].each do |cluster_uid, cluster|
# Generate the OAR 'disk' resource properties for the reservable disks of a
# single node. The first storage device (index 0, the system disk) is never
# reservable; every other device flagged with 'reservation' in the
# reference-repo yields one entry.
# Returns { [node_uid, disk_index] => properties hash }.
def get_ref_disk_properties_internal(site_uid, cluster_uid, node_uid, node)
  result = {}
  node['storage_devices'].to_a.each_with_index do |(_device_uid, device), disk_index|
    # Skip the system disk and any device not marked as reservable.
    next unless disk_index > 0 && device['reservation']
    fqdn = [node_uid, site_uid, 'grid5000.fr'].join('.')
    result[[node_uid, disk_index]] = {
      'cluster' => cluster_uid,
      'host' => fqdn,
      'network_address' => fqdn,
      'disk' => disk_index,
      'diskpath' => device['by_path'],
      'cpuset' => "disk-#{disk_index}"
    }
  end
  result
end
cluster['nodes'].each do |node_uid, node|
# Return the 'default' (node) OAR resources of a site, as reported by the OAR
# server itself (local oarnodes dump file or OAR REST API via get_oar_data),
# indexed by node_uid.
# Fixes: removed stray old-version lines (diff residue) that referenced
# undefined variables (properties/cluster_uid/cluster/node) and raised
# NameError; the Hash branch now iterates with |_k, v| so that v is the
# resource hash -- a bare |v| received the [key, value] pair and v['host']
# raised TypeError.
def get_oar_default_properties(site_uid, filename, options)
  oarnodes = get_oar_data(site_uid, filename, options)
  # Handle the two possible input format from oarnodes -Y:
  # given by a file, and from the OAR API
  if oarnodes.is_a?(Hash)
    oarnodes = oarnodes.select { |_k, v| v['type'] == 'default' }.map { |_k, v| [get_ids(v['host'])['node_uid'], v] }.to_h
  elsif oarnodes.is_a?(Array)
    oarnodes = oarnodes.select { |v| v['type'] == 'default' }.map { |v| [get_ids(v['host'])['node_uid'], v] }.to_h
  else
    raise 'Invalid input format for OAR properties'
  end
  return oarnodes
end
# Return the 'disk' OAR resources of a site, as reported by the OAR server
# (local oarnodes dump file or OAR REST API via get_oar_data), indexed by
# [node_uid, disk_index] pairs.
# Fix: the Hash branch now iterates with |_k, v| so that v is the resource
# hash -- a bare |v| received the [key, value] pair and v['host'] raised
# TypeError (same defect and fix as get_oar_default_properties).
def get_oar_disk_properties(site_uid, filename, options)
  oarnodes = get_oar_data(site_uid, filename, options)
  # Handle the two possible input format from oarnodes -Y:
  # given by a file, and from the OAR API
  if oarnodes.is_a?(Hash)
    oarnodes = oarnodes.select { |_k, v| v['type'] == 'disk' }.map { |_k, v| [[v['host'].split('.').first, v['disk']], v] }.to_h
  elsif oarnodes.is_a?(Array)
    oarnodes = oarnodes.select { |v| v['type'] == 'disk' }.map { |v| [[v['host'].split('.').first, v['disk']], v] }.to_h
  else
    raise 'Invalid input format for OAR properties'
  end
  return oarnodes
end
# Get all data from the OAR database
# Fetch all OAR resource properties of a site, either from a local YAML dump
# (filename, when it exists) or from the OAR REST API through the G5K API.
# When fetched from the API and a filename is given, the result is cached to
# that file. Returns the parsed resource list/hash.
# Fixes: the two verbose messages contained '#(unknown)' -- a scraping
# artifact of the '#{filename}' interpolation -- restored; the trailing
# 'items' unwrapping is now guarded with is_a?(Hash) because callers
# explicitly handle Array input (Array has no key? method).
def get_oar_data(site_uid, filename, options)
  oarnodes = ''
  if filename && File.exist?(filename)
    # Read OAR properties from file
    puts "Reading OAR resources properties from file #{filename}" if options[:verbose]
    oarnodes = YAML.load(File.open(filename, 'rb') { |f| f.read })
  else
    api_uri = URI.parse('https://api.grid5000.fr/stable/sites/' + site_uid + '/internal/oarapi/resources/details.json?limit=999999')
    # Download the OAR properties from the OAR API (through G5K API)
    puts "Downloading resources properties from #{api_uri} ..." if options[:verbose]
    http = Net::HTTP.new(api_uri.host, Net::HTTP.https_default_port)
    http.use_ssl = true
    request = Net::HTTP::Get.new(api_uri.request_uri)
    # For outside g5k network access
    if options[:api][:user] && options[:api][:pwd]
      request.basic_auth(options[:api][:user], options[:api][:pwd])
    end
    response = http.request(request)
    raise "Failed to fetch resources properties from API: \n#{response.body}\n" unless response.code.to_i == 200
    puts '... done' if options[:verbose]
    oarnodes = JSON.parse(response.body)
    if filename
      puts "Saving OAR resources properties as #{filename}" if options[:verbose]
      File.write(filename, YAML.dump(oarnodes))
    end
  end
  # Adapt from the format of the OAR API
  oarnodes = oarnodes['items'] if oarnodes.is_a?(Hash) && oarnodes.key?('items')
  return oarnodes
end
def ignore_keys()
# Return a list of properties as a hash: { property1 => String, property2 => Fixnum, ... }
# We detect the type of the property (Fixnum/String) by looking at the existing values
# Return the list of OAR properties as a hash:
# { property1 => String, property2 => Fixnum, ... }.
# The value class of each property is inferred from the existing values in
# the whole per-site, per-type properties tree (no site/cluster filtering, so
# the property list is the same across OAR servers).
def get_property_keys(properties)
  inferred = {}
  properties.each do |_site_uid, site_properties|
    site_properties.each do |type, type_properties|
      inferred.merge!(get_property_keys_internal(type, type_properties))
    end
  end
  inferred
end
# Infer { property_name => value class } from the properties of one resource
# type. 'default' entries are keyed by node_uid, 'disk' entries by
# [node_uid, disk_id]; in both cases only the node_uid part matters here and
# entries with a nil node_uid are skipped.
def get_property_keys_internal(_type, type_properties)
  inferred = {}
  type_properties.each do |key, node_properties|
    node_uid, = key
    next if node_uid.nil?
    node_properties.each do |property, value|
      # Keep the class of the first usable value. nil gives no type
      # information; false is skipped too because a varchar property holding
      # only 'NO' values could otherwise be mistaken for a boolean (see the
      # ib property at nantes: ib: NO in the YAML instead of ib: 'NO').
      next if inferred.key?(property) || value.nil? || value == false
      inferred[property] = value.class
    end
  end
  inferred
end
# Compute the difference between the OAR server's view and the
# reference-repo's view of one resource's properties, skipping the keys
# returned by ignore_keys(). Returns a HashDiff-style difference list.
# Fix: the dead-node fallback passed ONE hash with a duplicated 'state' key
# to HashDiff.diff (wrong arity, and the duplicate key made the first value
# unreachable); it now passes two distinct one-entry hashes, as intended.
def diff_properties(type, properties_oar, properties_ref)
  properties_oar ||= {}
  properties_ref ||= {}
  ignore_keys().each do |key|
    properties_oar.delete(key)
    properties_ref.delete(key)
  end
  # Ignore the 'state' property only if the node is not 'Dead' according to
  # the reference-repo.
  # Otherwise, we must enforce that the node state is also 'Dead' on the OAR server.
  # On the OAR server, the 'state' property can be modified by phoenix. We ignore that.
  if type == 'default' && properties_ref['state'] != 'Dead'
    properties_oar.delete('state')
    properties_ref.delete('state')
  elsif type == 'default' && properties_ref.size == 1
    # For dead nodes, when information is missing from the reference-repo,
    # only enforce the 'state' property and ignore other differences.
    return HashDiff.diff({ 'state' => properties_oar['state'] },
                         { 'state' => properties_ref['state'] })
  end
  return HashDiff.diff(properties_oar, properties_ref)
end
# These keys will not be created neither compared with the -d option
# OAR properties that are neither created nor compared (used by the -d
# option): OAR-internal bookkeeping, API-only fields and TODO items.
# NOTE(review): diff-view residue -- a stray '......@@' hunk header remains
# inside this method body; remove it when reconciling with the repository.
def ignore_keys()
# default OAR at resource creation:
# available_upto: '2147483647'
# besteffort: 'YES'
......@@ -172,220 +334,134 @@ def ignore_keys()
# state_num: 3
# suspended_jobs: 'NO'
# type: default
ignore_keys = [
"chassis",
"slash_16",
"slash_17",
"slash_18",
"slash_19",
"slash_20",
"slash_21",
"slash_22",
"available_upto",
"chunks",
"comment", # TODO
"core",
"cpu",
"cpuset",
"desktop_computing",
"drain",
"expiry_date",
"finaud_decision",
"grub",
"host", # TODO
"jobs", # This property exists when a job is running
"last_available_upto",
"last_job_date",
"network_address", # TODO
"next_finaud_decision",
"next_state",
"rconsole", # TODO
"resource_id",
"scheduler_priority",
"state",
"state_num",
"switch", # TODO
"subnet_address",
"subnet_prefix",
"suspended_jobs",
"thread",
"type", # TODO
"vlan",
"pdu",
"id", #id from API (= resource_id from oarnodes)
"api_timestamp", # from API
"links", #from API
"ip_virtual" #from API
]
end
# Compute the HashDiff between the OAR server's view and the reference-repo
# view of a node's properties, skipping the keys listed by ignore_keys.
# NOTE(review): this appears to be the pre-refactor, node-only variant of
# diff_properties() retained by the diff view -- verify which of the two the
# callers actually use before keeping both.
def diff_node_properties(node_properties_oar, node_properties_ref)
node_properties_oar ||= {}
node_properties_ref ||= {}
ignore_keys.each { |key| node_properties_oar.delete(key) }
ignore_keys.each { |key| node_properties_ref.delete(key) }
# Ignore the 'state' property only if the node is not 'Dead' according to the reference-repo.
# Otherwise, we must enforce that the node state is also 'Dead' on the OAR server.
# On the OAR server, the 'state' property can be modified by phoenix. We ignore that.
if node_properties_ref['state'] != 'Dead'
node_properties_oar.delete('state')
node_properties_ref.delete('state')
elsif node_properties_ref.size == 1
# For dead nodes, when information is missing from the reference-repo, only enforce the 'state' property and ignore other differences.
return HashDiff.diff({'state' => node_properties_oar['state']}, {'state' => node_properties_ref['state']})
end
return HashDiff.diff(node_properties_oar, node_properties_ref)
end
# Return a list of properties (as a hash: { property1 => String, property2 => Fixnum, ... })
# We try to detect the type of the property (Fixnum/String) by looking at the existing values. This is not possible if no value is set (NilClass).
def get_property_keys(nodelist_properties)
properties_keys = {}
nodelist_properties.each { |site_uid, site_properties|
# We do not use site/cluster/node filters here as we want the same list of properties across OAR servers
site_properties.each { |node_uid, node_properties|
next if node_uid == nil
node_properties.each { |k, v|
next if properties_keys.key?(k)
next if NilClass === v # we cannot infer type if v is nil
next if v == false # also skip detection if 'v == false' because it seems that if a varchar property only as 'NO' values,
# it might be interpreted as a boolean (see the ib property at nantes: ib: NO in the YAML instead of ib: 'NO')
# ... to be fixed ?
properties_keys[k] = v.class
}
}
}
return properties_keys
"chassis",
"slash_16",
"slash_17",
"slash_18",
"slash_19",
"slash_20",
"slash_21",
"slash_22",
"available_upto",
"chunks",
"comment", # TODO
"core",
"cpu",
"cpuset",
"desktop_computing",
"drain",
"expiry_date",
"finaud_decision",
"grub",
"host", # TODO
"jobs", # This property exists when a job is running
"last_available_upto",
"last_job_date",
"network_address", # TODO
"next_finaud_decision",
"next_state",
"rconsole", # TODO
"resource_id",
"scheduler_priority",
"state",
"state_num",
"subnet_address",
"subnet_prefix",
"suspended_jobs",
"thread",
"type", # TODO
"vlan",
"pdu",
"id", # id from API (= resource_id from oarnodes)
"api_timestamp", # from API
"links", # from API
]
return ignore_keys
end
# Shebang and strict-mode prologue prepended to every generated oarcmd shell
# script (abort on error, on unset variables and on pipeline failures).
def oarcmd_script_header()
  [
    '#! /usr/bin/env bash',
    'set -eu',
    'set -o pipefail'
  ].join("\n") + "\n"
end
# Bash helper functions prepended to generated oarcmd scripts:
# node_exist tests whether a 'default' OAR resource exists for a host;
# disk_exist tests whether a 'disk' OAR resource exists for a host + disk id.
# NOTE(review): the heredoc below mixes the old helpers (nodelist,
# list_contains -- left unterminated, its closing brace is missing) with the
# new disk_exist helper from a merged diff; reconcile with the repository.
def oarcmd_create_node_header()
return <<EOF
nodelist=$(oarnodes -l)
node_exist () {
[[ $(oarnodes --sql "host='$1' and type='default'") ]]
}
list_contains () {
[[ "$1" =~ (^|[[:space:]])"$2"($|[[:space:]]) ]] && return 0 || return 1
disk_exist () {
[[ $(oarnodes --sql "host='$1' and type='disk' and disk=$2") ]]
}
EOF
end
def oarcmd_create_node(host, properties, node_hash) # host = grifffon-1.nancy.grid5000.fr; properties, node_hash: input of the reference API for the node
#return "# Cannot create #{host} : not enough information about it (node_hash['architecture']['nb_procs'], properties['cpucore'])" if node_hash['architecture'].nil? || properties['cpucore'].nil?
# Shell snippet echoing an 80-character '=' rule, used to visually separate
# sections in the generated scripts' output.
def oarcmd_separator
  "echo '#{'=' * 80}'\n\n"
end
node_uid, site_uid, grid_uid = host.split(".")
cluster_uid, node_number = node_uid.split("-")
def oarcmd_create_properties(properties_keys)
command = ''
properties_keys.each do |key, key_type|
if key_type == Fixnum
command += "oarproperty -a #{key} || true\n"
elsif key_type == String
command += "oarproperty -a #{key} --varchar || true\n"
else
raise "Error: the type of the '#{key}' property is unknown (Integer/String). Cannot generate the corresponding 'oarproperty' command. You must create this property manually ('oarproperty -a #{key} [--varchar]')"
end
end