#!/usr/bin/ruby
# Helpers to synchronize OAR resource properties with the Grid'5000
# reference repository (generation, diffing and oarcmd script helpers).

require 'pp'
require 'erb'
require 'fileutils'
require 'pathname'
require 'json'
require 'time'
require 'yaml'
require 'hashdiff'
require 'set'

# Raised when a node entry in the reference repository lacks a property
# that is mandatory to build its OAR properties.
class MissingProperty < StandardError; end

# One mebibyte, used to express memory sizes in MiB.
MiB = 1024**2
# Get node properties from the reference repo hash
# See also: https://www.grid5000.fr/mediawiki/index.php/Reference_Repository
#
# cluster_uid, node_uid: short names (e.g. 'griffon', 'griffon-1').
# cluster, node: the matching hashes from the reference repository.
# Returns a hash mapping OAR property names to their expected values.
# Raises MissingProperty when mandatory information is absent.
def get_node_properties(cluster_uid, cluster, node_uid, node)
  h = {} # output

  if node['status'] == 'retired'
    h['state'] = 'Dead'
    return h if node.size == 1 # for dead nodes, additional information is most likely missing from the ref-repository.
  end

  # Prefer a mounted, non-management ethernet interface; fall back to any
  # enabled and mounted interface. (The original computed the preferred
  # lookup but unconditionally overwrote it with the fallback — dead store.)
  main_network_adapter = node['network_interfaces'].values.find{ |na| na['enabled'] && na['mounted'] && na['interface'] =~ /ethernet/i && !na['management'] }
  main_network_adapter ||= node['network_interfaces'].values.find{ |na| na['enabled'] && na['mounted'] }
  raise MissingProperty, "Node #{node_uid} does not have a main network_adapter" unless main_network_adapter

  #  h['host']            = main_network_adapter['network_address']
  #TODO  raise MissingProperty, "Node #{node_uid} has no network_address" unless h['host']

  h['ip']              = main_network_adapter['ip']
  raise MissingProperty, "Node #{node_uid} has no IP" unless h['ip']
  h['cluster']         = cluster_uid
  h['nodemodel']       = cluster['model']
  h['switch']          = main_network_adapter['switch']
  h['besteffort']      = node['supported_job_types']['besteffort']
  h['deploy']          = node['supported_job_types']['deploy']
  h['ip_virtual']      = node['supported_job_types']['virtual'] == 'ivt'
  h['virtual']         = node['supported_job_types']['virtual']
  h['cpuarch']         = node['architecture']['platform_type']
  h['cpucore']         = node['architecture']['smt_size']/node['architecture']['smp_size']
  h['cputype']         = [node['processor']['model'], node['processor']['version']].join(' ')
  h['cpufreq']         = node['processor']['clock_speed']/1_000_000_000.0
  h['disktype']        = (node['block_devices'].first[1] || {})['interface']

  # Ethernet interfaces that are either mounted or mountable.
  ethernet_ifaces      = node['network_interfaces'].values.select{|na| na['interface'] =~ /ethernet/i}.select{|nb| nb['mounted'] == true || nb['mountable'] == true}
  h['ethnb']           = ethernet_ifaces.length
  h['eth10g']          = ethernet_ifaces.any?{|na| na['rate'] == 10_000_000_000}

  # Helper: first usable (mounted or mountable) interface matching kind and rate.
  find_iface = lambda { |kind_re, rate|
    node['network_interfaces'].values.detect{|na| na['interface'] =~ kind_re && ( na['mounted'] == true || na['mountable'] == true ) && na['rate'] == rate}
  }

  # Infiniband flags and models at each supported rate.
  { 'ib10g' => 10, 'ib20g' => 20, 'ib40g' => 40, 'ib56g' => 56 }.each { |prop, gbps|
    iface              = find_iface.call(/infiniband/i, gbps * 1_000_000_000)
    h[prop]            = iface ? true : false
    h[prop + 'model']  = iface ? iface['version'] : 'none'
  }

  myri10g              = find_iface.call(/myri/i, 10_000_000_000)
  h['myri10g']         = myri10g ? true : false
  h['myri10gmodel']    = myri10g ? myri10g['version'] : 'none'

  myri2g               = find_iface.call(/myri/i, 2_000_000_000)
  h['myri2g']          = myri2g ? true : false
  h['myri2gmodel']     = myri2g ? myri2g['version'] : 'none'

  h['memcore']         = node['main_memory']['ram_size']/node['architecture']['smt_size']/MiB
  h['memcpu']          = node['main_memory']['ram_size']/node['architecture']['smp_size']/MiB
  h['memnode']         = node['main_memory']['ram_size']/MiB

  node['gpu']  ||= {}
  # 'gpu' may be a boolean or a model string in the ref-repo; strings are upcased.
  h['gpu']             = case node['gpu']['gpu'] when true; true; when false; false when nil; false; else node['gpu']['gpu'].upcase end
  h['gpu_count']       = node['gpu']['gpu_count']
  h['gpu_model']       = node['gpu']['gpu_model']

  node['monitoring'] ||= {}
  # Same boolean-or-string normalization as for 'gpu'.
  h['wattmeter'] = case node['monitoring']['wattmeter'] when true; true; when false; false when nil; false; else node['monitoring']['wattmeter'].upcase end

  # h['rconsole'] = node['monitoring']['rconsole']

  # Default priority derives from the cluster creation date (YYYYMM).
  h['cluster_priority'] = (cluster['priority'] || Time.parse(cluster['created_at'].to_s).strftime('%Y%m')).to_i

  h['production'] = false # default
  h['production'] = node['supported_job_types']['queues'].include?('production') if node['supported_job_types'] && node['supported_job_types'].has_key?('queues')

  h['max_walltime'] = 0 # default
  h['max_walltime'] = node['supported_job_types']['max_walltime'] if node['supported_job_types'] && node['supported_job_types'].has_key?('max_walltime')

  return h
end
# Build the expected OAR properties for every node of a site.
#
# site_uid: the site short name; site: the site hash from the reference repo.
# Returns a hash: node_uid => properties hash (see get_node_properties).
# Nodes with missing mandatory information are silently skipped (best effort).
def get_nodelist_properties(site_uid, site)
  properties = {} # output

  site['clusters'].each do |cluster_uid, cluster|
    cluster['nodes'].each do |node_uid, node|
      begin
        properties[node_uid] = get_node_properties(cluster_uid, cluster, node_uid, node)
      rescue MissingProperty => e
        # Deliberate best-effort: incomplete nodes are skipped.
        # TODO
        #puts "Error while processing node #{node_uid}: #{e}"
      end
    end
  end

  return properties
end

# Compute the differences between a node's current OAR properties (OAR server
# side) and the properties expected from the reference repository.
#
# Either argument may be nil (node missing on one side); it is treated as {}.
# Properties listed in ignore_keys are removed from BOTH sides before diffing.
# Returns a HashDiff-style array of [op, key, value(s)] entries
# (empty array when the node is in sync).
def diff_node_properties(node_properties_oar, node_properties_ref)
  node_properties_oar ||= {}
  node_properties_ref ||= {}

  # default OAR at resource creation:
  #  available_upto: '2147483647'
  #  besteffort: 'YES'
  #  core: ~
  #  cpu: ~
  #  cpuset: 0
  #  deploy: 'NO'
  #  desktop_computing: 'NO'
  #  drain: 'NO'
  #  expiry_date: 0
  #  finaud_decision: 'YES'
  #  host: ~
  #  last_available_upto: 0
  #  last_job_date: 0
  #  network_address: server
  #  next_finaud_decision: 'NO'
  #  next_state: UnChanged
  #  resource_id: 9
  #  scheduler_priority: 0
  #  state: Suspected
  #  state_num: 3
  #  suspended_jobs: 'NO'
  #  type: default

  # Properties managed by OAR itself or not (yet) synchronized from the
  # reference repository; excluded from the comparison.
  ignore_keys = [
                 "slash_16",
                 "slash_17",
                 "slash_18",
                 "slash_19",
                 "slash_20",
                 "slash_21",
                 "slash_22",
                 "available_upto",
                 "chunks",
                 "comment", # TODO
                 "core",
                 "cpu",
                 "cpuset",
                 "desktop_computing",
                 "drain",
                 "expiry_date",
                 "finaud_decision",
                 "grub",
                 "host", # TODO
                 "jobs", # This property exists when a job is running
                 "last_available_upto",
                 "last_job_date",
                 "maintenance",
                 "mic", # TODO
                 "network_address", # TODO
                 "next_finaud_decision",
                 "next_state",
                 "rconsole", # TODO
                 "resource_id",
                 "scheduler_priority",
                 "state_num",
                 "switch", # TODO
                 "subnet_address",
                 "subnet_prefix",
                 "suspended_jobs",
                 "thread",
                 "type", # TODO
                 "vlan",
                 "wattmeter" # TODO
                ]

  ignore_keys.each { |key| node_properties_oar.delete(key) }
  ignore_keys.each { |key| node_properties_ref.delete(key) }

  # Ignore the 'state' property only if the node is not 'Dead' according to the reference-repo.
  # Otherwise, we must enforce that the node state is also 'Dead' on the OAR server.
  # On the OAR server, the 'state' property can be modified by phoenix. We ignore that.
  if node_properties_ref['state'] != 'Dead'
    node_properties_oar.delete('state')
    node_properties_ref.delete('state')
  elsif node_properties_ref.size == 1
    # For dead nodes, when information is missing from the reference-repo, only enforce the 'state' property and ignore other differences.
    return HashDiff.diff({'state' => node_properties_oar['state']}, {'state' => node_properties_ref['state']})
  end

  return HashDiff.diff(node_properties_oar, node_properties_ref)
end

# Return the set of all property names used across all sites/nodes.
# nodelist_properties: {site_uid => {node_uid => properties hash}}.
# The nil node key (non-default OAR resources) is skipped.
def get_property_keys(nodelist_properties)
  properties_keys = Set.new
  nodelist_properties.each { |site_uid, site_properties|
    # We do not use site/cluster/node filters here as we want the same list of properties across OAR servers
    site_properties.each { |node_uid, node_properties|
      next if node_uid == nil
      properties_keys.merge(node_properties.keys)
    }
  }
  return properties_keys
end

# Shell prologue for generated oarcmd scripts: abort on error / unset vars.
def oarcmd_script_header()
  return <<EOF
set -eu

EOF
end

# Shell helpers shared by the per-node creation snippets: fetches the current
# OAR node list and defines list_contains() to test membership in it.
def oarcmd_create_node_header()
  header = <<EOF
nodelist=$(oarnodes -l)

list_contains () { 
    [[ "$1" =~ (^|[[:space:]])"$2"($|[[:space:]]) ]] && return 0 || return 1
}

EOF
  header
end

# Build the shell snippet that registers a node's OAR resources unless the
# host already exists in $nodelist (see oarcmd_create_node_header).
# host = griffon-1.nancy.grid5000.fr; properties, node_hash: input of the reference API for the node.
def oarcmd_create_node(host, properties, node_hash)
  #return "# Cannot create #{host} : not enough information about it (node_hash['architecture']['smp_size'], properties['cpucore'])" if node_hash['architecture'].nil? || properties['cpucore'].nil?

  node_uid, site_uid, grid_uid = host.split(".")
  cluster_uid, node_number     = node_uid.split("-")

  command  = "echo; echo 'Adding host #{host}:'\n"
  command += 'list_contains "$nodelist" "' + host + '" && '
  command += "echo '=> host already exist'\n"
  command += 'list_contains "$nodelist" "' + host + '" || '
  command += "sudo oar_resources_add -a --hosts 1 --host0 #{node_number} --host-prefix #{cluster_uid}- --host-suffix .#{site_uid}.#{grid_uid}.fr --cpus #{node_hash['architecture']['smp_size']} --cores #{properties['cpucore']}"
  command += ' | sudo bash'

  return command + "\n"
end
# Build the oarnodesetting command that assigns the given properties to host.
# Booleans are converted to OAR's 'YES'/'NO'; nil values are skipped;
# string values end up single-quoted. Returns "" when there is nothing to set.
def oarcmd_set_node_properties(host, properties)
  #return "# #{host}: OK" if properties.size == 0
  return "" if properties.size == 0

  command  = "echo; echo 'Setting properties for #{host}:'; echo\n"
  command += "sudo oarnodesetting -h #{host} -p "

  command +=
    properties.to_a.map{ |(k,v)|
    v = "YES" if v == true
    v = "NO"  if v == false

    # inspect gives a double-quoted literal; escape embedded single quotes,
    # then turn the surrounding double quotes into single quotes for the shell.
    # BUG FIX: gsub("'", "\\'") means "replace with post-match" in Ruby, not
    # an escaped quote — the block form inserts a literal backslash-quote.
    ! v.nil? ? "#{k}=#{v.inspect.gsub("'") { "\\'" }.gsub("\"", "'")}" : nil
  }.compact.join(' -p ')

  return command + "\n"
end
# Get the OAR properties from the OAR scheduler
# This is only needed for the -d option
#
# Reads cached 'oarnodes -Y' output from filename when it exists; otherwise
# runs 'oarnodes -Y' on the site's OAR server over SSH (options[:ssh]) and
# caches the output to filename when one is given.
# Returns a hash indexed by short hostname for 'default'-type resources
# (all other resource types collapse under a single nil key).
def oarcmd_get_nodelist_properties(site_uid, filename=nil, options)
  oarnodes_yaml = ""

  if filename and File.exist?(filename)
    # Read oar properties from file
    puts "Read 'oarnodes -Y' from #{filename}" if options[:verbose]
    oarnodes_yaml = File.open(filename, 'rb') { |f| f.read }
  else
    # Download the oar properties from the oar server
    puts "Downloading 'oarnodes -Y' from " + options[:ssh][:host].gsub("%s", site_uid) + "..." if options[:verbose]

    Net::SSH.start(options[:ssh][:host].gsub("%s", site_uid), options[:ssh][:user], options[:ssh][:params]) { |ssh|
      # capture all stderr and stdout output from a remote process
      oarnodes_yaml = ssh.exec!('oarnodes -Y')
    }
    puts "... done" if options[:verbose]

    if filename
      # Cache the file
      puts "Save 'oarnodes -Y' as #{filename}" if options[:verbose]
      File.write(filename, oarnodes_yaml)
    end
  end

  # Load the YAML file into an hashtable
  h = YAML.load(oarnodes_yaml)

  # Format convertion: use host as keys of the hash (instead of id)
  h = h.map {|k, v| v['type'] == 'default' ? [v['host'].split('.').first, v] : [nil, nil] }.to_h

  return h
end

# Return a shell snippet that registers each property name with
# 'oarproperty -a'; '|| true' keeps the script idempotent when the
# property already exists.
def oarcmd_create_properties(properties_keys)
  command = ""
  properties_keys.each { |key|
    command += "oarproperty -a #{key} || true\n"
  }
  return command
end
# Pipe the given command strings into a remote 'bash' over SSH
# (equivalent to: cat cmds | ssh <site frontend> bash).
# Remote output is echoed to stdout. The session is closed on exit
# (the original leaked it).
def ssh_exec(site_uid, cmds, options)
  # The following is equivalent to : "cat cmds | bash"
  #res = ""
  c = Net::SSH.start(options[:ssh][:host].gsub("%s", site_uid), options[:ssh][:user], options[:ssh][:params])
  c.open_channel { |channel|
    channel.exec('bash') { |ch, success|
      channel.on_data { |ch, data|
        puts data #if options[:verbose] # ssh cmd output
      }

      cmds.each { |cmd|
        channel.send_data cmd
      }
      channel.eof!
    }
  }
  c.loop
ensure
  c.close if c && !c.closed?
end