#!/usr/bin/ruby
# lib-oar-properties: build, compare and apply OAR resource properties
# derived from the Grid'5000 reference repository.

require 'pp'
require 'erb'
require 'fileutils'
require 'pathname'
require 'json'
require 'time'
require 'yaml'
require 'hashdiff'
require 'set'

# Raised when a node description in the reference repository lacks an entry
# that is required to build its OAR properties.
class MissingProperty <  StandardError; end

# One mebibyte in bytes; used to convert ram_size values (bytes) into MiB.
MiB = 1024**2

# Get node properties from the reference repo hash
# See also: https://www.grid5000.fr/mediawiki/index.php/Reference_Repository

# Find the first mounted (or mountable) network adapter of a node whose
# 'interface' matches interface_re and whose 'rate' equals rate (bit/s).
# Returns the adapter hash, or nil if none matches.
def find_adapter_by_rate(node, interface_re, rate)
  node['network_adapters'].values.detect { |na|
    na['interface'] =~ interface_re &&
      (na['mounted'] == true || na['mountable'] == true) &&
      na['rate'] == rate
  }
end

# Build the hash of OAR properties for one node.
#
# cluster_uid - cluster name (e.g. 'griffon')
# cluster     - cluster hash from the reference repository
# node_uid    - node name (e.g. 'griffon-1')
# node        - node hash from the reference repository
#
# Returns a Hash mapping OAR property names to their values.
# Raises MissingProperty when a required entry is absent from the node hash.
def get_node_properties(cluster_uid, cluster, node_uid, node)
  h = {} # output

  if node['status'] == 'retired'
    h['state'] = 'Dead'
    return h if node.size == 1 # for dead nodes, additional information is most likely missing from the ref-repository.
  end

  # Prefer the primary (non-management) mounted ethernet adapter; fall back to
  # any enabled+mounted adapter. (The previous code computed the ethernet
  # candidate and then unconditionally overwrote it with the generic lookup,
  # leaving the first assignment dead.)
  main_network_adapter = node['network_adapters'].values.find{ |na| na['enabled'] && na['mounted'] && na['interface'] =~ /ethernet/i && !na['management'] }
  main_network_adapter ||= node['network_adapters'].values.find{ |na| na['enabled'] && na['mounted'] }
  raise MissingProperty, "Node #{node_uid} does not have a main network_adapter" unless main_network_adapter

  #  h['host']            = main_network_adapter['network_address']
  #TODO  raise MissingProperty, "Node #{node_uid} has no network_address" unless h['host']

  h['ip']              = main_network_adapter['ip']
  raise MissingProperty, "Node #{node_uid} has no IP" unless h['ip']
  h['cluster']         = cluster_uid
  h['nodemodel']       = cluster['model']
  h['switch']          = main_network_adapter['switch']
  h['besteffort']      = node['supported_job_types']['besteffort']
  h['deploy']          = node['supported_job_types']['deploy']
  h['ip_virtual']      = node['supported_job_types']['virtual'] == 'ivt'
  h['virtual']         = node['supported_job_types']['virtual']
  h['cpuarch']         = node['architecture']['platform_type']
  h['cpucore']         = node['architecture']['smt_size']/node['architecture']['smp_size'] # integer division: cores per CPU
  h['cputype']         = [node['processor']['model'], node['processor']['version']].join(' ')
  h['cpufreq']         = node['processor']['clock_speed']/1_000_000_000.0 # GHz
  h['disktype']        = (node['storage_devices'].first[1] || {})['interface']

  # Mounted or mountable ethernet adapters.
  eth = node['network_adapters'].values.select{ |na| na['interface'] =~ /ethernet/i }.select{ |na| na['mounted'] == true || na['mountable'] == true }
  h['ethnb']           = eth.length
  h['eth10g']          = eth.any? { |na| na['rate'] == 10_000_000_000 }

  # High-speed interconnects: for each kind, a presence flag plus the adapter
  # 'version' as the model (or 'none'). Insertion order of the keys matches
  # the historical output.
  {
    'ib10g'   => [/infiniband/i, 10_000_000_000],
    'ib20g'   => [/infiniband/i, 20_000_000_000],
    'ib40g'   => [/infiniband/i, 40_000_000_000],
    'ib56g'   => [/infiniband/i, 56_000_000_000],
    'myri10g' => [/myri/i,       10_000_000_000],
    'myri2g'  => [/myri/i,        2_000_000_000],
  }.each do |key, (interface_re, rate)|
    adapter          = find_adapter_by_rate(node, interface_re, rate)
    h[key]           = adapter ? true : false
    h[key + 'model'] = adapter ? adapter['version'] : 'none'
  end

  h['memcore']         = node['main_memory']['ram_size']/node['architecture']['smt_size']/MiB
  h['memcpu']          = node['main_memory']['ram_size']/node['architecture']['smp_size']/MiB
  h['memnode']         = node['main_memory']['ram_size']/MiB

  # 'gpu' may be absent, a boolean, or a model string (normalized to upper case).
  node['gpu']  ||= {}
  h['gpu']             = case node['gpu']['gpu'] when true; true; when false; false when nil; false; else node['gpu']['gpu'].upcase end
  h['gpu_count']       = node['gpu']['gpu_count']
  h['gpu_model']       = node['gpu']['gpu_model']

  node['monitoring'] ||= {}
  h['wattmeter'] = case node['monitoring']['wattmeter'] when true; true; when false; false when nil; false; else node['monitoring']['wattmeter'].upcase end

  # h['rconsole'] = node['monitoring']['rconsole']

  # Clusters without an explicit priority get one derived from their creation date (YYYYMM).
  h['cluster_priority'] = (cluster['priority'] || Time.parse(cluster['created_at'].to_s).strftime('%Y%m')).to_i

  h['production'] = false # default
  h['production'] = node['supported_job_types']['queues'].include?('production') if node['supported_job_types'] && node['supported_job_types'].has_key?('queues')

  h['max_walltime'] = 0 # default
  h['max_walltime'] = node['supported_job_types']['max_walltime'] if node['supported_job_types'] && node['supported_job_types'].has_key?('max_walltime')

  return h
end
102

103
104
105
106
107
108
109
110
111
112
113
114
115
# Build the OAR properties of every node of a site.
#
# site_uid - site name (unused here, kept for interface compatibility)
# site     - site hash from the reference repository
#
# Returns a Hash mapping node_uid => properties hash. Nodes whose description
# is incomplete (MissingProperty) are reported on stdout and skipped.
def get_nodelist_properties(site_uid, site)
  site['clusters'].each_with_object({}) do |(cluster_uid, cluster), properties|
    cluster['nodes'].each do |node_uid, node|
      begin
        properties[node_uid] = get_node_properties(cluster_uid, cluster, node_uid, node)
      rescue MissingProperty => e
        puts "Error while processing node #{node_uid}: #{e}"
      end
    end
  end
end

125
126
127
# Compare the OAR server's view of a node's properties with the view derived
# from the reference repository.
#
# node_properties_oar - properties as reported by OAR (may be nil)
# node_properties_ref - properties built from the reference repo (may be nil)
#
# Returns a HashDiff change list, after removing the properties that OAR
# manages itself (or that we deliberately do not synchronize).
def diff_node_properties(node_properties_oar, node_properties_ref)
  # Work on shallow copies: keys are stripped below with delete(), and this
  # function must not mutate the hashes owned by the caller.
  node_properties_oar = (node_properties_oar || {}).dup
  node_properties_ref = (node_properties_ref || {}).dup

  # default OAR at resource creation:
  #  available_upto: '2147483647'
  #  besteffort: 'YES'
  #  core: ~
  #  cpu: ~
  #  cpuset: 0
  #  deploy: 'NO'
  #  desktop_computing: 'NO'
  #  drain: 'NO'
  #  expiry_date: 0
  #  finaud_decision: 'YES'
  #  host: ~
  #  last_available_upto: 0
  #  last_job_date: 0
  #  network_address: server
  #  next_finaud_decision: 'NO'
  #  next_state: UnChanged
  #  resource_id: 9
  #  scheduler_priority: 0
  #  state: Suspected
  #  state_num: 3
  #  suspended_jobs: 'NO'
  #  type: default

  # Properties ignored by the comparison (managed by OAR itself, or not
  # synchronized from the reference repository yet).
  ignore_keys = [
                 "slash_16",
                 "slash_17",
                 "slash_18",
                 "slash_19",
                 "slash_20",
                 "slash_21",
                 "slash_22",
                 "available_upto",
                 "chunks",
                 "comment", # TODO
                 "core",
                 "cpu",
                 "cpuset",
                 "desktop_computing",
                 "drain",
                 "expiry_date",
                 "finaud_decision",
                 "grub",
                 "host", # TODO
                 "jobs", # This property exists when a job is running
                 "last_available_upto",
                 "last_job_date",
                 "maintenance",
                 "mic", # TODO
                 "network_address", # TODO
                 "next_finaud_decision",
                 "next_state",
                 "rconsole", # TODO
                 "resource_id",
                 "scheduler_priority",
                 "state_num",
                 "switch", # TODO
                 "subnet_address",
                 "subnet_prefix",
                 "suspended_jobs",
                 "thread",
                 "type", # TODO
                 "vlan",
                 "wattmeter" # TODO
                ]

  ignore_keys.each { |key| node_properties_oar.delete(key) }
  ignore_keys.each { |key| node_properties_ref.delete(key) }

  # Ignore the 'state' property only if the node is not 'Dead' according to the reference-repo.
  # Otherwise, we must enforce that the node state is also 'Dead' on the OAR server.
  # On the OAR server, the 'state' property can be modified by phoenix. We ignore that.
  if node_properties_ref['state'] != 'Dead'
    node_properties_oar.delete('state')
    node_properties_ref.delete('state')
  elsif node_properties_ref.size == 1
    # For dead nodes, when information is missing from the reference-repo, only enforce the 'state' property and ignore other differences.
    return HashDiff.diff({'state' => node_properties_oar['state']}, {'state' => node_properties_ref['state']})
  end

  return HashDiff.diff(node_properties_oar, node_properties_ref)
end

213
214
215
216
217
218
219
220
221
222
223
224
225
# Return the set of property names used across all sites.
# We deliberately do not apply site/cluster/node filters here, so that the
# same list of properties exists on every OAR server.
def get_property_keys(nodelist_properties)
  keys = Set.new
  nodelist_properties.each_value do |site_properties|
    site_properties.each do |node_uid, node_properties|
      keys.merge(node_properties.keys) unless node_uid.nil?
    end
  end
  keys
end

226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# Shell prelude for every generated script: abort on error or unset variable.
def oarcmd_script_header()
  "set -eu\n\n"
end

# Shell prelude for node-creation scripts: fetch the current OAR node list
# once, and define a list_contains helper that tests whether a host appears
# in that (whitespace-separated) list.
def oarcmd_create_node_header()
  header  = "nodelist=$(oarnodes -l)\n"
  header << "\n"
  header << "list_contains () { \n"
  header << "    [[ \"$1\" =~ (^|[[:space:]])\"$2\"($|[[:space:]]) ]] && return 0 || return 1\n"
  header << "}\n"
  header << "\n"
  header
end

# Generate the shell snippet that creates the OAR resources for one host,
# unless the host already appears in $nodelist (see oarcmd_create_node_header).
# host = grifffon-1.nancy.grid5000.fr; properties, node_hash: input of the reference API for the node
def oarcmd_create_node(host, properties, node_hash)
  #return "# Cannot create #{host} : not enough information about it (node_hash['architecture']['smp_size'], properties['cpucore'])" if node_hash['architecture'].nil? || properties['cpucore'].nil?

  node_uid, site_uid, grid_uid = host.split('.')
  cluster_uid, node_number     = node_uid.split('-')

  guard = %(list_contains "$nodelist" "#{host}")
  add   = "oar_resources_add -a --hosts 1 --host0 #{node_number}" \
          " --host-prefix #{cluster_uid}- --host-suffix .#{site_uid}.#{grid_uid}.fr" \
          " --cpus #{node_hash['architecture']['smp_size']} --cores #{properties['cpucore']}" \
          ' | bash'

  [
    "echo; echo 'Adding host #{host}:'",
    "#{guard} && echo '=> host already exist'",
    "#{guard} || #{add}"
  ].join("\n") + "\n"
end
260
261

# Generate the shell snippet that sets the OAR properties of one host with
# oarnodesetting. Returns "" when there is nothing to set.
#
# Value formatting: true/false become the OAR strings 'YES'/'NO'; nil-valued
# properties are skipped; other values are inspect()ed, then double quotes are
# turned into single quotes and pre-existing single quotes are escaped.
def oarcmd_set_node_properties(host, properties)
  #return "# #{host}: OK" if properties.size == 0
  return "" if properties.size == 0

  command  = "echo; echo 'Setting properties for #{host}:'; echo\n"
  command += "oarnodesetting -h #{host} -p "

  command +=
    properties.to_a.map{ |(k,v)|
    v = "YES" if v == true
    v = "NO"  if v == false

    # The block form of gsub is required here: a "\\'" replacement STRING is
    # interpreted by gsub as $' (the post-match substring), not as an escaped
    # quote, which corrupted values containing single quotes.
    ! v.nil? ? "#{k}=#{v.inspect.gsub("'") { "\\'" }.gsub("\"", "'")}" : nil
  }.compact.join(' -p ')

  return command + "\n"
end
278
279
280
281
# '

# Get the OAR properties from the OAR scheduler
# This is only needed for the -d option
#
# Returns a hash mapping short hostnames (e.g. 'griffon-1') to the OAR
# resource hash of their 'default' resource, as reported by 'oarnodes -Y'.
# When filename is given and exists it is used as a cache of the oarnodes
# output; otherwise the output is fetched over SSH (and cached in filename,
# if given). The previous code printed the literal text '#(unknown)' in its
# verbose messages instead of interpolating the filename.
def oarcmd_get_nodelist_properties(site_uid, filename=nil, options)
  oarnodes_yaml = ""

  if filename and File.exist?(filename)
    # Read oar properties from file
    puts "Read 'oarnodes -Y' from #{filename}" if options[:verbose]
    oarnodes_yaml = File.open(filename, 'rb') { |f| f.read }
  else
    # Download the oar properties from the oar server
    puts "Downloading 'oarnodes -Y' from " + options[:ssh][:host].gsub("%s", site_uid) + "..." if options[:verbose]

    Net::SSH.start(options[:ssh][:host].gsub("%s", site_uid), options[:ssh][:user], options[:ssh][:params]) { |ssh|
      # capture all stderr and stdout output from a remote process
      oarnodes_yaml = ssh.exec!('oarnodes -Y')
    }
    puts "... done" if options[:verbose]

    if filename
      # Cache the file
      puts "Save 'oarnodes -Y' as #{filename}" if options[:verbose]
      File.write(filename, oarnodes_yaml)
    end
  end

  # Load the YAML file into an hashtable
  h = YAML.load(oarnodes_yaml)

  # Format convertion: use host as keys of the hash (instead of id).
  # Non-default resources are collapsed into a single nil => nil entry.
  h = h.map {|k, v| v['type'] == 'default' ? [v['host'].split('.').first, v] : [nil, nil] }.to_h

  return h
end

315
316
317
318
319
320
321
# Generate the shell commands that declare each property on the OAR server.
# 'oarproperty -a' fails when the property already exists, hence '|| true'.
def oarcmd_create_properties(properties_keys)
  properties_keys.map { |key| "oarproperty -a #{key} || true\n" }.join
end
322

323
# sudo exec
324
325
326
327
328
def ssh_exec(site_uid, cmds, options)
  # The following is equivalent to : "cat cmds | bash"
  #res = ""
  c = Net::SSH.start(options[:ssh][:host].gsub("%s", site_uid), options[:ssh][:user], options[:ssh][:params])
  c.open_channel { |channel|
329
    channel.exec('sudo bash') { |ch, success|
330
331
332
333
334
335
336
337
338
339
340
341
      channel.on_data { |ch, data|
        puts data #if options[:verbose] # ssh cmd output
      }
      
      cmds.each { |cmd| 
        channel.send_data cmd 
      }
      channel.eof!
    }
  }
  c.loop
end