Commit 6cb50e47 authored by Jérémie Gaidamour's avatar Jérémie Gaidamour
Browse files

[dev] run-g5kchecks: added a --force option

* this option still checks if the node is busy
* I also improved ssh error handling
* I also added -s, -c, -n, -q, --ssh-keys, -h options
parent fc108bbb
......@@ -11,6 +11,8 @@ require 'pathname'
require 'yaml'
require '../lib/hash/hash'
puts 'Postprocessing of output/. Copying files into ../input/'
list_of_yaml_files = Dir['output/*.y*ml'].sort_by { |x| -x.count('/') }
list_of_yaml_files.each { |filename|
file = filename.split("/")[1]
......@@ -19,8 +21,12 @@ list_of_yaml_files.each { |filename|
cluster_uid = node_uid.split("-")[0]
hash = YAML::load_file(filename)
if hash == false
puts "Error found in #{filename}"
next
end
hash["storage_devices"] = hash.delete("block_devices")
hash["storage_devices"] = hash.delete("block_devices")
hash["storage_devices"] = hash["storage_devices"].sort_by_array(["sda", "sdb", "sdc", "sdd", "sde"])
hash["storage_devices"].each {|k, v| v.delete("device") }
......
......@@ -11,120 +11,197 @@
require 'cute'
require 'peach'
require 'fileutils'
require '../lib/input_loader'
require 'pp'
options ||= {}
options[:ssh] ||= {}
options[:ssh][:user] = 'g5kadmin' unless options[:ssh][:user]
options[:ssh][:host] = '%s.g5kadmin' unless options[:ssh][:host]
options[:ssh][:params] ||= {:timeout => 10}
# puts 'Init ruby-cute'
$g5k = Cute::G5K::API.new()
# puts '...done'
options[:queue] = 'admin'
options[:queue] = 'default'
# puts 'Get site_uids'
sites = $g5k.site_uids()
# puts '...done'
jobs = []
#
# Parse command line parameters
#
FileUtils::mkdir_p("output/")
options = {}
options[:sites] = sites # %w{grenoble lille luxembourg lyon nancy nantes reims rennes sophia}
def run_g5kcheck(site_uid, node_uid, options)
puts "#{site_uid}: Processing #{node_uid}"
begin
Net::SSH.start(options[:ssh][:host].gsub("%s", site_uid), options[:ssh][:user], options[:ssh][:params]) { |ssh|
output1 = ssh.exec!("sudo ssh -o StrictHostKeychecking=no root@#{node_uid} '/usr/bin/g5k-checks -m api'")
output2 = ssh.exec!("sudo ssh -q -o StrictHostKeychecking=no root@#{node_uid} 'cat /tmp/#{node_uid}.yaml'")
File.open("output/#{node_uid}.yaml", 'w') do |f|
f.write(output2)
end
}
rescue Exception => e
puts "#{site_uid}: Error while processing #{node_uid} - #{e.class}: #{e.message}"
OptionParser.new do |opts|
opts.banner = "Usage: oar-properties.rb [options]"
opts.separator ""
opts.separator "Example: ruby run-g5kchecks.rb -s nancy -n graoully-1 # make a reservation on graoully-1 and run g5k-checks"
opts.separator " ruby run-g5kchecks.rb -s nancy -n graoully-1 --force # run g5k-checks without making a reservation (for dead node)"
# opts.separator " ruby run-g5kchecks.rb # make a reservation on every nodes"
###
opts.separator ""
opts.separator "Filters:"
opts.on('-s', '--sites a,b,c', Array, 'Select site(s)',
"Default: "+options[:sites].join(", ")) do |s|
raise "Wrong argument for -s option." unless (s - options[:sites]).empty?
options[:sites] = s
end
end
begin
opts.on('-c', '--clusters a,b,c', Array, 'Select clusters(s). Default: all') do |s|
options[:clusters] = s
end
g5k = Cute::G5K::API.new()
opts.on('-n', '--nodes a,b,c', Array, 'Select nodes(s). Default: all') do |n|
options[:nodes] = n
end
g5k.site_uids().peach { |site_uid|
###
#
# Node reservation
#
opts.separator ""
opts.separator "Node reservation options:"
# Reserve as many free nodes as possible in one reservation
begin
jobs << g5k.reserve(:site => site_uid, :resources => "nodes=BEST", :walltime => '00:30:00', :wait => false, :queue => options[:queue])
rescue Exception => e
puts "#{site_uid}: Error during the reservation nodes=BEST"
end
# -q/--queue takes a mandatory argument (the OAR queue name). The QUEUE
# placeholder is what tells OptionParser to consume the next word as the
# value; without it the switch is a plain flag and the block receives `true`.
opts.on('-q', '--queue QUEUE', String, 'Specify an OAR reservation queue') do |q|
  options[:queue] = q
end
# Reserve busy nodes one by one
g5k.nodes_status(site_uid).each { |node_uid, status|
next if File.exist?("output/#{node_uid}.yaml") # skip reservation if we already have the node info
next if status != "busy" # only busy nodes
opts.on('-f', '--force',
'Run g5k-checks on the nodes without any OAR reservation',
'This option is meant to be used for dead nodes',
'or if you already reserved the ressources') do |f|
options[:force] = true
end
begin
jobs << g5k.reserve(:site => site_uid, :resources => "{host='#{node_uid}'}", :walltime => '00:30:00', :wait => false, :queue => options[:queue])
rescue Exception => e
puts "#{site_uid}: Error during the reservation of #{node_uid} - #{e.class}: #{e.message}"
end
}
###
###
#
# Process running jobs
#
released_jobs = {};
###
loop do
waiting_jobs = g5k.get_my_jobs(site_uid, state='waiting')
running_jobs = g5k.get_my_jobs(site_uid, state='running')
launching_jobs = g5k.get_my_jobs(site_uid, state='launching')
opts.separator ""
opts.separator "SSH options:"
puts "#{site_uid}: Running: #{running_jobs.size} - Waiting: #{waiting_jobs.size} - Launching: #{launching_jobs.size}"
# --ssh-keys accepts a comma-separated list; OptionParser hands it over as an
# Array of strings. The entries are appended one by one (concat) so that
# options[:ssh][:params][:keys] stays a flat list of key paths, also when the
# option is given several times.
# NOTE(review): creating options[:ssh][:params] here means a later
# `options[:ssh][:params] ||= {:timeout => 10}` no longer applies its default
# timeout when --ssh-keys is used - confirm whether that is intended.
opts.on('--ssh-keys k1,k2,k3', Array, 'SSH keys') do |k|
  options[:ssh] ||= {}
  options[:ssh][:params] ||= {}
  options[:ssh][:params][:keys] ||= []
  options[:ssh][:params][:keys].concat(k)
end
###
opts.separator ""
opts.separator "Common options:"
# Print an options summary.
opts.on_tail("-h", "--help", "Show this message") do
puts opts
exit
end
end.parse!
running_jobs.each { |job|
job_uid = job['uid']
if released_jobs[job_uid]
puts "#{site_uid}: #{job_uid} already processed"
next
end
options[:ssh] ||= {}
options[:ssh][:user] = 'g5kadmin' unless options[:ssh][:user]
options[:ssh][:host] = '%s.g5kadmin' unless options[:ssh][:host]
options[:ssh][:params] ||= {:timeout => 10}
options[:queue] ||= 'admin'
puts "#{site_uid}: Processing #{job_uid}"
puts "Options: #{options}" if options[:verbose]
job['assigned_nodes'].peach(10) { |node_uid|
#
#
#
next if File.exist?("output/#{node_uid}.yaml")
$jobs = [] # list of OAR reservation
run_g5kcheck(site_uid, node_uid, options)
}
puts "#{site_uid}: Release #{job_uid}"
begin
g5k.release(job)
released_jobs[job_uid] = true
rescue Exception => e
puts "#{site_uid}: Error while releasing job #{job_uid} - #{e.class}: #{e.message}"
end
}
FileUtils::mkdir_p("output/")
# Run g5k-checks on one node over SSH and store the YAML file it leaves behind.
#
# site_uid  - site name; substituted for "%s" in options[:ssh][:host]
# fnode_uid - node host name, used as the inner ssh target and in the file names
# options   - hash; reads options[:ssh][:host], options[:ssh][:user] and
#             options[:ssh][:params]
#
# Any exception raised while connecting or running the commands is printed
# and not re-raised.
def run_g5kcheck(site_uid, fnode_uid, options)
puts "#{site_uid}: Processing #{fnode_uid}"
# NOTE(review): the comment and `break` on the next two lines reference
# variables that are not defined in this method and look like removed-side
# diff residue interleaved into the body - confirm against the real file.
# Stop when there isn't any job left
break if running_jobs.empty? and waiting_jobs.empty? and launching_jobs.empty?
begin
# Connect to the per-site host, then reach the node as root through `sudo ssh`.
Net::SSH.start(options[:ssh][:host].gsub("%s", site_uid), options[:ssh][:user], options[:ssh][:params]) { |ssh|
# Run g5k-checks with `-m api`; this output is only printed when no YAML comes back.
output1 = ssh.exec!("sudo ssh -o StrictHostKeychecking=no root@#{fnode_uid} '/usr/bin/g5k-checks -m api'")
# Fetch /tmp/<fnode_uid>.yaml from the node (presumably written by the run above - TODO confirm).
output2 = ssh.exec!("sudo ssh -q -o StrictHostKeychecking=no root@#{fnode_uid} 'cat /tmp/#{fnode_uid}.yaml'")
if output2 == ''
# Nothing was returned by `cat`: show the first command's output and write no file.
puts output1 # ex: "ssh: connect to host graphite-1.nancy.grid5000.fr port 22: No route to host\r\n"
else
# Save the node description under output/.
File.open("output/#{fnode_uid}.yaml", 'w') do |f|
f.write(output2)
end
end
}
rescue Exception => e
puts "#{site_uid}: Error while processing #{fnode_uid} - #{e.class}: #{e.message}"
end
end
# Wait a little bit when the previous loop iteration did not find any job to process
sleep(5) if running_jobs.empty?
# Submit an OAR reservation and remember it in the global $jobs list.
#
# site_uid  - site identifier, passed to the API as :site
# resources - resource request string, passed as :resources
# queue     - queue name, passed as :queue
#
# The reservation is requested through $g5k.reserve with a 30 minute walltime
# and :wait => false. A failed reservation is reported on stdout and otherwise
# ignored so that the caller can carry on with the remaining reservations.
# Only StandardError is rescued: Interrupt (Ctrl-C) and SystemExit must still
# be able to stop the script.
def oarsub(site_uid, resources, queue)
  $jobs << $g5k.reserve(:site => site_uid, :resources => resources, :walltime => '00:30:00', :wait => false, :queue => queue)
rescue StandardError => e
  puts "#{site_uid}: Error during the reservation '#{resources}' at #{site_uid} - #{e.class}: #{e.message}"
end
if options[:force]
# puts 'Get input/'
refapi_hash = load_yaml_file_hierarchy("../input/grid5000/") # use input/ as nodes might not be registered in the OAR db yet (for new cluster installations)
# puts '...done'
run_queue ||= {}
# Safeguard. Ask before running g5k-checks on reserved nodes (we should not interfere with user experiments).
prompt = ''
options[:sites].each { |site_uid|
run_queue[site_uid] ||= []
# puts "Get node status at #{site_uid}"
nodes_status = nil # postpone query
# puts '...done'
refapi_hash['sites'][site_uid]["clusters"].peach { |cluster_uid, cluster|
next if options[:clusters] && ! options[:clusters].include?(cluster_uid)
end
# Queue every selected node of this cluster for a g5k-checks run, asking for
# confirmation before touching a node whose status is "busy".
cluster["nodes"].each_sort_by_node_uid { |node_uid, _node|
  # Honour the -n/--nodes filter.
  next if options[:nodes] && ! options[:nodes].include?(node_uid)

  fnode_uid = "#{node_uid}.#{site_uid}.grid5000.fr"

  # Never overwrite an existing result; the user has to delete it explicitly.
  if File.exist?("output/#{fnode_uid}.yaml")
    puts "output/#{fnode_uid}.yaml exist. Remove this file if you want to run g5k-checks again on this node."
    next
  end

  # The status query is postponed until a node actually needs it.
  nodes_status = $g5k.nodes_status(site_uid) if nodes_status.nil?

  # A missing status entry compares unequal to "busy", so unknown nodes are
  # queued without prompting.
  if prompt != 'yes-all' && nodes_status[fnode_uid] == "busy"
    # 'no-all' silently skips every remaining busy node.
    if prompt != 'no-all'
      # `print` rather than `printf`: the message is already interpolated and
      # must not be interpreted as a format string.
      print "#{site_uid} - #{node_uid} is busy (ie. there is currently an OAR reservation. Run g5k-checks on reserved nodes ? (y/yes-all/no-all/N) "
      # STDIN.gets returns nil on EOF; treat that like an empty answer (= No).
      prompt = STDIN.gets.to_s.chomp
      run_queue[site_uid] << fnode_uid if prompt == 'y' || prompt == 'yes-all'
    end
  else
    run_queue[site_uid] << fnode_uid
  end
}
}
}
rescue Exception => e
puts "#{e.class}: #{e.message}"
ensure
jobs.each { |job|
puts "Release job #{job['links'][0]['href']}"
g5k.release(job)
# Actual run
run_queue.peach { |site_uid, q|
q.peach { |fnode_uid|
run_g5kcheck(site_uid, fnode_uid, options)
}
}
else # options[:force]
puts 'Temporarily disabled. Use --force'
exit
end
`ruby postprocessing.rb`
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment