Une MAJ de sécurité est nécessaire sur notre version actuelle. Elle sera effectuée lundi 02/08 entre 12h30 et 13h. L'interruption de service devrait durer quelques minutes (probablement moins de 5 minutes).

Commit 5598e167 authored by Jérémie Gaidamour's avatar Jérémie Gaidamour
Browse files

[dev] run-g5kchecks implemented -s, -c, -n with oar reservations

parent a2df87c8
......@@ -8,9 +8,10 @@
# - If an output YAML file already exist in ouput/, the execution of g5k-check on the corresponding node is skipped.
# - Use postprocessing.rb for moving the file in th input/ directory. This script also edits some keys of the YAML files.
require 'optparse'
require 'fileutils'
require 'cute'
require 'peach'
require 'fileutils'
require '../lib/input_loader'
require 'pp'
......@@ -61,7 +62,7 @@ OptionParser.new do |opts|
opts.separator ""
opts.separator "Node reservation options:"
opts.on('-q', '--queue', 'Specify an OAR reservation queue') do |q|
opts.on('-qQUEUE', '--queue=queue', String, 'Specify an OAR reservation queue') do |q|
options[:queue] = q
end
......@@ -112,8 +113,6 @@ puts "Options: #{options}" if options[:verbose]
#
#
$jobs = [] # list of OAR reservation
FileUtils::mkdir_p("output/")
def run_g5kcheck(site_uid, fnode_uid, options)
......@@ -139,7 +138,9 @@ end
def oarsub(site_uid, resources, queue)
begin
$jobs << $g5k.reserve(:site => site_uid, :resources => resources, :walltime => '00:30:00', :wait => false, :queue => queue)
$g5k.reserve(:site => site_uid, :resources => resources, :walltime => '00:30:00', :wait => false, :queue => queue)
rescue Exception => e
puts "#{site_uid}: Error during the reservation '#{resources}' at #{site_uid} - #{e.class}: #{e.message}"
end
......@@ -176,7 +177,7 @@ if options[:force]
end
nodes_status = $g5k.nodes_status(site_uid) if nodes_status.nil?
if prompt != 'yes-all' && nodes_status["#{node_uid}.#{site_uid}.grid5000.fr"] && nodes_status["#{fnode_uid}"] == "busy"
if prompt != 'yes-all' && nodes_status[fnode_uid] && nodes_status[fnode_uid] == "busy"
if prompt != 'no-all'
printf "#{site_uid} - #{node_uid} is busy (ie. there is currently an OAR reservation. Run g5k-checks on reserved nodes ? (y/yes-all/no-all/N) "
prompt = STDIN.gets.chomp
......@@ -197,11 +198,111 @@ if options[:force]
}
}
else # options[:force]
else # ! options[:force]
puts 'Temporarily disabled. Use --force'
exit
options[:sites].peach { |site_uid|
end
jobs = [] # list of OAR reservation
begin
#
# Node reservation
#
nodes_status = $g5k.nodes_status(site_uid)
if options[:nodes]
# Reserve nodes one by one
options[:nodes].each { |node_uid| jobs << oarsub(site_uid, "{host='#{node_uid.split('.')[0]}'}", options[:queue]) }
else
clusters = $g5k.cluster_uids(site_uid)
# Reserve as many free node as possible in one reservation
if options[:clusters]
options[:clusters].each { |cluster_uid|
jobs << oarsub(site_uid, "{cluster='#{cluster_uid}'}/nodes=BEST", options[:queue]) if clusters.include?(cluster_uid)
}
else
jobs << oarsub(site_uid, "nodes=BEST", options[:queue])
end
# Reserve busy nodes one by one
$g5k.nodes_status(site_uid).each { |fnode_uid, status|
cluster_uid = fnode_uid.split(/-/).first
next if options[:clusters] && ! options[:clusters].include?(cluster_uid)
next if File.exist?("output/#{fnode_uid}.yaml") # skip reservation if we alread have the node info
next if status != "busy" # only busy nodes
jobs << oarsub(site_uid, "{host='#{fnode_uid}'}", options[:queue])
}
end
#
# Process running jobs
#
released_jobs = {};
loop do
waiting_jobs = $g5k.get_my_jobs(site_uid, state='waiting')
running_jobs = $g5k.get_my_jobs(site_uid, state='running')
launching_jobs = $g5k.get_my_jobs(site_uid, state='launching')
puts "#{site_uid}: Running: #{running_jobs.size} - Waiting: #{waiting_jobs.size} - Launching: #{launching_jobs.size}"
running_jobs.each { |job|
job_uid = job['uid']
next unless jobs.any? { |j| j['uid'] == job_uid } # skip reservations that are not related to this script
if released_jobs[job_uid]
puts "#{site_uid}: #{job_uid} already processed" # AOR job deletions can take some times
next
end
puts "#{site_uid}: Processing #{job_uid}"
job['assigned_nodes'].peach(10) { |fnode_uid|
next if File.exist?("output/#{fnode_uid}.yaml")
run_g5kcheck(site_uid, fnode_uid, options)
}
puts "#{site_uid}: Release #{job_uid}"
begin
$g5k.release(job)
released_jobs[job_uid] = true
rescue Exception => e
puts "#{site_uid}: Error while releasing job #{job_uid} - #{e.class}: #{e.message}"
end
}
# Stop when there isn't any job left
break if running_jobs.empty? and waiting_jobs.empty? and launching_jobs.empty?
# Wait a little bit when the previous loop iteration did not find any job to process
sleep(5) if running_jobs.empty?
end
rescue Exception => e
puts "#{e.class}: #{e.message}"
ensure
jobs.each { |job|
puts "Release job #{job['links'][0]['href']}"
$g5k.release(job)
} if jobs.size > 0
end # begin/rescue/ensure
} # options[:sites].each
end # options[:force]
`ruby postprocessing.rb`
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment