run-g5kchecks.rb 11.9 KB
Newer Older
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
1 2
# This script reserves nodes and then runs g5k-check as root using the g5kadmin credentials.
#
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
3
# Usage: cd run-g5kcheck; ruby run-g5kchecks.rb
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
4 5 6 7 8 9 10
#
# - You can edit the node reservation at the beginning of the script (or create reservation manually).
# - The script will run g5k-checks on every node that has been reserved.
# - Output YAML files of g5k-checks are stored in output/
# - If an output YAML file already exists in output/, the execution of g5k-checks on the corresponding node is skipped.
# - Use postprocessing.rb to move the files into the input/ directory. That script also edits some keys of the YAML files.

11 12
require 'optparse'
require 'fileutils'
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
13 14
require 'cute'
require 'peach'
15
require '../lib/input_loader'
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
16 17
require 'pp'

18 19 20 21 22
# Stop early on interpreters older than the minimum supported version.
# Versions are compared as Gem::Version objects rather than as plain strings,
# because a string comparison orders "10.0" before "2.1".
if Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.1')
  # abort prints the message on stderr and exits with a non-zero status.
  abort "This script requires ruby >= 2.1"
end

23 24 25
# puts 'Init ruby-cute'
$g5k = Cute::G5K::API.new()
# puts '...done'
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
26

27
# puts 'Get site_uids'
28 29 30 31 32
# Ask the API for the list of site uids. If that call raises, report the error
# and read the site names from the local YAML hierarchy instead.
sites =
  begin
    $g5k.site_uids
  rescue Exception => e
    puts "Error while getting the site list with ruby-cute: #{e.class}: #{e.message}"
    puts "API unavailable ?"
    refapi = load_yaml_file_hierarchy("../../input/grid5000/")
    refapi["sites"].keys
  end
36
# puts '...done'
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
37

38 39 40
#
# Parse command line parameters
#
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
41

42 43
options = {}
options[:sites] = sites # %w{grenoble lille luxembourg lyon nancy nantes reims rennes sophia}
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
44

45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
# Command line definition. Every handler stores its value in the `options`
# hash; `parse!` consumes the recognised switches from ARGV.
OptionParser.new do |opts|
  opts.banner = "Usage: run-g5kchecks.rb [options]"

  opts.separator ""
  opts.separator "Example: ruby run-g5kchecks.rb -s nancy -n graoully-1         # make a reservation on graoully-1 and run g5k-checks"
  opts.separator "         ruby run-g5kchecks.rb -s nancy -n graoully-1 --force # run g5k-checks without making a reservation (for dead node)"

  ###

  opts.separator ""
  opts.separator "Filters:"

  # Only sites from the default list (options[:sites] at definition time) are accepted.
  opts.on('-s', '--sites a,b,c', Array, 'Select site(s)',
          "Default: "+options[:sites].join(", ")) do |s|
    raise "Wrong argument for -s option." unless (s - options[:sites]).empty?
    options[:sites] = s
  end

  opts.on('-c', '--clusters a,b,c', Array, 'Select clusters(s). Default: all') do |s|
    options[:clusters] = s
  end

  opts.on('-n', '--nodes a,b,c', Array, 'Select nodes(s). Default: all') do |n|
    options[:nodes] = n
  end

  ###

  opts.separator ""
  opts.separator "Node reservation options:"

  opts.on('-qQUEUE', '--queue=queue', String, 'Specify an OAR reservation queue') do |q|
    options[:queue] = q
  end

  opts.on('-f', '--force',
          'Run g5k-checks on the nodes without any OAR reservation',
          'This option is meant to be used for dead nodes',
          'or if you already reserved the ressources') do
    options[:force] = true
  end

  ###

  opts.separator ""
  opts.separator "SSH options:"

  opts.on('--ssh-keys k1,k2,k3', Array, 'SSH keys') do |k|
    options[:ssh] ||= {}
    options[:ssh][:params] ||= {}
    options[:ssh][:params][:keys] ||= []
    # k is already an Array: append its elements instead of nesting the
    # array itself, so :keys stays a flat list of key paths.
    options[:ssh][:params][:keys].concat(k)
  end

  ###

  opts.separator ""
  opts.separator "Common options:"

  # Print an options summary.
  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end

  # Safeguard evaluated before parsing: without any argument the script would
  # target every site, so ask for confirmation first.
  if ARGV.empty?
    printf "No option is specified. Run g5k-checks on the entire platform ? (y/N) "
    # gets returns nil on end-of-file; to_s turns that into "" (i.e. "no").
    prompt = STDIN.gets.to_s.chomp
    puts opts
    exit if prompt != 'y'
  end

end.parse!
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
122

123 124 125 126 127 128
# Defaults for everything that was not set on the command line.
options[:ssh] ||= {}
options[:ssh][:user] ||= 'g5kadmin'
options[:ssh][:host] ||= '%s.g5kadmin' # "%s" is replaced by the site uid when connecting
# --ssh-keys may already have created the params hash, so the timeout default
# is set on its own key instead of only when the whole hash is missing.
options[:ssh][:params] ||= {}
options[:ssh][:params][:timeout] ||= 10

options[:queue] ||= 'admin'
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
129

130
puts "Options: #{options}" if options[:verbose]
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
131

132 133 134
#
#
#
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
135

136 137
FileUtils::mkdir_p("output/")

Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
138
# fnode_uid = fully qualified name (node.site.nancy.grid5000.fr)
139 140
# Runs g5k-checks on one node and stores the YAML file it produces in
# output/<fnode_uid>.yaml.
#
# An SSH session is opened to options[:ssh][:host] (with "%s" replaced by
# site_uid) as options[:ssh][:user]; from there g5k-checks is started on the
# node and /tmp/<fnode_uid>.yaml is read back.
#
# site_uid  - site name (e.g. "nancy").
# fnode_uid - fully qualified node name (node.site.grid5000.fr).
# options   - Hash; options[:ssh][:host], [:user] and [:params] are read.
#
# Errors are printed and swallowed so that one failing node does not stop the
# processing of the others. The return value is not meaningful.
def run_g5kcheck(site_uid, fnode_uid, options)
  puts "#{site_uid}: Processing #{fnode_uid}"

  begin
    gateway = options[:ssh][:host].gsub("%s", site_uid)

    Net::SSH.start(gateway, options[:ssh][:user], options[:ssh][:params]) { |ssh|
      check_log = ssh.exec!("sudo ssh -o StrictHostKeychecking=no root@#{fnode_uid} 'sudo /usr/bin/g5k-checks -m api'")
      yaml_dump = ssh.exec!("sudo ssh -q -o StrictHostKeychecking=no root@#{fnode_uid} 'cat /tmp/#{fnode_uid}.yaml'")

      # Depending on the net-ssh version, exec! may return nil rather than ""
      # when the command printed nothing. Nothing (or only whitespace) must not
      # be written: an empty output file would make later runs skip this node.
      if yaml_dump.nil? || yaml_dump.strip.empty?
        puts check_log # ex: "ssh: connect to host graphite-1.nancy.grid5000.fr port 22: No route to host\r\n"
      else
        File.open("output/#{fnode_uid}.yaml", 'w') do |f|
          f.write(yaml_dump)
        end
      end
    }
  rescue Exception => e # Exception (not only StandardError) is rescued, as everywhere else in this script
    puts "#{site_uid}: Error while processing #{fnode_uid} - #{e.class}: #{e.message}"
  end
end
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
159

160
# Submits a 30 minute, non-blocking reservation through the global $g5k client.
#
# site_uid  - site on which the reservation is made.
# resources - resource request string passed as :resources.
# queue     - queue name passed as :queue.
#
# Returns the job object returned by $g5k.reserve, or nil when the call raised
# (the error is printed, not propagated).
def oarsub(site_uid, resources, queue)
  $g5k.reserve(
    :site      => site_uid,
    :resources => resources,
    :walltime  => '00:30:00',
    :wait      => false,
    :queue     => queue
  )
rescue Exception => e
  # NOTE(review): Exception is rescued on purpose here, presumably because the
  # API client can raise errors outside StandardError - confirm before narrowing.
  puts "#{site_uid}: Error during the reservation '#{resources}' at #{site_uid} - #{e.class}: #{e.message}"
  nil
end
172

173 174 175
if options[:force]
  
  # Force mode: no OAR reservation is made. The node list is read from input/
  # because nodes might not be registered in the OAR db yet (new cluster
  # installation).
  refapi_hash = load_yaml_file_hierarchy("../../input/grid5000/")

  # -n entries might be either 'node' or 'node.site.grid5000.fr' (as in the
  # reservation mode below); only the short node name is compared here.
  wanted_nodes = options[:nodes] && options[:nodes].map { |uid| uid.split('.')[0] }

  run_queue = {} # site_uid => list of fully qualified node names to process

  # Safeguard. Ask before running g5k-checks on reserved nodes (we should not
  # interfere with user experiments).
  prompt = ''
  options[:sites].each { |site_uid|
    run_queue[site_uid] ||= []

    nodes_status = nil # queried lazily, at most once per site

    # Clusters are walked sequentially on purpose: this loop may ask questions
    # on STDIN and updates `prompt` and `run_queue`, so running it in parallel
    # threads would interleave the prompts and the answers.
    refapi_hash['sites'][site_uid]["clusters"].each { |cluster_uid, cluster|
      next if options[:clusters] && ! options[:clusters].include?(cluster_uid)

      cluster["nodes"].each_sort_by_node_uid { |node_uid, _node|
        next if wanted_nodes && ! wanted_nodes.include?(node_uid)

        fnode_uid = "#{node_uid}.#{site_uid}.grid5000.fr"

        if File.exist?("output/#{fnode_uid}.yaml")
          puts "output/#{fnode_uid}.yaml exist. Remove this file if you want to run g5k-checks again on this node."
          next
        end

        if nodes_status.nil?
          begin
            nodes_status = $g5k.nodes_status(site_uid)
          rescue Exception => e
            # Do not retry: with an empty status no other node of this site is
            # seen as busy, and the current node is skipped.
            nodes_status = {}
            puts "Error while getting nodes status at #{site_uid} - #{e.class}: #{e.message}"
            next
          end
        end

        busy = nodes_status[fnode_uid] == "busy"

        if busy && prompt != 'yes-all'
          # 'no-all' silently skips every remaining busy node.
          if prompt != 'no-all'
            printf "#{site_uid} - #{node_uid} is busy (ie. there is currently an OAR reservation. Run g5k-checks on reserved nodes ? (y/yes-all/no-all/N) "
            # gets returns nil on end-of-file; to_s turns that into "" (i.e. "no").
            prompt = STDIN.gets.to_s.chomp
            run_queue[site_uid] << fnode_uid if prompt == 'y' || prompt == 'yes-all'
          end
        else
          run_queue[site_uid] << fnode_uid
        end
      }
    }
  }

  # Actual run: sites in parallel, and the nodes of each site in parallel.
  run_queue.peach { |site_uid, q|
    q.peach { |fnode_uid|
      run_g5kcheck(site_uid, fnode_uid, options)
    }
  }
233

234 235
else # ! options[:force]

Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
236
  begin
237 238 239
    jobs = {} # list of OAR reservation
    released_jobs = {};

240
    options[:sites].peach  { |site_uid|
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
241

242 243 244
      jobs[site_uid] = [] # list of OAR reservation
      released_jobs[site_uid] = {};

Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
245 246
      begin

247 248 249 250 251 252 253 254 255 256
        #
        # Node reservation
        #
        
        begin
          nodes_status = $g5k.nodes_status(site_uid)
        rescue Exception => e
          puts "Error while getting nodes status at #{site_uid}" #{e}
          next
        end
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
257

258
        # Submits one reservation and records the job. oarsub returns nil when
        # the reservation failed; nil is not stored, so every entry of
        # jobs[site_uid] is a job whose 'uid' can be read later on.
        reserve = lambda { |resources|
          job = oarsub(site_uid, resources, options[:queue])
          jobs[site_uid] << job if job
        }

        if options[:nodes]

          # Reserve nodes one by one
          options[:nodes].each { |uid|
            node_uid = uid.split('.')[0] # entries might be either 'node' or 'node.site.grid5000.fr'
            fnode_uid = "#{node_uid}.#{site_uid}.grid5000.fr"

            cluster_uid = node_uid.split(/-/).first
            next if options[:clusters] && ! options[:clusters].include?(cluster_uid) # -c and -n info should be consistent
            next unless nodes_status.key?(fnode_uid)                                 # the node does not belong to this site

            if File.exist?("output/#{fnode_uid}.yaml")
              puts "output/#{fnode_uid}.yaml exist. Remove this file if you want to run g5k-checks again on this node."
              next
            end

            reserve.call("{host='#{fnode_uid}'}")
          }

        else

          clusters = $g5k.cluster_uids(site_uid)

          # Reserve as many free nodes as possible in one reservation
          if options[:clusters]
            options[:clusters].each { |cluster_uid|
              reserve.call("{cluster='#{cluster_uid}'}/nodes=BEST") if clusters.include?(cluster_uid)
            }
          else
            reserve.call("nodes=BEST")
          end

          # Reserve busy nodes one by one
          nodes_status.each { |fnode_uid, status|
            cluster_uid = fnode_uid.split(/-/).first
            next if options[:clusters] && ! options[:clusters].include?(cluster_uid)
            next if File.exist?("output/#{fnode_uid}.yaml") # skip the reservation if we already have the node info
            next if status != "busy"                        # only busy nodes

            reserve.call("{host='#{fnode_uid}'}")
          }

        end
306
        
307 308 309
        #
        # Process running jobs
        #
310
        
311 312 313 314
        # Poll the jobs of this site until none is waiting, launching or
        # running. Each running job submitted by this script is processed once:
        # g5k-checks is run on its nodes, then the job is released.
        loop do
          waiting_jobs   = $g5k.get_my_jobs(site_uid, 'waiting')
          running_jobs   = $g5k.get_my_jobs(site_uid, 'running')
          launching_jobs = $g5k.get_my_jobs(site_uid, 'launching')

          puts "#{site_uid}: Running: #{running_jobs.size} - Waiting: #{waiting_jobs.size} - Launching: #{launching_jobs.size}"

          # Failed reservations are stored as nil (oarsub returns nil on
          # error); drop them before reading the uids.
          own_job_uids = jobs[site_uid].compact.map { |j| j['uid'] }
          processed_a_job = false

          running_jobs.each { |job|
            job_uid = job['uid']

            next unless own_job_uids.include?(job_uid) # skip reservations that are not related to this script

            if released_jobs[site_uid][job_uid]
              puts "#{site_uid}: #{job_uid} already processed (waiting for job termination)" # OAR job deletions can take some time
              next
            end

            processed_a_job = true
            puts "#{site_uid}: Processing #{job_uid}"

            job['assigned_nodes'].peach(10) { |fnode_uid|
              next if File.exist?("output/#{fnode_uid}.yaml")

              run_g5kcheck(site_uid, fnode_uid, options)
            }

            puts "#{site_uid}: Release #{job_uid}"
            begin
              $g5k.release(job)
              released_jobs[site_uid][job_uid] = true
            rescue Exception => e
              puts "#{site_uid}: Error while releasing job #{job_uid} - #{e.class}: #{e.message}"
            end
          }

          # Stop when there isn't any job left
          break if running_jobs.empty? && waiting_jobs.empty? && launching_jobs.empty?

          # Wait a little bit when this iteration did not find any job to
          # process. This includes the case where the only running jobs are
          # already released or unrelated ones, which would otherwise make the
          # loop query the API without any pause.
          sleep(5) unless processed_a_job
        end
354 355
        
      end # begin
356 357
      
    } # options[:sites].peach
358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
    
  rescue Exception => e
    puts "#{e.class}: #{e.message}"
  ensure
    jobs.each{|site_uid, jobs| jobs.compact.each { |job|
        begin
          job_uid = job['uid']
          if released_jobs[site_uid][job_uid] != true
            puts "Release job #{job['links'][0]['href']}"
            $g5k.release(job)
          end
        rescue Exception => e
          puts "Failed releasing job #{job['links'][0]['href']} - #{e.class}: #{e.message}"
        end
      }
    }
    exit
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
375
  end
376
  
377
end # options[:force]
378 379

`ruby postprocessing.rb`