run-g5kchecks.rb 9.47 KB
Newer Older
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
1
2
# This script reserves nodes and then runs g5k-checks as root using the g5kadmin credentials.
#
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
3
# Usage: cd run-g5kcheck; ruby run-g5kchecks.rb
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
4
5
6
7
8
9
10
#
# - You can edit the node reservation at the beginning of the script (or create reservation manually).
# - The script will run g5k-checks on every node that has been reserved.
# - Output YAML files of g5k-checks are stored in output/
# - If an output YAML file already exists in output/, the execution of g5k-checks on the corresponding node is skipped.
# - Use postprocessing.rb to move the files to the input/ directory. This script also edits some keys of the YAML files.

11
12
require 'optparse'
require 'fileutils'
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
13
14
require 'cute'
require 'peach'
15
require '../lib/input_loader'
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
16
17
require 'pp'

18
19
20
# API client shared by the whole script (kept global: the helper methods
# defined further down reach it through $g5k).
$g5k = Cute::G5K::API.new

# Site names reported by the API; they are the default value of the -s filter
# and the set against which -s arguments are validated.
sites = $g5k.site_uids

#
# Parse command line parameters
#

options = { :sites => sites } # e.g. %w{grenoble lille luxembourg lyon nancy nantes reims rennes sophia}
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
32

33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Command line definition. Every handler stores its value in the `options`
# hash; defaults for values that were not given are filled in after parsing.
OptionParser.new do |opts|
  opts.banner = "Usage: run-g5kchecks.rb [options]"

  opts.separator ""
  opts.separator "Example: ruby run-g5kchecks.rb -s nancy -n graoully-1         # make a reservation on graoully-1 and run g5k-checks"
  opts.separator "         ruby run-g5kchecks.rb -s nancy -n graoully-1 --force # run g5k-checks without making a reservation (for dead node)"
  # opts.separator "         ruby run-g5kchecks.rb                                # make a reservation on every nodes"

  # --- Filters ---------------------------------------------------------------

  opts.separator ""
  opts.separator "Filters:"

  opts.on('-s', '--sites a,b,c', Array, 'Select site(s)',
          "Default: " + options[:sites].join(", ")) do |s|
    # Only sites known to the API (the default list) are accepted.
    raise "Wrong argument for -s option." unless (s - options[:sites]).empty?
    options[:sites] = s
  end

  opts.on('-c', '--clusters a,b,c', Array, 'Select clusters(s). Default: all') do |s|
    options[:clusters] = s
  end

  opts.on('-n', '--nodes a,b,c', Array, 'Select nodes(s). Default: all') do |n|
    options[:nodes] = n
  end

  # --- Node reservation ------------------------------------------------------

  opts.separator ""
  opts.separator "Node reservation options:"

  opts.on('-qQUEUE', '--queue=queue', String, 'Specify an OAR reservation queue') do |q|
    options[:queue] = q
  end

  opts.on('-f', '--force',
          'Run g5k-checks on the nodes without any OAR reservation',
          'This option is meant to be used for dead nodes',
          'or if you already reserved the ressources') do
    options[:force] = true
  end

  # --- SSH -------------------------------------------------------------------

  opts.separator ""
  opts.separator "SSH options:"

  opts.on('--ssh-keys k1,k2,k3', Array, 'SSH keys') do |k|
    options[:ssh] ||= {}
    options[:ssh][:params] ||= {}
    options[:ssh][:params][:keys] ||= []
    # `k` is already an Array: concatenate it so :keys stays a flat list of
    # paths (appending with << would nest it as [[k1, k2, k3]]).
    options[:ssh][:params][:keys].concat(k)
  end

  # --- Common ----------------------------------------------------------------

  opts.separator ""
  opts.separator "Common options:"

  # The script prints the parsed options when options[:verbose] is set; this
  # flag is what sets it.
  opts.on('-v', '--verbose', 'Print the parsed options before running') do
    options[:verbose] = true
  end

  # Print an options summary.
  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
102

103
104
105
106
107
108
# Fill in every option that was not given on the command line.
options[:ssh] ||= {}
options[:ssh][:user] ||= 'g5kadmin'
options[:ssh][:host] ||= '%s.g5kadmin' # "%s" is replaced by the site name when connecting

# --ssh-keys creates options[:ssh][:params] during parsing, so the timeout is
# defaulted key by key: assigning a whole default hash with ||= would drop the
# timeout as soon as keys were given.
options[:ssh][:params] ||= {}
options[:ssh][:params][:timeout] ||= 10

options[:queue] ||= 'admin'

puts "Options: #{options}" if options[:verbose]

# g5k-checks results are written to output/<node>.yaml; make sure the
# directory exists before any node is processed.
FileUtils.mkdir_p("output/")

# Runs g5k-checks as root on one node and stores the YAML file it produces in
# output/<fnode_uid>.yaml.
#
# The node is reached through an intermediate SSH host: the script connects to
# options[:ssh][:host] (with "%s" replaced by +site_uid+) and, from there,
# runs `sudo ssh root@<fnode_uid>`.
#
# site_uid  - site name substituted into the SSH host pattern.
# fnode_uid - node name, used as SSH target and as YAML file name.
# options   - Hash; only options[:ssh] (:host, :user, :params) is read.
#
# Nothing is written when the YAML could not be read back (nil or empty
# output); the output of the g5k-checks command is printed instead, so the
# node is not mistaken for an already-processed one on the next run.
# Standard errors are reported on stdout and not propagated; Interrupt and
# SystemExit are deliberately left alone.
def run_g5kcheck(site_uid, fnode_uid, options)
  puts "#{site_uid}: Processing #{fnode_uid}"

  ssh_options = options[:ssh]
  gateway = ssh_options[:host].gsub("%s", site_uid)

  Net::SSH.start(gateway, ssh_options[:user], ssh_options[:params]) do |ssh|
    check_log = ssh.exec!("sudo ssh -o StrictHostKeychecking=no root@#{fnode_uid} '/usr/bin/g5k-checks -m api'")
    yaml = ssh.exec!("sudo ssh -q -o StrictHostKeychecking=no root@#{fnode_uid} 'cat /tmp/#{fnode_uid}.yaml'")

    if yaml.nil? || yaml.empty?
      puts check_log # ex: "ssh: connect to host graphite-1.nancy.grid5000.fr port 22: No route to host\r\n"
    else
      File.write("output/#{fnode_uid}.yaml", yaml)
    end
  end
rescue StandardError => e
  puts "#{site_uid}: Error while processing #{fnode_uid} - #{e.class}: #{e.message}"
end
Jérémie Gaidamour's avatar
Jérémie Gaidamour committed
138

139
140
# Submits a 30-minute, non-blocking reservation through the shared $g5k client.
#
# site_uid  - site on which the reservation is made.
# resources - resource request string passed as :resources.
# queue     - queue name passed as :queue.
#
# Returns whatever $g5k.reserve returns, or nil when the reservation failed
# (the error is printed, not raised). Only StandardError is handled, so an
# Interrupt still stops the script.
def oarsub(site_uid, resources, queue)
  $g5k.reserve(:site => site_uid, :resources => resources, :walltime => '00:30:00', :wait => false, :queue => queue)
rescue StandardError => e
  puts "#{site_uid}: Error during the reservation '#{resources}' at #{site_uid} - #{e.class}: #{e.message}"
  nil
end
  
# Builds the list of nodes to check in --force mode (no reservation).
#
# Nodes are read from ../input/grid5000/ rather than from the API, filtered by
# the -s/-c/-n options, and skipped when output/<node>.yaml already exists.
# For nodes reported as "busy" the user is asked for confirmation. This runs
# sequentially on purpose: the prompt reads STDIN and its answer
# ("yes-all"/"no-all") is shared by all following nodes.
#
# Returns a Hash mapping each site name to an Array of full node names.
def build_forced_run_queue(options)
  # The input/ hierarchy is used because nodes might not be registered in the
  # OAR database yet (new cluster installation).
  refapi_hash = load_yaml_file_hierarchy("../input/grid5000/")

  # -n accepts short or fully qualified names; compare on the short form.
  wanted_nodes = options[:nodes] && options[:nodes].map { |name| name.split('.').first }

  run_queue = {}

  # Safeguard: ask before running g5k-checks on reserved nodes (we should not
  # interfere with user experiments).
  prompt = ''

  options[:sites].each do |site_uid|
    run_queue[site_uid] ||= []
    nodes_status = nil # queried lazily, at most once per site

    refapi_hash['sites'][site_uid]["clusters"].each do |cluster_uid, cluster|
      next if options[:clusters] && !options[:clusters].include?(cluster_uid)

      cluster["nodes"].each_sort_by_node_uid do |node_uid, _node|
        next if wanted_nodes && !wanted_nodes.include?(node_uid)

        fnode_uid = "#{node_uid}.#{site_uid}.grid5000.fr"

        if File.exist?("output/#{fnode_uid}.yaml")
          puts "output/#{fnode_uid}.yaml exist. Remove this file if you want to run g5k-checks again on this node."
          next
        end

        nodes_status ||= $g5k.nodes_status(site_uid)
        busy = nodes_status[fnode_uid] == "busy"

        if busy && prompt != 'yes-all'
          if prompt != 'no-all'
            print "#{site_uid} - #{node_uid} is busy (ie. there is currently an OAR reservation. Run g5k-checks on reserved nodes ? (y/yes-all/no-all/N) "
            prompt = STDIN.gets.to_s.chomp # to_s: end of input counts as "N"
            run_queue[site_uid] << fnode_uid if prompt == 'y' || prompt == 'yes-all'
          end
        else
          run_queue[site_uid] << fnode_uid
        end
      end
    end
  end

  run_queue
end

# Submits the reservations for one site and appends every successful one to
# +jobs+ (failed submissions, for which oarsub returns nil, are left out).
#
# With -n, one reservation is made per requested node. Otherwise one
# "nodes=BEST" reservation is made (per requested cluster when -c is given),
# followed by one reservation per node reported as "busy" that has no output
# file yet.
def reserve_site_nodes(site_uid, options, jobs)
  submit = lambda do |resources|
    job = oarsub(site_uid, resources, options[:queue])
    jobs << job if job
  end

  if options[:nodes]
    # Reserve nodes one by one
    options[:nodes].each { |node_uid| submit.call("{host='#{node_uid.split('.')[0]}'}") }
    return
  end

  # Reserve as many free nodes as possible in one reservation
  if options[:clusters]
    site_clusters = $g5k.cluster_uids(site_uid)
    options[:clusters].each do |cluster_uid|
      submit.call("{cluster='#{cluster_uid}'}/nodes=BEST") if site_clusters.include?(cluster_uid)
    end
  else
    submit.call("nodes=BEST")
  end

  # Reserve busy nodes one by one
  $g5k.nodes_status(site_uid).each do |fnode_uid, status|
    cluster_uid = fnode_uid.split(/-/).first
    next if options[:clusters] && !options[:clusters].include?(cluster_uid)
    next if File.exist?("output/#{fnode_uid}.yaml") # skip reservation if we already have the node info
    next if status != "busy"                        # only busy nodes

    submit.call("{host='#{fnode_uid}'}")
  end
end

# Polls the reservations in +jobs+ until none of them is waiting, launching or
# running any more. Each running job is processed once: g5k-checks is run on
# its assigned nodes (those without an output file) and the job is released.
#
# Only jobs submitted by this script are considered, so other jobs of the same
# user neither get processed nor keep the loop alive. Successfully released
# jobs are recorded in +released_jobs+ (uid => true).
def process_site_jobs(site_uid, jobs, released_jobs, options)
  own_uids = jobs.map { |job| job['uid'] }
  own_jobs = lambda do |state|
    $g5k.get_my_jobs(site_uid, state).select { |job| own_uids.include?(job['uid']) }
  end

  loop do
    waiting_jobs   = own_jobs.call('waiting')
    running_jobs   = own_jobs.call('running')
    launching_jobs = own_jobs.call('launching')

    puts "#{site_uid}: Running: #{running_jobs.size} - Waiting: #{waiting_jobs.size} - Launching: #{launching_jobs.size}"

    processed_any = false

    running_jobs.each do |job|
      job_uid = job['uid']

      if released_jobs[job_uid]
        puts "#{site_uid}: #{job_uid} already processed" # OAR job deletions can take some time
        next
      end

      puts "#{site_uid}: Processing #{job_uid}"
      processed_any = true

      job['assigned_nodes'].peach(10) do |fnode_uid|
        next if File.exist?("output/#{fnode_uid}.yaml")

        run_g5kcheck(site_uid, fnode_uid, options)
      end

      puts "#{site_uid}: Release #{job_uid}"
      begin
        $g5k.release(job)
        released_jobs[job_uid] = true
      rescue StandardError => e
        puts "#{site_uid}: Error while releasing job #{job_uid} - #{e.class}: #{e.message}"
      end
    end

    # Stop when there isn't any job left
    break if running_jobs.empty? && waiting_jobs.empty? && launching_jobs.empty?

    # Wait a little bit when this iteration did not find any job to process
    # (nothing running yet, or only jobs whose deletion is still pending).
    sleep(5) unless processed_any
  end
end

# Releases every job of +jobs+ that is not marked in +released_jobs+.
# A failing release is reported and does not prevent the following ones.
def release_remaining_jobs(jobs, released_jobs)
  jobs.each do |job|
    next if released_jobs[job['uid']]

    puts "Release job #{job['links'][0]['href']}"
    begin
      $g5k.release(job)
    rescue StandardError => e
      puts "Error while releasing job #{job['uid']} - #{e.class}: #{e.message}"
    end
  end
end

if options[:force]

  run_queue = build_forced_run_queue(options)

  # Actual run
  run_queue.peach { |site_uid, q|
    q.peach { |fnode_uid|
      run_g5kcheck(site_uid, fnode_uid, options)
    }
  }

else # ! options[:force]

  options[:sites].peach { |site_uid|
    jobs = []          # reservations made by this script on this site
    released_jobs = {} # uid => true for jobs already released

    begin
      reserve_site_nodes(site_uid, options, jobs)
      process_site_jobs(site_uid, jobs, released_jobs, options)
    rescue StandardError => e
      puts "#{e.class}: #{e.message}"
    ensure
      # Never leave a reservation behind, even after an error.
      release_remaining_jobs(jobs, released_jobs)
    end
  } # options[:sites].peach

end # options[:force]
307
308

# Run the post-processing script once every node has been handled. Its
# standard output is discarded; unlike with backticks, a failure is reported
# instead of being silently ignored.
unless system('ruby', 'postprocessing.rb', :out => File::NULL)
  warn "postprocessing.rb failed (#{$?})"
end