#!/usr/bin/ruby

# This script checks the homogeneity of each cluster: it compares the
# descriptions of successive nodes of a cluster and counts the differences
# that are not covered by an ignore list.

# Refuse to run on interpreters older than Ruby 2.1.
# Versions are compared component-wise with Gem::Version, because a plain
# string comparison misorders multi-digit components ("10.0" < "2.1").
# The script exits with a non-zero status so callers can detect the failure.
if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.1")
  puts "This script requires ruby >= 2.1"
  exit 1
end

require 'erb'
require 'fileutils'
require 'json'
require 'net/https'
require 'pathname'
require 'pp'
require 'uri'
require 'yaml'

require 'hashdiff'

require_relative "../lib/input_loader"

# Builds the list of diff entries that are ignored for every cluster.
#
# Each entry is a change marker ('~' modified value, '+' new key, '-' missing
# key) immediately followed by the dotted path of the key. This is the form
# the entries are compared against in cluster_homogeneity (marker + path).
#
# Returns an Array of Strings.
def global_ignore_keys()
  # Keys listed verbatim.
  ignore_keys = %w(
    ~chassis.serial

    ~network_adapters.bmc.ip
    ~network_adapters.bmc.mac
    ~network_adapters.bmc.network_address
    ~network_adapters.bmc.switch
    ~network_adapters.bmc.switch_port

    ~network_adapters.myri0.ip
    ~network_adapters.myri0.ip6
    ~network_adapters.myri0.mac
    ~network_adapters.myri0.network_address

    ~pdu
    ~pdu.port
    ~pdu.uid
    ~pdu[0]
    ~pdu[1]

    ~supported_job_types.max_walltime

    ~mic.ip
    ~mic.mac

    +status
    -status
  )

  # Template for Ethernet interfaces; '.eth.' is replaced by '.eth0.' .. '.eth5.'.
  ignore_netkeys = <<-eos
    ~network_adapters.eth.rate
    ~network_adapters.eth.name
    ~network_adapters.eth.ip
    -network_adapters.eth.ip
    ~network_adapters.eth.ip6
    -network_adapters.eth.ip6
    ~network_adapters.eth.mac
    ~network_adapters.eth.network_address
    ~network_adapters.eth.switch
    ~network_adapters.eth.switch_port
  eos

  # Template for storage devices; '.sd.' is replaced by '.sda.' .. '.sdf.'.
  ignore_stokeys = <<-eos
    ~storage_devices.sd.model
    ~storage_devices.sd.firmware_version
    ~storage_devices.sd.rev
    -storage_devices.sd.rev
    ~storage_devices.sd.size
    ~storage_devices.sd.timeread
    ~storage_devices.sd.timewrite
    ~storage_devices.sd.vendor
    ~storage_devices.sd.by_id
    ~storage_devices.sd.by_path
  eos

  # Template for InfiniBand interfaces; 'IB_IF' is replaced by the interface name.
  ignore_ibkeys = <<-eos
    ~network_adapters.IB_IF.guid
    ~network_adapters.IB_IF.hwid
    ~network_adapters.IB_IF.ip
    ~network_adapters.IB_IF.ip6
    ~network_adapters.IB_IF.line_card
    ~network_adapters.IB_IF.position
    +network_adapters.IB_IF.version
  eos

  ib_interfaces = [
    'ib0',
    'ib1',
    'ib0.8100'
  ]

  # String#split without argument splits on any whitespace run, newlines
  # included, so each template line becomes one key.
  (0..5).each do |eth|
    ignore_keys.concat(ignore_netkeys.gsub('.eth.', ".eth#{eth}.").split)

    (1..21).each do |kavlan|
      ignore_keys << "~kavlan.eth#{eth}.kavlan-#{kavlan}"
    end
  end

  ('a'..'f').each do |sd|
    ignore_keys.concat(ignore_stokeys.gsub('.sd.', ".sd#{sd}.").split)
  end

  ib_interfaces.each do |ib_if|
    ignore_keys.concat(ignore_ibkeys.gsub('IB_IF', ib_if).split)
  end

  return ignore_keys
end

# Loads the per-cluster ignore list from an ERB-templated YAML file.
#
# filename - path of the file; it is rendered with ERB using this method's
#            binding, then parsed as YAML.
#
# Returns the parsed document. When the document is truthy,
# expand_square_brackets is called on it first (helper defined outside this
# file; presumably it expands "[a-b]" ranges in keys, to be confirmed).
# A falsy document (e.g. an empty file) is returned unchanged.
def cluster_ignore_keys(filename)
  # Local names are kept minimal on purpose: they are visible to the template
  # through `binding`.
  file_hash = YAML::load(ERB.new(File.read(filename)).result(binding))
  return file_hash unless file_hash

  file_hash.expand_square_brackets()
  file_hash
end

# Downloads the OAR resources of a site and returns the nodes that are "Dead".
#
# site_uid - String inserted into the API URL as the site name.
# options  - Hash. :verbose enables progress messages on stdout. The optional
#            options[:api] Hash may carry :user and :pwd, in which case HTTP
#            basic authentication is used.
#
# Returns an Array of short node names (the part of "network_address" before
# the first dot), each listed once.
# Raises RuntimeError when the API does not answer with HTTP status 200.
def get_site_dead_nodes(site_uid, options)
  api_uri = URI.parse('https://api.grid5000.fr/stable/sites/' + site_uid  + '/internal/oarapi/resources/details.json?limit=999999')

  # Download the OAR properties from the OAR API (through G5K API)
  puts "Downloading OAR resources properties from #{api_uri} ..." if options[:verbose]
  http = Net::HTTP.new(api_uri.host, Net::HTTP.https_default_port)
  http.use_ssl = true
  request = Net::HTTP::Get.new(api_uri.request_uri)

  # For outside g5k network access. options[:api] may be missing (e.g. when
  # the caller relies on the default options of cluster_homogeneity).
  api = options[:api] || {}
  request.basic_auth(api[:user], api[:pwd]) if api[:user] && api[:pwd]

  response = http.request(request)
  raise "Failed to fetch resources properties from API: \n#{response.body}\n" unless response.code.to_i == 200
  puts '... done' if options[:verbose]
  oarnodes = JSON.parse(response.body)

  # Adapt from the format of the OAR API: the resources may be wrapped in an
  # 'items' member or be the top-level array itself.
  oarnodes = oarnodes['items'] if oarnodes.is_a?(Hash) && oarnodes.key?('items')

  # Several resources can share a network address, so the short name is
  # computed first and duplicates are dropped on that short name.
  dead_resources = oarnodes.select { |node| node["state"] == "Dead" }
  dead_resources.map { |node| node["network_address"].to_s.split(".")[0] }.compact.uniq
end

# Rewrites paired "-"/"+" entries of a HashDiff change set into "~" entries.
#
# HashDiff reports a changed array element as a removal plus an addition:
#   [["-", "pdu[0]", {"uid"=>"graphene-pdu9", "port"=>23}],
#    ["+", "pdu[0]", {"uid"=>"graphene-pdu9", "port"=>21}]]
# For every path that has both markers, the entries equal to the first "-"
# and the first "+" are removed and ["~", path, removed, added] is appended.
#
# diffs - Array of change entries; modified in place.
#
# Returns diffs.
def merge_removed_added_diffs(diffs)
  by_path = diffs.reject { |diff| diff[0] == '~' }.group_by { |diff| diff[1] }

  by_path.each do |path, entries|
    by_marker = entries.group_by { |diff| diff[0] }
    next unless by_marker.key?('-') && by_marker.key?('+')

    removed = by_marker['-'][0][2]
    added = by_marker['+'][0][2]
    diffs.delete(["-", path, removed])
    diffs.delete(["+", path, added])
    diffs << ["~", path, removed, added]
  end

  diffs
end

# Counts, per cluster, the differences between successive nodes.
#
# Within a cluster each node is compared with the previous node that was
# neither retired nor reported dead by get_site_dead_nodes. Differences listed
# by global_ignore_keys, or in the per-node list loaded by
# cluster_ignore_keys, are not counted.
#
# refapi_hash - currently unused, see the NOTE(review) below.
# options     - Hash. :verbose prints every remaining difference; :sites and
#               :clusters, when present, restrict the check to the listed
#               uids. The Hash is also passed to get_site_dead_nodes.
#
# Returns [total_count, count] where count is {site_uid => {cluster_uid => n}}.
def cluster_homogeneity(refapi_hash, options = {:verbose => false})
  verbose = options[:verbose]

  if verbose
    puts "The change set is represented using the following syntax:"
    puts '  [["+", "path.to.key1", value],          # new key'
    puts '   ["-", "path.to.key2", value],          # missing key'
    puts '   ["~", "path.to.key3", value1, value2]] # modified value'
    puts ''
  end

  ignore_keys  = global_ignore_keys()
  cignore_keys = cluster_ignore_keys(File.expand_path("../input-validators/check-cluster-homogeneity.yaml.erb", File.dirname(__FILE__)))

  # NOTE(review): the refapi_hash argument is discarded and the hierarchy is
  # reloaded from disk. Kept as is to preserve the current results; consider
  # using the argument instead once all callers are known to pass the same data.
  input_data_dir = "../../input/grid5000/"
  refapi_hash = load_yaml_file_hierarchy(File.expand_path(input_data_dir, File.dirname(__FILE__)))
  count = {}
  total_count = 0

  refapi_hash["sites"].sort.each do |site_uid, site|
    next if options.key?(:sites) && !options[:sites].include?(site_uid)

    site_dead_nodes = get_site_dead_nodes(site_uid, options)

    # The ignore file may be empty or may not mention this site.
    site_ignore_keys = cignore_keys.is_a?(Hash) ? cignore_keys[site_uid] : nil

    count[site_uid] = {}

    site["clusters"].sort.each do |cluster_uid, cluster|
      next if options.key?(:clusters) && !options[:clusters].include?(cluster_uid)

      count[site_uid][cluster_uid] = 0

      refnode_uid = nil
      refnode = nil

      cluster["nodes"].each_sort_by_node_uid do |node_uid, node|
        next if node['status'] == 'retired' || site_dead_nodes.include?(node_uid)

        # The first usable node only serves as the initial reference.
        unless refnode
          refnode = node
          refnode_uid = node_uid
          next
        end

        diffs = merge_removed_added_diffs(HashDiff.diff(refnode, node))

        # Remove keys that are specific to each nodes (ip, mac etc.)
        ikeys = site_ignore_keys.is_a?(Hash) ? site_ignore_keys[node_uid] : nil
        diffs.reject! do |diff|
          key = diff[0] + diff[1]
          ignore_keys.include?(key) || (ikeys && ikeys.include?(key))
        end

        if verbose && !diffs.empty?
          puts "Differences between #{refnode_uid} and #{node_uid}:"
          pp diffs
        end

        total_count += diffs.size
        count[site_uid][cluster_uid] += diffs.size

        # Remove the following two lines if you want to compare each node to the first cluster node
        refnode_uid = node_uid
        refnode = node
      end
    end
  end

  return [total_count, count]
end

# Runs cluster_homogeneity and prints a report on stdout.
#
# refapi_hash - passed through to cluster_homogeneity.
# options     - Hash passed through to cluster_homogeneity. When :verbose is
#               falsy, the per-cluster counts are printed as YAML followed by
#               a hint about the '-v' option; otherwise only a blank line is
#               printed after the details emitted by cluster_homogeneity.
#
# Returns the total number of differences found.
def check_cluster_homogeneity(refapi_hash, options = {:verbose => false})
  verbose = options[:verbose]
  puts "Differences found between successive nodes, per cluster:\n\n"

  total_count, count = cluster_homogeneity(refapi_hash, options)

  if verbose
    puts "\n"
  else
    puts count.to_yaml
    puts "\nUse '-v' option for details."
  end

  return total_count
end

# Command-line entry point: parses the options, runs the check on the
# hierarchy loaded from ../../input/grid5000/ and sets the exit status.
if __FILE__ == $0
  require 'optparse'

  options = {}
  options[:sites] = %w{grenoble lille luxembourg lyon nancy nantes rennes sophia}
  options[:api] = {}

  OptionParser.new do |opts|
    opts.banner = "Usage: check-cluster-homogeneity.rb [options]"

    opts.separator ""
    opts.separator "Example: ruby check-cluster-homogeneity.rb -v"

    ###

    opts.separator ""
    opts.separator "Filters:"

    opts.on('-s', '--sites a,b,c', Array, 'Select site(s)',
            "Default: "+options[:sites].join(", ")) do |s|
      # Only sites from the default list are accepted.
      raise "Wrong argument for -s option." unless (s - options[:sites]).empty?
      options[:sites] = s
    end

    opts.on('-c', '--clusters a,b,c', Array, 'Select clusters(s). Default: all') do |s|
      options[:clusters] = s
    end

    opts.separator ""
    opts.separator "Common options:"

    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      # Each -v raises the verbosity level; --no-verbose (v == false)
      # switches verbosity off instead of raising it.
      options[:verbose] = v ? (options[:verbose] || 0) + 1 : false
    end

    opts.on('--api-user user', String, 'HTTP authentication user when outside G5K') do |user|
      options[:api][:user] = user
    end

    opts.on('--api-password pwd', String, 'HTTP authentication password when outside G5K') do |pwd|
      options[:api][:pwd] = pwd
    end

    # Print an options summary.
    opts.on_tail("-h", "--help", "Show this message") do
      puts opts
      exit
    end
  end.parse!

  refapi_hash = load_yaml_file_hierarchy(File.expand_path("../../input/grid5000/", File.dirname(__FILE__)))
  total_count = check_cluster_homogeneity(refapi_hash, options)

  # Exit status 0 if all nodes are homogeneous, 1 otherwise
  # (Kernel#exit maps true to success and false to failure).
  exit total_count == 0
end