Une MAJ de sécurité est nécessaire sur notre version actuelle. Elle sera effectuée lundi 02/08 entre 12h30 et 13h. L'interruption de service devrait durer quelques minutes (probablement moins de 5 minutes).

check-cluster-homogeneity.rb 9.08 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
#!/usr/bin/ruby

# This script checks the cluster homogeneity

# Refuse to run on interpreters older than Ruby 2.1.
# The version is compared component by component as integers: a plain String
# comparison is lexicographic and would, for instance, rank "10.0" below "2.1".
if (RUBY_VERSION.split('.').map { |part| part.to_i } <=> [2, 1]) < 0
  # abort writes the message to stderr and exits with a non-zero status,
  # so callers do not mistake the refusal for a successful check.
  abort "This script requires ruby >= 2.1"
end

require 'pp'
require 'fileutils'
require 'pathname'
require 'hashdiff'
14 15 16
require 'json'
require 'uri'
require 'net/https'
17

18 19
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '../../lib')))
require 'refrepo/input_loader'
20

21 22 23 24 25
# Builds the list of diff entries that are ignored for every cluster.
#
# Each entry is a diff operation sign ("~", "+" or "-") immediately followed
# by the dotted path of the property, e.g. "~network_adapters.eth0.mac".
# Callers match an entry against the concatenation of the first two fields
# of a diff.
#
# @return [Array<String>] the static entries, followed by the per-interface
#   network entries (eth0..eth5, each followed by its kavlan-1..kavlan-21
#   entries), the per-disk storage entries (sda..sdf) and the InfiniBand
#   entries (ib0, ib1, ib0.8100)
def global_ignore_keys
  #
  # Global ignore keys
  #
  ignore_keys = %w(
    ~chassis.serial

    ~network_adapters.bmc.ip
    ~network_adapters.bmc.mac
    ~network_adapters.bmc.network_address
    ~network_adapters.bmc.switch
    ~network_adapters.bmc.switch_port

    ~network_adapters.myri0.ip
    ~network_adapters.myri0.ip6
    ~network_adapters.myri0.mac
    ~network_adapters.myri0.network_address

    ~network_adapters.ib0.mac
    ~network_adapters.ib1.mac

    ~pdu
    ~pdu.port
    ~pdu.uid
    ~pdu[0]
    ~pdu[1]

    ~supported_job_types.max_walltime

    ~mic.ip
    ~mic.mac

    +status
    -status
  )

  # Templates: the ".eth.", ".sd." and "IB_IF" placeholders are replaced by
  # the concrete device name in the loops below.
  ignore_netkeys = %w(
    ~network_adapters.eth.rate
    ~network_adapters.eth.name
    ~network_adapters.eth.ip
    -network_adapters.eth.ip
    ~network_adapters.eth.ip6
    -network_adapters.eth.ip6
    ~network_adapters.eth.mac
    ~network_adapters.eth.network_address
    ~network_adapters.eth.switch
    ~network_adapters.eth.switch_port
  )

  ignore_stokeys = %w(
    ~storage_devices.sd.model
    ~storage_devices.sd.firmware_version
    ~storage_devices.sd.rev
    -storage_devices.sd.rev
    ~storage_devices.sd.size
    ~storage_devices.sd.timeread
    ~storage_devices.sd.timewrite
    ~storage_devices.sd.vendor
    ~storage_devices.sd.by_id
    ~storage_devices.sd.by_path
  )

  ignore_ibkeys = %w(
    ~network_adapters.IB_IF.guid
    ~network_adapters.IB_IF.hwid
    ~network_adapters.IB_IF.ip
    ~network_adapters.IB_IF.ip6
    ~network_adapters.IB_IF.line_card
    ~network_adapters.IB_IF.position
    +network_adapters.IB_IF.version
  )

  ib_interfaces = [
    'ib0',
    'ib1',
    'ib0.8100'
  ]

  (0..5).each do |eth|
    ignore_keys.concat(ignore_netkeys.map { |key| key.gsub('.eth.', ".eth#{eth}.") })

    (1..21).each do |kavlan|
      ignore_keys << "~kavlan.eth#{eth}.kavlan-#{kavlan}"
    end
  end

  ('a'..'f').each do |sd|
    ignore_keys.concat(ignore_stokeys.map { |key| key.gsub('.sd.', ".sd#{sd}.") })
  end

  ib_interfaces.each do |ib_if|
    ignore_keys.concat(ignore_ibkeys.map { |key| key.gsub('IB_IF', ib_if) })
  end

  ignore_keys
end

122 123 124 125 126 127
# Reads an ERB-templated YAML file and returns its parsed content.
#
# The template is rendered with this method's binding, the result is parsed
# as YAML, and `expand_square_brackets` is invoked on the parsed object when
# it is truthy.
#
# @param filename [String] path of the .yaml.erb file to load
# @return [Object] the parsed YAML document (falsy for an empty document)
def cluster_ignore_keys(filename)
  file_hash = YAML.load(ERB.new(File.read(filename)).result(binding))
  # Nothing to expand when the document is empty.
  return file_hash unless file_hash

  file_hash.expand_square_brackets
  file_hash
end

128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
# Lists the nodes of a site whose OAR resources are in the "Dead" state.
#
# The resources are downloaded from the OAR API of the site, through the
# Grid'5000 API (api.grid5000.fr), over HTTPS.
#
# @param site_uid [String] site identifier, inserted as-is in the API URL
# @param options [Hash] recognised keys:
#   :verbose => truthy to print progress messages,
#   :api     => optional Hash with :user and :pwd, used for HTTP basic
#               authentication (needed from outside the G5K network)
# @return [Array<String>] short host names (the part of "network_address"
#   before the first dot), each listed once
# @raise [RuntimeError] when the API does not answer with HTTP status 200
def get_site_dead_nodes(site_uid, options)
  api_uri = URI.parse('https://api.grid5000.fr/stable/sites/' + site_uid + '/internal/oarapi/resources/details.json?limit=999999')

  # Download the OAR properties from the OAR API (through G5K API)
  puts "Downloading OAR resources properties from #{api_uri} ..." if options[:verbose]
  http = Net::HTTP.new(api_uri.host, Net::HTTP.https_default_port)
  http.use_ssl = true
  request = Net::HTTP::Get.new(api_uri.request_uri)

  # For outside g5k network access.
  # :api may be missing when the caller did not configure any credentials.
  api_options = options[:api] || {}
  if api_options[:user] && api_options[:pwd]
    request.basic_auth(api_options[:user], api_options[:pwd])
  end

  response = http.request(request)
  raise "Failed to fetch resources properties from API: \n#{response.body}\n" unless response.code.to_i == 200
  puts '... done' if options[:verbose]
  oarnodes = JSON.parse(response.body)

  # Adapt from the format of the OAR API: the resources are either the
  # document itself or wrapped in an "items" entry.
  oarnodes = oarnodes['items'] if oarnodes.is_a?(Hash) && oarnodes.key?('items')

  dead_nodes = []
  oarnodes.each do |node|
    next unless node['state'] == 'Dead'

    # Several resources may share one network address: compare and store the
    # short name so that each node is reported only once.
    short_name = node['network_address'].to_s.split('.')[0]
    next if short_name.nil? || dead_nodes.include?(short_name)

    dead_nodes << short_name
  end
  dead_nodes
end

161 162 163
# Counts the differences between successive nodes of each cluster.
#
# For every selected site and cluster, each node is diffed (HashDiff) against
# the previous node that was neither retired nor reported dead by OAR.
# Differences listed in the global ignore keys or in the per-node ignore keys
# are dropped; the remaining ones are counted and, in verbose mode, printed.
#
# @param refapi_hash [Hash] NOTE(review): this argument is currently not
#   used, the input hierarchy is reloaded from ../../input/grid5000/ below.
#   Confirm whether that is intentional before relying on the parameter.
# @param options [Hash] recognised keys:
#   :verbose  => truthy to print the differences,
#   :sites    => when present, only these site uids are checked,
#   :clusters => when present, only these cluster uids are checked;
#   the Hash is also passed to get_site_dead_nodes
# @return [Array] [total_count, count], where count[site_uid][cluster_uid]
#   is the number of remaining differences in that cluster
def cluster_homogeneity(refapi_hash, options = {:verbose => false})
  verbose = options[:verbose]

  if verbose
    puts "The change set is represented using the following syntax:"
    puts '  [["+", "path.to.key1", value],          # new key'
    puts '   ["-", "path.to.key2", value],          # missing key'
    puts '   ["~", "path.to.key3", value1, value2]] # modified value'
    puts ''
  end

  ignore_keys  = global_ignore_keys()
  cignore_keys = cluster_ignore_keys(File.expand_path("../input-validators/check-cluster-homogeneity.yaml.erb", File.dirname(__FILE__)))

  input_data_dir = "../../input/grid5000/"
  refapi_hash = load_yaml_file_hierarchy(File.expand_path(input_data_dir, File.dirname(__FILE__)))
  count = {}
  total_count = 0

  refapi_hash["sites"].sort.each do |site_uid, site|
    next if options.key?(:sites) && !options[:sites].include?(site_uid)

    site_dead_nodes = get_site_dead_nodes(site_uid, options)

    # Per-node ignore keys of this site; nil when the file defines none.
    site_ignore_keys = cignore_keys.is_a?(Hash) ? cignore_keys[site_uid] : nil

    count[site_uid] = {}

    site["clusters"].sort.each do |cluster_uid, cluster|
      next if options.key?(:clusters) && !options[:clusters].include?(cluster_uid)

      count[site_uid][cluster_uid] = 0

      refnode_uid = nil
      refnode = nil

      cluster["nodes"].each_sort_by_node_uid do |node_uid, node|
        next if node['status'] == 'retired' || site_dead_nodes.include?(node_uid)

        # The first usable node only serves as the initial reference.
        if !refnode
          refnode = node
          refnode_uid = node_uid
          next
        end

        diffs = HashDiff.diff(refnode, node)

        # Hack HashDiff output for arrays:
        #[["-", "pdu[1]", {"uid"=>"graphene-pdu9", "port"=>24}],
        # ["-", "pdu[0]", {"uid"=>"graphene-pdu9", "port"=>23}],
        # ["+", "pdu[0]", {"uid"=>"graphene-pdu9", "port"=>21}],
        # ["+", "pdu[1]", {"uid"=>"graphene-pdu9", "port"=>22}]]
        # => should be something like this:
        # [["~", "pdu[0]", {"uid"=>"graphene-pdu9", "port"=>23}, {"uid"=>"graphene-pdu9", "port"=>21}],
        #  ["~", "pdu[1]", {"uid"=>"graphene-pdu9", "port"=>24}, {"uid"=>"graphene-pdu9", "port"=>22}]]
        by_path = diffs.select { |diff| diff[0] != '~' }.group_by { |diff| diff[1] }
        by_path.each do |path, entries|
          by_sign = entries.group_by { |diff| diff[0] }
          next unless by_sign.key?('-') && by_sign.key?('+')

          removed = by_sign['-'][0][2]
          added = by_sign['+'][0][2]
          #puts "Warning: #{node_uid}: convert +/- -> ~ for #{path}"
          diffs.delete(["-", path, removed])
          diffs.delete(["+", path, added])
          diffs << ["~", path, removed, added]
        end
        # end of hack

        # Remove keys that are specific to each nodes (ip, mac etc.)
        ikeys = site_ignore_keys.is_a?(Hash) ? site_ignore_keys[node_uid] : nil
        diffs.reject! do |diff|
          signed_path = diff[0] + diff[1]
          ignore_keys.include?(signed_path) || (ikeys && ikeys.include?(signed_path))
        end

        if verbose && !diffs.empty?
          puts "Differences between #{refnode_uid} and #{node_uid}:"
          pp diffs
        end

        total_count += diffs.size
        count[site_uid][cluster_uid] += diffs.size

        # Remove the following two lines if you want to compare each node to the first cluster node
        refnode_uid = node_uid
        refnode = node
      end
    end
  end

  return [total_count, count]
end

253 254
# Runs cluster_homogeneity and prints a report on standard output.
#
# In verbose mode the details are printed by cluster_homogeneity itself and
# only a trailing blank line is added here. Otherwise the per-site,
# per-cluster counts are dumped as YAML, followed by a hint about '-v'.
#
# @param refapi_hash [Hash] passed unchanged to cluster_homogeneity
# @param options [Hash] passed unchanged to cluster_homogeneity;
#   :verbose selects the output mode
# @return [Integer] total number of differences found
def check_cluster_homogeneity(refapi_hash, options = {:verbose => false})
  verbose = options[:verbose]
  puts "Differences found between successive nodes, per cluster:\n\n"

  total_count, count = cluster_homogeneity(refapi_hash, options)

  if verbose
    puts "\n"
  else
    puts count.to_yaml
    puts "\nUse '-v' option for details."
  end

  return total_count
end

# Command-line entry point: parses the options, runs the check and exits
# with status 0 when no difference was found, 1 otherwise.
if __FILE__ == $0
  require 'optparse'

  # Sites accepted by -s; also the default selection.
  known_sites = %w{grenoble lille luxembourg lyon nancy nantes rennes sophia}

  options = {}
  options[:sites] = known_sites.dup
  options[:api] = {}

  OptionParser.new do |opts|
    opts.banner = "Usage: check-cluster-homogeneity.rb [options]"

    opts.separator ""
    opts.separator "Example: ruby check-cluster-homogeneity.rb -v"

    ###

    opts.separator ""
    opts.separator "Filters:"

    opts.on('-s', '--sites a,b,c', Array, 'Select site(s)',
            "Default: "+known_sites.join(", ")) do |s|
      # Validate against the full list, not against a previous -s selection.
      raise "Wrong argument for -s option." unless (s - known_sites).empty?
      options[:sites] = s
    end

    opts.on('-c', '--clusters a,b,c', Array, 'Select clusters(s). Default: all') do |s|
      options[:clusters] = s
    end

    opts.separator ""
    opts.separator "Common options:"

    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      if v
        # Each -v raises the verbosity level by one.
        options[:verbose] = (options[:verbose] || 0) + 1
      else
        # --no-verbose turns verbose output off instead of increasing it.
        options[:verbose] = false
      end
    end

    opts.on('--api-user user', String, 'HTTP authentication user when outside G5K') do |user|
      options[:api][:user] = user
    end

    opts.on('--api-password pwd', String, 'HTTP authentication password when outside G5K') do |pwd|
      options[:api][:pwd] = pwd
    end

    # Print an options summary.
    opts.on_tail("-h", "--help", "Show this message") do
      puts opts
      exit
    end
  end.parse!

  refapi_hash = load_yaml_file_hierarchy(File.expand_path("../../input/grid5000/", File.dirname(__FILE__)))
  total_count = check_cluster_homogeneity(refapi_hash, options)

  # return 0 if all nodes are homogeneous, 1 otherwise
  exit total_count == 0
end