Skip to content

Instantly share code, notes, and snippets.

@dallasmarlow
Last active December 17, 2015 11:49
Show Gist options
  • Save dallasmarlow/5605403 to your computer and use it in GitHub Desktop.
Save dallasmarlow/5605403 to your computer and use it in GitHub Desktop.
hbck nagios check
#!/usr/bin/env tumblr_ruby
require 'zk'
require 'json'
require 'logger'
require 'socket'
require 'timeout'
require 'nrpe_check'
# NRPE check that runs `hbase hbck` and turns its output into a check status.
#
# The check only does real work on the host whose name appears in the
# ZooKeeper master znode; other hosts exit OK immediately. Every run appends
# a JSON line ({status:, time:}) to a status log, and that log is consulted
# to escalate a warning to critical when every entry recorded within
# `critical_threshold` seconds is a warning.
#
# `check`, `status` and `exit_with_status` are provided by NRPE::Check.
class CheckHBase
  include NRPE::Check

  attr_reader :options, :log

  # Builds the check configuration and opens the rotating status log.
  #
  # @param options [Hash] overrides for the defaults below. Nested hashes
  #   (:log, :zookeeper, :hbck) are merged key by key, so a partial override
  #   such as `log: {location: '/tmp/x.log'}` keeps the remaining defaults.
  def initialize(options = {})
    defaults = {
      timeout: 120,             # seconds
      critical_threshold: 1800, # amount of time (seconds) check should be in warning status
                                # before escalating to critical status
      log: {
        location: '/var/log/hbase/check_hbase.log',
        read_size: 2 ** 20, # 1mb
        max_size: 2 ** 30,  # 1gb
        retention: 3,       # files
        # each log message is a Hash; stamp it with the epoch time and emit one JSON object per line
        format: lambda { |_severity, time, _program_name, message| message.merge(time: time.to_i).to_json + "\n" },
      },
      zookeeper: {
        znode: '/hbase/master',
        host: '0.0.0.0',
        port: 2181,
      },
      hbck: {
        command: 'hbase hbck 2>/dev/null',
        record_path: File.join('/tmp', [:hbck, Time.now.to_i].join('_')),
        keys: [
          'Version',
          'Master',
          'Status',
          'Number of Tables',
          'Number of live region servers',
          'Number of dead region servers',
          'Number of backup masters',
        ],
      },
    }

    # merge one level deep so that overriding a single nested key does not
    # drop its sibling defaults
    @options = defaults.merge(options) do |_key, default, override|
      default.is_a?(Hash) && override.is_a?(Hash) ? default.merge(override) : override
    end

    @log = Logger.new(@options[:log][:location],
                      @options[:log][:retention],
                      @options[:log][:max_size])
    @log.formatter = @options[:log][:format]
  end

  # Checks whether this host is named in the ZooKeeper master znode.
  #
  # @return [Boolean, nil] true/false depending on whether the znode data
  #   contains this machine's hostname; nil when the znode does not exist
  #   (presumably ZK.open returns the block's value - confirm against the zk gem).
  def active_master?
    zookeeper = options[:zookeeper]
    ZK.open([zookeeper[:host], zookeeper[:port]].join(':')) do |zk|
      if zk.exists?(zookeeper[:znode])
        value, _stat = zk.get(zookeeper[:znode])
        value.include?(Socket.gethostname)
      end
    end
  end

  # Reads the tail of the status log and returns the entries that are still
  # inside the escalation window.
  #
  # @param offset [Integer] byte offset to start reading from; defaults to
  #   `read_size` bytes before the end of the log and is clamped to 0.
  # @return [Array<Hash>] parsed entries (symbol keys) whose :time is less
  #   than `critical_threshold` seconds old. Lines that are not a JSON object
  #   with a numeric :time are skipped.
  def status_log_entries(offset = File.size(options[:log][:location]) - options[:log][:read_size])
    # ensure valid offset
    offset = 0 if offset < 0

    # read the last n bytes of status log; File.read returns nil when there
    # is nothing to read at that offset, hence the to_s
    tail = File.read(options[:log][:location], options[:log][:read_size], offset).to_s

    tail.lines.each_with_object([]) do |line, entries|
      entry = parse_log_entry(line)
      next if entry.nil?

      # filter entries from within our time threshold
      entries << entry if Time.now - Time.at(entry[:time]) < options[:critical_threshold]
    end
  end

  # Decides whether a warning should be reported as critical.
  #
  # @return [Boolean] true when there is at least one recent log entry and
  #   every recent entry recorded a warning status.
  def escalate_status?
    entries = status_log_entries
    # statuses are written as symbols but come back from JSON as strings
    !entries.empty? && entries.all? { |entry| entry[:status].to_s == 'warning' }
  end

  # Runs the configured hbck command once (memoized) under a timeout.
  #
  # @return [Hash] {output: String, status: Process::Status}
  # Calls `exit_with_status :critical` if the command exceeds `options[:timeout]`.
  def hbck
    @hbck ||= Timeout.timeout(options[:timeout]) do
      [%x[#{options[:hbck][:command]}], $?]
    end
    Hash[[:output, :status].zip(@hbck)]
  rescue Timeout::Error
    exit_with_status :critical, 'hbck timeout'
  end

  # Extracts the "key: value" summary lines of interest from the hbck output.
  #
  # @return [Hash{String => String}] memoized map of stripped key to stripped
  #   value for every output line that mentions one of `options[:hbck][:keys]`.
  def hbase_status
    @hbase_status ||= hbck[:output].lines.each_with_object({}) do |line, summary|
      next unless options[:hbck][:keys].any? { |wanted| line.include?(wanted) }

      # split on the first colon only so values that contain ':' stay intact
      key, value = line.chomp.split(':', 2)
      next if value.nil? # line mentions a key but is not a "key: value" pair

      summary[key.strip] = value.strip
    end
  end

  # Entry point: evaluates hbck results, records state and reports the status.
  def run
    check do
      status :unknown, 'unable to determine hbck status' # register an initial status

      # exit if in standby state
      exit_with_status :ok, 'not active master, skipping check' unless active_master?

      # check hbase
      check_status, check_message = case
      when hbase_status['Number of dead region servers'].to_i != 0
        [:critical, "dead region servers: #{hbase_status['Number of dead region servers']}"]
      when hbase_status['Status'] != 'OK'
        [:warning, "hbck status: #{hbase_status['Status']}"]
      when hbck[:status].exitstatus != 0
        [:warning, "hbck status: unhealthy"]
      else
        [:ok, hbase_status.to_json]
      end

      unless check_status == :ok
        # record raw hbck output
        unless hbck[:output].empty?
          File.open(options[:hbck][:record_path], 'w') do |file|
            file.write hbck[:output]
          end
        end

        # escalate status if in prolonged warning state
        check_status = :critical if check_status == :warning && escalate_status?
      end

      # NOTE(review): the escalated status is what gets logged, so the next
      # run will find a non-warning entry in the window - confirm intended.
      log.info status: check_status
      status check_status, check_message
    end
  end

  private

  # Parses one status-log line.
  #
  # @param line [String] a raw line from the status log
  # @return [Hash, nil] the entry with symbol keys, or nil when the line is
  #   not a JSON object carrying a numeric :time.
  def parse_log_entry(line)
    # parse entry if valid json
    return nil unless line =~ /^{.+}$/

    entry = JSON.parse(line, symbolize_names: true)
    entry.is_a?(Hash) && entry[:time].is_a?(Numeric) ? entry : nil
  rescue JSON::ParserError
    nil
  end
end
# Script entry point: build the check with its default options and execute it.
hbase_check = CheckHBase.new
hbase_check.run
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment