-
-
Save sodabrew/8610563 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env tumblr_ruby
# Third-party gems: zk (ZooKeeper client, used via ZK.open) and nrpe_check
# (presumably the source of NRPE::Check -- confirm against the gem).
require 'zk'
require 'json'
require 'logger'
require 'socket'
require 'timeout'
require 'nrpe_check'
# NRPE check that runs `hbase hbck` and reports the result as a check status.
#
# The check only does real work on the host named in the HBase master znode;
# any other host exits early with an :ok status. Each completed run appends
# its final status to a JSON-lines log, and that log is consulted to escalate
# a warning to critical once every recent entry is a warning.
#
# `check`, `status` and `exit_with_status` are not defined in this class;
# presumably they come from NRPE::Check -- confirm against that library.
class CheckHBase
  include NRPE::Check

  attr_reader :options, :log

  # Builds the effective options and opens the status log.
  #
  # @param options [Hash] overrides for the defaults. Nested hashes are merged
  #   key by key, so overriding a single nested setting (for example
  #   `log: {location: '/tmp/check.log'}`) keeps the remaining defaults of
  #   that section instead of discarding them.
  def initialize(options = {})
    @options = deep_merge(default_options, options)
    log_options = @options[:log]
    # Logger.new(device, shift_age, shift_size): rotate by size, keeping
    # `retention` old files.
    @log = Logger.new(log_options[:location], log_options[:retention], log_options[:max_size])
    @log.formatter = log_options[:format]
  end

  # Tells whether this host is named in the HBase master znode.
  #
  # @return [Boolean, nil] true when the znode data contains this machine's
  #   hostname, false when it does not, nil when the znode does not exist
  def active_master?
    zookeeper = options[:zookeeper]
    address = [zookeeper[:host], zookeeper[:port]].join(':')
    ZK.open(address) do |zk|
      if zk.exists?(zookeeper[:znode])
        data, _stat = zk.get(zookeeper[:znode])
        # to_s guards against a znode that exists but carries no data
        data.to_s.include?(Socket.gethostname)
      end
    end
  end

  # Reads the tail of the status log and returns the recent entries.
  #
  # @param offset [Integer] byte offset to start reading from; defaults to
  #   `read_size` bytes before the end of the log (clamped to 0)
  # @return [Array<Hash>] parsed entries (symbol keys) whose `:time` lies
  #   within the last `critical_threshold` seconds, in file order
  def status_log_entries(offset = File.size(options[:log][:location]) - options[:log][:read_size])
    # ensure valid offset
    offset = 0 if offset < 0

    # read the last n bytes of the status log; File.read returns nil when
    # there is nothing to read at the offset, hence to_s
    tail = File.read(options[:log][:location], options[:log][:read_size], offset).to_s
    now = Time.now

    tail.lines.each_with_object([]) do |line, entries|
      # Only whole JSON-object lines are considered. This also drops the
      # partial first line produced by starting the read mid-file.
      next unless line =~ /^\{.+\}$/

      entry = parse_log_entry(line)
      next unless entry.is_a?(Hash) && entry[:time].is_a?(Numeric)

      # filter entries from within our time threshold
      entries << entry if now - Time.at(entry[:time]) < options[:critical_threshold]
    end
  end

  # Tells whether a warning should be escalated to critical.
  #
  # @return [Boolean] true when there is at least one recent log entry and
  #   every recent entry recorded a warning status
  def escalate_status?
    entries = status_log_entries # read and parse the log only once
    # Statuses are logged as symbols but come back from JSON as strings, so
    # the comparison is done on the string form.
    !entries.empty? && entries.all? { |entry| entry[:status].to_s == 'warning' }
  end

  # Runs the configured hbck command once (memoized) under a timeout.
  #
  # @return [Hash] `:output` (captured stdout) and `:status` (the value of
  #   `$?` right after the command ran)
  def hbck
    @hbck ||= Timeout.timeout(options[:timeout]) do
      [%x[#{options[:hbck][:command]}], $?]
    end
    output, process_status = @hbck
    { output: output, status: process_status }
  rescue Timeout::Error
    exit_with_status(:critical, 'hbck timeout')
  end

  # Extracts the configured summary fields from the hbck output (memoized).
  #
  # @return [Hash{String => String}] field name => value for every output
  #   line that mentions one of `options[:hbck][:keys]` and contains a colon
  def hbase_status
    @hbase_status ||= hbck[:output].lines.each_with_object({}) do |line, summary|
      next unless options[:hbck][:keys].any? { |key| line.include?(key) }

      # Split on the first colon only, so values that themselves contain a
      # colon are kept whole; skip lines that have no colon at all.
      name, value = line.chomp.split(':', 2)
      next if value.nil?

      summary[name.strip] = value.strip
    end
  end

  # Performs the check and registers the resulting status and message.
  def run
    check do
      status(:unknown, 'unable to determine hbck status') # register an initial status

      # exit if in standby state
      exit_with_status(:ok, 'not active master, skipping check') unless active_master?

      # check hbase
      check_status, check_message =
        if hbase_status['Number of dead region servers'].to_i != 0
          [:critical, "dead region servers: #{hbase_status['Number of dead region servers']}"]
        elsif hbase_status['Status'] != 'OK'
          [:warning, "hbck status: #{hbase_status['Status']}"]
        elsif hbck[:status].exitstatus != 0
          [:warning, "hbck status: unhealthy"]
        else
          [:ok, hbase_status.to_json]
        end

      if check_status != :ok
        # record raw hbck output
        File.write(options[:hbck][:record_path], hbck[:output]) unless hbck[:output].empty?

        # escalate status if in prolonged warning state
        check_status = :critical if check_status == :warning && escalate_status?
      end

      # NOTE(review): the status logged here is the escalated one, so the run
      # after an escalation sees a non-warning entry and falls back to
      # warning until that entry ages out -- confirm this is intended.
      log.info(status: check_status)
      status(check_status, check_message)
    end
  end

  private

  # Built-in settings, used for anything the caller does not override.
  def default_options
    {
      timeout: 120, # seconds
      critical_threshold: 1800, # amount of time (seconds) check should be in warning status
                                # before escalating to critical status
      log: {
        location: '/var/log/hbase/check_hbase.log',
        read_size: 2 ** 20, # 1mb
        max_size: 2 ** 30, # 1gb
        retention: 3, # files
        # one JSON object per line: the logged hash plus an epoch :time
        format: lambda { |_severity, time, _program_name, message| message.merge(time: time.to_i).to_json + "\n" },
      },
      zookeeper: {
        znode: '/hbase/master',
        host: '0.0.0.0',
        port: 2181,
      },
      hbck: {
        command: 'hbase hbck 2>/dev/null',
        record_path: File.join('/tmp', [:hbck, Time.now.to_i].join('_')),
        keys: [
          'Version',
          'Master',
          'Status',
          'Number of Tables',
          'Number of live region servers',
          'Number of dead region servers',
          'Number of backup masters',
        ],
      },
    }
  end

  # Merges overrides into defaults, recursing only where both sides are hashes.
  def deep_merge(defaults, overrides)
    defaults.merge(overrides) do |_key, default, override|
      default.is_a?(Hash) && override.is_a?(Hash) ? deep_merge(default, override) : override
    end
  end

  # Parses one log line as JSON with symbol keys; nil when it is not valid JSON.
  def parse_log_entry(line)
    JSON.parse(line, symbolize_names: true)
  rescue JSON::ParserError
    nil
  end
end
# Run the check with the default options.
CheckHBase.new.run
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment