Created
June 18, 2013 15:56
-
-
Save joemiller/5806570 to your computer and use it in GitHub Desktop.
a meta-check for sensu that creates many other checks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# This is a special meta-check. It runs ping checks against all hosts in
# the /endpoints API and sends individual results directly to sensu-client via
# the udp/3030 client socket. This is different from the normal sensu check model,
# where individual scripts run and their exit status and output are used to create
# a single event.
#
# The reason for this check is to be able to dynamically ping a list of hosts
# without the race conditions and timing issues involved with creating individual
# sensu check definitions using chef.
#
require 'rubygems'
require 'json'
require 'openssl'
require 'shellwords'
require 'socket'
require 'popen4'
require 'forkmanager' # gem install parallel-forkmanager
require 'rest-client'
require 'sensu-plugin/check/cli'
class PantheonCheckPingEndpoints < Sensu::Plugin::Check::CLI | |
# option :host, :short => '-h HOST', :long => "--host HOST", :required => true

# fping thresholds. The RTT thresholds stay nil (disabled) unless given; the
# loss thresholds default to 0 lost packets.
option :critical_rtt, :short => '-c MS', :long => "--critical-rtt MS", :default => nil, :proc => Proc.new { |c| c.to_f }
option :warning_rtt, :short => '-w MS', :long => "--warning-rtt MS", :default => nil, :proc => Proc.new { |w| w.to_f }
option :critical_loss, :long => "--critical-loss COUNT", :default => 0, :proc => Proc.new { |c| c.to_f }
option :warning_loss, :long => "--warning-loss COUNT", :default => 0, :proc => Proc.new { |w| w.to_f }

# extra arguments appended verbatim to the fping command line
option :options, :short => '-f OPTIONS', :long => "--fping-args OPTIONS", :default => nil
option :verbose, :short => '-v', :long => "--verbose", :boolean => true, :default => false

# sensu handler name attached to every event that is sent
option :handler, :short => '-l HANDLER', :long => '--handler HANDLER', :default => 'default'

# the private IP of an endpoint is only pinged when its zone equals this value
option :zone, :short => '-z ZONE', :long => '--zone ZONE', :required => true

# maximum number of concurrent child processes
option :procs, :short => '-p NUM_PROCS', :long => '--procs NUM_PROCS', :default => 50, :proc => Proc.new { |p| p.to_i }

# pantheon api
option :api, :long => '--api API_URL', :default => 'https://redacted:443'
# Converted to an integer so that a value given on the command line (a String)
# has the same type as the default.
option :timeout, :long => '--api-timeout SECONDS', :default => 30, :proc => Proc.new { |t| t.to_i }
option :client_cert, :long => '--client-cert FILE', :default => 'cert.pem'
option :ca_file, :long => '--ca-file FILE', :default => 'ca.pem'
# Performs a GET against the Pantheon API using the configured client
# certificate.
#
# resource - path (and query string) appended to config[:api].
# jsonify  - when true (the default) the response is parsed as JSON with
#            symbolized keys; otherwise the raw response is returned.
#
# Every rescued failure is reported through the plugin's `warning` method.
def pantheon_api(resource, jsonify=true)
  # NOTE(review): the certificate and the private key are both loaded from
  # config[:client_cert], so that file presumably contains both PEM blocks.
  client_pem = File.read(config[:client_cert])
  request = RestClient::Resource.new(config[:api] + resource, {
    :timeout         => config[:timeout],
    :ssl_client_cert => OpenSSL::X509::Certificate.new(client_pem),
    :ssl_client_key  => OpenSSL::PKey::RSA.new(client_pem),
    :ssl_ca_file     => config[:ca_file],
    # Verify the server certificate against :ssl_ca_file. VERIFY_NONE made the
    # CA file pointless and accepted any certificate.
    :verify_ssl      => OpenSSL::SSL::VERIFY_PEER
  })
  response = request.get
  jsonify ? JSON.parse(response, :symbolize_names => true) : response
rescue Errno::ECONNREFUSED
  warning "Connection refused"
# The specific rest-client errors are rescued before RequestFailed: per
# rest-client's exception hierarchy the HTTP status errors derive from
# RequestFailed, so listing it first made the Unauthorized branch unreachable.
rescue RestClient::RequestTimeout
  warning "Connection timed out"
rescue RestClient::Unauthorized
  warning "Missing or incorrect Pantheon API credentials"
rescue RestClient::RequestFailed
  warning "Request failed"
rescue JSON::ParserError
  warning "Pantheon API returned invalid JSON"
end
# Returns the parsed response of the /endpoints API resource (non-extended
# listing, tagged with source=check_ping).
def endpoints
  resource = '/endpoints?extended=0&source=check_ping'
  pantheon_api(resource)
end
# Sends +msg+ followed by a newline as a single UDP datagram to
# 127.0.0.1:3030 (the sensu-client socket mentioned in the file header).
#
# Returns the number of bytes sent.
def sensu_client_socket(msg)
  socket = UDPSocket.new
  socket.send(msg + "\n", 0, '127.0.0.1', 3030)
ensure
  # Close the descriptor explicitly instead of leaving one open socket per
  # event until garbage collection.
  socket.close if socket
end
# Builds one check result and passes it, JSON-encoded, to sensu_client_socket.
#
# check_name - value of the 'name' field.
# status     - numeric status (0, 1 or 2 as used by the callers below).
# label      - prefix placed in front of the message in 'output'.
# msg        - human readable message.
def emit_check_result(check_name, status, label, msg)
  result = {
    'name'    => check_name,
    'status'  => status,
    'output'  => "#{label}: #{msg}",
    'handler' => config[:handler]
  }
  sensu_client_socket result.to_json
end
private :emit_check_result

# Sends a status 0 result for +check_name+.
def send_ok(check_name, msg)
  emit_check_result(check_name, 0, 'OK', msg)
end

# Sends a status 1 result for +check_name+.
def send_warning(check_name, msg)
  emit_check_result(check_name, 1, 'WARNING', msg)
end

# Sends a status 2 result for +check_name+.
def send_critical(check_name, msg)
  emit_check_result(check_name, 2, 'CRITICAL', msg)
end
# Runs `fping -s` against +host+, appending config[:options] when set.
#
# Returns [cmd, exit_status, stats, errors] where +stats+ is everything read
# from the command's stdout, +errors+ everything read from stderr (nil when
# stderr could not be read) and +exit_status+ is nil when popen4 returned no
# status object.
def run_fping(host)
  # The host value ends up in a shell command line, so escape it.
  # config[:options] is intentionally passed through verbatim because it may
  # carry several fping arguments.
  cmd = ['fping', '-s', Shellwords.escape(host.to_s), config[:options]].compact.join(' ')
  puts "Command:\n#{cmd}" if config[:verbose]

  stats = nil
  errors = nil
  result = POpen4.popen4(cmd) do |_stdin, stdout, stderr, _pid|
    stats = stdout.read
    begin
      errors = stderr.read
    rescue StandardError
      # stderr is not always open for reading.
    end
  end
  exit_status = result ? result.exitstatus : nil

  puts "Output:\n#{stats}" if config[:verbose]
  puts "Exit Status:\n#{exit_status}" if config[:verbose]
  [cmd, exit_status, stats, errors]
end
# Extracts the "max round trip time" value (in ms) from fping summary text.
#
# stats - text containing a line such as "0.12 ms (max round trip time)".
#
# Returns the value as a Float. Raises ArgumentError when the line is absent,
# instead of failing with a NoMethodError on nil.
def get_max_rtt(stats)
  # An integer or a single decimal fraction. The previous character class
  # [\.\d+]* also accepted '+' and repeated dots.
  found = stats.to_s.match(/(\d+(?:\.\d+)?) ms \(max round trip time\)/)
  raise ArgumentError, "no max round trip time found in fping output" unless found
  found[1].to_f
end
# Computes how many echo requests went unanswered from fping summary text.
#
# stats - text containing the "<n> ICMP Echos sent" and
#         "<n> ICMP Echo Replies received" lines.
#
# Returns sent minus received as an Integer. Raises ArgumentError when either
# counter is missing.
#
# The previous version had no capture group (so both numbers were read as 0)
# and matched the "sent" line twice, which made the result always 0.
def get_lost_packet_count(stats)
  text = stats.to_s
  sent = text[/(\d+) ICMP Echos sent/, 1]
  received = text[/(\d+) ICMP Echo Replies received/, 1]
  raise ArgumentError, "no ICMP echo counters found in fping output" unless sent && received
  sent.to_i - received.to_i
end
# Pings +host+ via run_fping and sends exactly one result named +check_name+
# to the sensu client.
#
# check_name - name of the event to send.
# hostname   - only used in verbose output.
# host       - address handed to fping.
#
# Exit status 0 is evaluated against the configured RTT / loss thresholds,
# with critical thresholds checked before warning thresholds so a warning can
# never hide a critical condition. Any other exit status maps to a fixed
# message.
def ping_host(check_name, hostname, host)
  cmd, exit_status, stats, errors = run_fping(host)
  puts "results from #{host} #{hostname}: #{exit_status}, #{stats}" if config[:verbose]

  case exit_status
  when 0
    begin
      # NOTE(review): fping is documented to print its -s summary on stderr,
      # so both captured streams are searched - confirm against the installed
      # fping version.
      summary = [stats, errors].compact.join("\n")
      max_rtt = get_max_rtt(summary)
      lost_packets = get_lost_packet_count(summary)

      if config[:critical_rtt] && max_rtt > config[:critical_rtt]
        send_critical check_name, "Host '#{host}' reached in #{max_rtt} ms, which is greater than specified RTT of #{config[:critical_rtt]} ms"
      elsif config[:critical_loss] && lost_packets > config[:critical_loss]
        send_critical check_name, "Host '#{host}' dropped #{lost_packets}, which is greater than allowed loss of #{config[:critical_loss]} packet"
      elsif config[:warning_rtt] && max_rtt > config[:warning_rtt]
        # Must go through send_warning like every other branch: the plugin's
        # own `warning` reports on the meta-check, not on +check_name+.
        send_warning check_name, "Host '#{host}' reached in #{max_rtt} ms, which is greater than specified RTT of #{config[:warning_rtt]} ms"
      elsif config[:warning_loss] && lost_packets > config[:warning_loss]
        send_warning check_name, "Host '#{host}' dropped #{lost_packets}, which is greater than allowed loss of #{config[:warning_loss]} packet"
      else
        send_ok check_name, "Host '#{host}' reached in #{max_rtt} ms dropping #{lost_packets} packets"
      end
    rescue StandardError
      send_critical check_name, "Error extracting results: [#{cmd}, #{exit_status}, #{stats}, #{errors}]"
    end
  when 1
    send_critical check_name, "Host '#{host}' is unreachable"
  when 2
    send_warning check_name, "Invalid IP address: #{host}"
  when 3
    send_warning check_name, "Invalid fping command: #{cmd}"
  when 4
    send_warning check_name, "Fping system call error: #{cmd}"
  when nil
    send_warning check_name, "Cannot locate 'fping', please add to your system path."
  else
    # Previously any other status produced no event at all.
    send_warning check_name, "Unexpected fping exit status #{exit_status.inspect}: #{cmd}"
  end
end
# this is the main method executed in the child processes
#
# uuid - endpoint identifier, used only in log output and messages.
# meta - Hash of endpoint attributes; :host, :hostname, :pool, :zone and
#        :private_ip are read.
#
# Endpoints without :host or :hostname are skipped. Endpoints whose pool is
# 'down' get an OK result for both check names; all others are pinged on
# :host, and on :private_ip as well when their zone matches config[:zone].
def process_endpoint(uuid, meta)
  puts "in child process: pid: #{$$}, endpoint: #{uuid}" if config[:verbose]

  if meta[:host].nil? || meta[:hostname].nil?
    puts "skipping endpoint #{uuid}, missing 'host' or 'hostname' attributes."
    return
  end

  public_ip_check_name = "#{meta[:hostname]}_ping_check"
  private_ip_check_name = "#{meta[:hostname]}_private_ip_ping_check"

  if meta[:pool] == 'down'
    # endpoint is marked down, cleanup any open alerts in sensu by sending an 'OK' event
    send_ok public_ip_check_name, "host is marked down. no ping necessary."
    send_ok private_ip_check_name, "host is marked down. no ping necessary."
    return
  end

  ping_host public_ip_check_name, meta[:hostname], meta[:host]

  # only check private_ip if the endpoint is in the same zone specified by the '-z' arg
  return unless config[:zone] == meta[:zone]

  if meta[:private_ip].nil?
    # Without this guard fping would be started with an empty target.
    send_warning private_ip_check_name, "Endpoint #{uuid} has no 'private_ip' attribute, cannot ping private address."
  else
    ping_host private_ip_check_name, meta[:hostname], meta[:private_ip]
  end
end
# Check entry point: hands every endpoint to process_endpoint through a
# Parallel::ForkManager limited to config[:procs] processes, waits for all
# children and then reports OK for the meta-check itself.
def run
  fork_manager = Parallel::ForkManager.new(config[:procs])

  endpoints.each do |uuid, meta|
    # start blocks until a new process slot is available; a truthy return
    # value means this iteration has nothing more to do here.
    next if fork_manager.start(uuid)

    process_endpoint(uuid, meta)
    fork_manager.finish(0)
  end

  fork_manager.wait_all_children
  ok "Finished ping checks."
end
end |
How do you handle state changes on the handler side? For example, suppose you have two servers and you cannot ping one of them. I think you generate one event for "critical" to the handler (a "create" event) and then immediately generate an "ok" to the handler for the other server. The second event would appear as a "resolve" event, because Sensu assumes a 1:1 mapping from check to response.
Did you do something on the handler side to deal with that problem? Thanks for any insight!
Can I use this if I have a list of hosts that I want to ping to check whether they are replying? Also, if one of them fails, would the alert fire for only that host?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@joemiller
I was reading this thread and I need something like what Jarrod did here: https://groups.google.com/forum/#!topic/sensu-users/0YvotW8-doI. I need to check whether X URLs are up and running, and I thought about doing an HTTP request check (requiring a 500 response to be OK). Is there one already created?
Thanks!!