Forked from joemiller/pantheon-check-ping-endpionts.rb
Created
March 11, 2016 16:30
-
-
Save hanynowsky/51f1f46c8dc4d027eceb to your computer and use it in GitHub Desktop.
a meta-check for sensu that creates many other checks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# This is a special meta-check. It runs ping checks against all hosts in
# the /endpoints API and sends individual results directly to sensu-client via
# the udp/3030 client socket. This is different from the normal sensu check model,
# where individual scripts run and their exit status and output are used to create
# a single event.
#
# The reason for this check is to be able to dynamically ping a list of hosts
# without the race conditions and timing issues involved with creating individual
# sensu check definitions using chef.
# | |
require 'rubygems'
require 'json'
require 'shellwords'
require 'popen4'
require 'forkmanager' # gem install parallel-forkmanager
require 'rest-client'
require 'sensu-plugin/check/cli'
class PantheonCheckPingEndpoints < Sensu::Plugin::Check::CLI | |
# Command-line options for the check.
# option :host, :short => '-h HOST', :long => "--host HOST", :required => true

# fping thresholds: round-trip limits are given in milliseconds, loss limits as packet counts.
option :critical_rtt, :short => '-c MS', :long => "--critical-rtt MS", :default => nil, :proc => Proc.new { |c| c.to_f }
option :warning_rtt, :short => '-w MS', :long => "--warning-rtt MS", :default => nil, :proc => Proc.new { |w| w.to_f }
option :critical_loss, :long => "--critical-loss COUNT", :default => 0, :proc => Proc.new { |c| c.to_f }
option :warning_loss, :long => "--warning-loss COUNT", :default => 0, :proc => Proc.new { |w| w.to_f }

# extra arguments appended verbatim to the fping command line
option :options, :short => '-f OPTIONS', :long => "--fping-args OPTIONS", :default => nil
option :verbose, :short => '-v', :long => "--verbose", :boolean => true, :default => false

# handler name attached to every per-host result
option :handler, :short => '-l HANDLER', :long => '--handler HANDLER', :default => 'default'

# private IPs are only pinged for endpoints whose zone equals this value
option :zone, :short => '-z ZONE', :long => '--zone ZONE', :required => true

# number of worker processes handed to the fork manager
option :procs, :short => '-p NUM_PROCS', :long => '--procs NUM_PROCS', :default => 50, :proc => Proc.new { |p| p.to_i }

# pantheon api
option :api, :long => '--api API_URL', :default => 'https://redacted:443'
# Coerced to an integer like --procs: without a proc a value supplied on the
# command line would presumably reach the HTTP client as a String.
option :timeout, :long => '--api-timeout SECONDS', :default => 30, :proc => Proc.new { |t| t.to_i }
option :client_cert, :long => '--client-cert FILE', :default => 'cert.pem'
option :ca_file, :long => '--ca-file FILE', :default => 'ca.pem'
# Performs a GET against the Pantheon API.
#
# resource - path (plus query string) appended to config[:api].
# jsonify  - when true (the default) the response is parsed as JSON with
#            symbolized keys; otherwise the raw response is returned.
#
# Connection, HTTP, timeout, authorization and JSON parsing failures are
# reported through `warning` instead of being raised to the caller.
def pantheon_api(resource, jsonify=true)
  url = config[:api] + resource
  # Certificate and private key are both loaded from the same PEM file.
  # NOTE(review): this presumably expects a combined cert+key file - confirm.
  pem = File.read(config[:client_cert])
  request = RestClient::Resource.new(url, {
    :timeout => config[:timeout],
    :ssl_client_cert => OpenSSL::X509::Certificate.new(pem),
    :ssl_client_key => OpenSSL::PKey::RSA.new(pem),
    :ssl_ca_file => config[:ca_file],
    # NOTE(review): peer verification is switched off even though a CA file
    # is supplied - confirm this is intended.
    :verify_ssl => OpenSSL::SSL::VERIFY_NONE
  })
  response = request.get
  jsonify ? JSON.parse(response, :symbolize_names => true) : response
rescue Errno::ECONNREFUSED
  warning "Connection refused"
# The specific RestClient errors are listed before the generic RequestFailed:
# where they are subclasses of it (believed to be the case in rest-client 2.x)
# the generic clause would otherwise swallow them and hide the precise message.
rescue RestClient::Unauthorized
  warning "Missing or incorrect Pantheon API credentials"
rescue RestClient::RequestTimeout
  warning "Connection timed out"
rescue RestClient::RequestFailed
  warning "Request failed"
rescue JSON::ParserError
  warning "Pantheon API returned invalid JSON"
end
# Fetches the endpoint listing from the Pantheon API.
# Returns whatever pantheon_api yields for the endpoints resource.
def endpoints
  resource = '/endpoints?extended=0&source=check_ping'
  pantheon_api(resource)
end
# Sends one newline-terminated message to UDP port 3030 on 127.0.0.1
# (the sensu-client socket) and returns the value of the socket's send call.
#
# msg - String payload to transmit.
def sensu_client_socket(msg)
  socket = UDPSocket.new
  begin
    socket.send(msg + "\n", 0, '127.0.0.1', 3030)
  ensure
    # Release the descriptor right away instead of leaving one open per result sent.
    socket.close
  end
end
# Builds one check result and hands it, JSON-encoded, to sensu_client_socket.
#
# check_name - value of the result's 'name' field.
# status     - numeric status (0, 1 or 2 as used by the wrappers below).
# label      - prefix placed before the message in the 'output' field.
# msg        - human readable message.
def send_check_result(check_name, status, label, msg)
  result = {
    'name' => check_name,
    'status' => status,
    'output' => "#{label}: #{msg}",
    'handler' => config[:handler]
  }
  sensu_client_socket result.to_json
end

# Reports status 0 for check_name with an 'OK: ' prefixed message.
def send_ok(check_name, msg)
  send_check_result(check_name, 0, 'OK', msg)
end

# Reports status 1 for check_name with a 'WARNING: ' prefixed message.
def send_warning(check_name, msg)
  send_check_result(check_name, 1, 'WARNING', msg)
end

# Reports status 2 for check_name with a 'CRITICAL: ' prefixed message.
def send_critical(check_name, msg)
  send_check_result(check_name, 2, 'CRITICAL', msg)
end
# Runs `fping -s` against a single host and collects what it printed.
#
# host - address to ping. It is shell-escaped because the command is passed
#        to POpen4 as a single string and the value originates from the API.
#        config[:options] is appended as-is so that several arguments can be
#        supplied through it.
#
# Returns [cmd, exit_status, stdout_text, stderr_text]. exit_status is nil
# when POpen4 returns no status object.
#
# NOTE(review): fping is believed to print its -s summary on stderr; confirm
# that stdout really carries the statistics the callers parse.
def run_fping(host)
  cmd = "fping -s #{Shellwords.escape(host)} #{config[:options]}"
  puts "Command:\n#{cmd}" if config[:verbose]

  stats = nil
  errors = nil
  result = POpen4::popen4(cmd) do |_stdin, stdout, stderr, _pid|
    stats = stdout.read
    begin
      errors = stderr.read
    rescue StandardError
      # stderr is not always open for reading.
    end
  end
  exit_status = result ? result.exitstatus : nil

  if config[:verbose]
    puts "Output:\n#{stats}"
    puts "Exit Status:\n#{exit_status}"
  end
  [cmd, exit_status, stats, errors]
end
# Extracts the maximum round trip time from fping statistics text.
#
# stats - text containing a line such as "0.06 ms (max round trip time)".
#
# Returns the value as a Float (milliseconds, per the matched "ms" unit).
# Raises ArgumentError when the line is not present.
def get_max_rtt(stats)
  # An integer with an optional fractional part; the earlier character class
  # also accepted stray '+' signs and repeated dots.
  found = stats.match(/(\d+(?:\.\d+)?) ms \(max round trip time\)/)
  raise ArgumentError, 'max round trip time not found in fping output' unless found
  found[1].to_f
end
# Computes how many echo requests went unanswered according to fping
# statistics text.
#
# stats - text containing the "<n> ICMP Echos sent" and
#         "<n> ICMP Echo Replies received" lines.
#
# Returns sent minus received as an Integer.
# Raises ArgumentError when either counter is missing.
def get_lost_packet_count(stats)
  # Each counter needs its own capture group and its own line: matching the
  # "sent" line twice without a group made the result always 0.
  sent = stats.match(/(\d+) ICMP Echos sent/)
  received = stats.match(/(\d+) ICMP Echo Replies received/)
  unless sent && received
    raise ArgumentError, 'ICMP sent/received counters not found in fping output'
  end
  sent[1].to_i - received[1].to_i
end
# Pings one address with fping and reports the outcome as an individual
# check result through send_ok / send_warning / send_critical.
#
# check_name - name under which the result is reported.
# hostname   - endpoint hostname; only used in the verbose output.
# host       - address handed to run_fping.
def ping_host(check_name, hostname, host)
  cmd, exit_status, stats, errors = run_fping(host)
  puts "results from #{host} #{hostname}: #{exit_status}, #{stats}" if config[:verbose]

  case exit_status
  when 0
    begin
      max_rtt = get_max_rtt(stats)
      lost_packets = get_lost_packet_count(stats)
      # Thresholds are checked in this order and only the first one exceeded is reported.
      if config[:critical_rtt] && max_rtt > config[:critical_rtt]
        send_critical check_name, "Host '#{host}' reached in #{max_rtt} ms, which is greater than specified RTT of #{config[:critical_rtt]} ms"
      elsif config[:warning_rtt] && max_rtt > config[:warning_rtt]
        # Must go through send_warning like every other branch: plain `warning`
        # is what this file uses for the status of the meta-check itself, so the
        # per-host result would never reach the client socket.
        send_warning check_name, "Host '#{host}' reached in #{max_rtt} ms, which is greater than specified RTT of #{config[:warning_rtt]} ms"
      elsif config[:critical_loss] && lost_packets > config[:critical_loss]
        send_critical check_name, "Host '#{host}' dropped #{lost_packets}, which is greater than allowed loss of #{config[:critical_loss]} packet"
      elsif config[:warning_loss] && lost_packets > config[:warning_loss]
        send_warning check_name, "Host '#{host}' dropped #{lost_packets}, which is greater than allowed loss of #{config[:warning_loss]} packet"
      else
        send_ok check_name, "Host '#{host}' reached in #{max_rtt} ms dropping #{lost_packets} packets"
      end
    rescue StandardError
      # Anything that goes wrong while parsing or comparing is reported as critical.
      send_critical check_name, "Error extracting results: [#{cmd}, #{exit_status}, #{stats}, #{errors}]"
    end
  when 1
    send_critical check_name, "Host '#{host}' is unreachable"
  when 2
    send_warning check_name, "Invalid IP address: #{host}"
  when 3
    send_warning check_name, "Invalid fping command: #{cmd}"
  when 4
    send_warning check_name, "Fping system call error: #{cmd}"
  when nil
    # run_fping yields a nil status when no process status was available.
    send_warning check_name, "Cannot locate 'fping', please add to your system path."
  end
end
# Main method executed in the child processes: handles a single endpoint.
#
# uuid - endpoint identifier, used only in log output.
# meta - hash of endpoint attributes; :host, :hostname, :pool, :zone and
#        :private_ip are read.
#
# Endpoints lacking :host or :hostname are skipped. Endpoints whose pool is
# 'down' get OK results for both checks; all others are pinged.
def process_endpoint(uuid, meta)
  puts "in child process: pid: #{$$}, endpoint: #{uuid}" if config[:verbose]

  if [meta[:host], meta[:hostname]].any?(&:nil?)
    puts "skipping endpoint #{uuid}, missing 'host' or 'hostname' attributes."
    return
  end

  hostname = meta[:hostname]
  public_check = "#{hostname}_ping_check"
  private_check = "#{hostname}_private_ip_ping_check"

  if meta[:pool] == 'down'
    # endpoint is marked down: clear any open alerts in sensu by sending 'OK' events
    down_note = "host is marked down. no ping necessary."
    send_ok public_check, down_note
    send_ok private_check, down_note
  else
    ping_host public_check, hostname, meta[:host]
    # the private address is only checked when the endpoint sits in the zone given via '-z'
    ping_host private_check, hostname, meta[:private_ip] if config[:zone] == meta[:zone]
  end
end
# Entry point of the check: hands every endpoint to the fork manager, waits
# for all children, then reports the meta-check itself as OK.
def run
  fork_manager = Parallel::ForkManager.new(config[:procs])

  endpoints.each do |uuid, meta|
    # start blocks until a new process slot is available; a truthy result
    # means there is nothing more to do here for this endpoint.
    next if fork_manager.start(uuid)

    process_endpoint(uuid, meta)
    fork_manager.finish(0)
  end

  fork_manager.wait_all_children
  ok "Finished ping checks."
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment