Last active
October 10, 2015 21:05
-
-
Save nmilford/726af34ef683d4ef2a79 to your computer and use it in GitHub Desktop.
Simple Diskcheck Daemon
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'fileutils' | |
require 'trollop' | |
require 'pstore' | |
require 'logger' | |
require 'dante' | |
require 'pony' | |
require 'erb' | |
module DiskCheck
  # Performs one pass of disk health checks: every /dev/ backed volume
  # reported by `df` is write-tested and compared against a usage threshold.
  # Problems are logged, mailed out through Pony, and the result of the run is
  # persisted to a PStore file so the next run can log what has recovered.
  class Runner
    # @param opts [Hash] settings keyed by symbol. Keys read here:
    #   :threshold  - Integer percentage; usage above it is a problem
    #   :test_path  - path appended to each mount point for the write test
    #   :hist_file  - PStore file holding the previous run's findings
    #   :alert_to   - alert recipient address
    #   :alert_from - alert sender address
    #   :log_file   - log file path (logging is discarded when nil)
    def initialize(opts = {})
      # Initial structure for storing disk problems.
      @last_run = {}
      @this_run = { :bad_disks => [], :over_thresh => {} }
      @drama = false

      @threshold  = opts[:threshold]
      @test_path  = opts[:test_path]
      @hist_file  = opts[:hist_file]
      @alert_to   = opts[:alert_to]
      @alert_from = opts[:alert_from]

      # It's better than bad, it's good.
      @log_file = opts[:log_file]
      FileUtils.touch(@log_file) if @log_file && !File.exist?(@log_file)
      @log = Logger.new(@log_file)
      @log.level = Logger::DEBUG
      @log.debug("Starting DiskCheck run.")
    end

    # Collects usage figures for every filesystem whose device starts with
    # '/dev/', as reported by `df -Ph`.
    #
    # @return [Hash{String => Hash}] device name => { :size, :used, :avail,
    #   :used_pct (String without the '%'), :mount }
    def disks
      parse_df(`df -Ph`)
    end

    # Tests a volume by actually using it: writes a small file at +path+ and
    # removes it again. In the end, if a volume is writable, it is viable.
    #
    # @param path [String] location of the scratch file to create
    # @return [Boolean] true when the file could be written and deleted,
    #   false when any system-level or IO error prevented that
    def is_writable?(path)
      # Block form guarantees the handle is flushed and closed.
      File.open(path, "w") { |file| file << "writetest" }
      return false unless File.exist?(path)

      File.delete(path)
      true
    rescue SystemCallError, IOError => e
      # Covers ENOENT/EACCES as well as EROFS, ENOSPC, EIO, EISDIR, ... - any
      # of them means the volume cannot be written to.
      @log.debug("Write test on #{path} failed: #{e.message}")
      false
    end

    # @param used_pct [String, Integer] percentage used (converted with to_i)
    # @return [Boolean] true when usage is strictly above the threshold
    def above_thresh?(used_pct)
      @threshold < used_pct.to_i
    end

    # Persists this run's findings under the :last key of the history file.
    def set_history!
      @log.debug("Overwriting old metrics to #{@hist_file}.")
      history = PStore.new(@hist_file)
      history.transaction { history[:last] = @this_run }
    end

    # Loads the previous run's findings into @last_run.
    # NOTE(review): PStore deserializes with Marshal, so the history file
    # should live somewhere only trusted users can write - confirm the
    # deployment location.
    #
    # @return [Hash] the previous findings, or {} when none are stored yet
    def load_history!
      @log.debug("Loading metrics from previous run from #{@hist_file}.")
      history = PStore.new(@hist_file)
      # The pstore is empty on the initial run; fall back to an empty hash so
      # later lookups on @last_run do not hit nil.
      @last_run = history.transaction { history[:last] } || {}
    end

    # Mails a summary of this run's problems, or just logs that all is well.
    # This could easily be hooked into an existing system to emit to NSCA
    # or send JSON to an endpoint somewhere.
    def do_alert
      unless @drama
        @log.debug("No problems detected.")
        return
      end

      subject = "ALERT: Open Disk issues on #{`hostname`.chomp}"
      # The template reads @this_run and @threshold through this binding.
      body = ERB.new(%{
<% if @this_run[:bad_disks].length > 0 %>
The following disks contain unwritable file systems, please investigate.
<% @this_run[:bad_disks].each do |d| %>
<%= d %>
<% end %>
<% end %>
<% if @this_run[:over_thresh].length > 0 %>
The following disks are over the threshold of <%= @threshold %>%, please investigate.
<% @this_run[:over_thresh].each do |k, v| %>
<%= k %> mounted at <%= v[:mount] %> is at <%= v[:used_pct] %>% utilization.
<% end %>
<% end %>
}).result(binding)

      Pony.mail(
        :to      => @alert_to,
        :from    => @alert_from,
        :subject => subject,
        :body    => body
      )
      @log.debug("Alert triggered and sent to #{@alert_to}")
    end

    # Write-tests and threshold-checks every disk, recording problems in
    # @this_run and raising the @drama flag when anything is wrong.
    def do_checks
      disks.each do |disk, info|
        if is_writable?("#{info[:mount]}#{@test_path}")
          @log.debug("#{disk} is writable.")
        else
          @log.error("#{disk} is unwritable.")
          @this_run[:bad_disks] << disk
          @drama = true
        end

        if above_thresh?(info[:used_pct])
          @this_run[:over_thresh][disk] = info
          @log.error("#{disk} (#{info[:mount]}) usage is above threshold of #{@threshold}% at #{info[:used_pct]}%.")
          @drama = true
        else
          @log.debug("#{disk} (#{info[:mount]}) usage is under threshold of #{@threshold}% at #{info[:used_pct]}%.")
        end
      end
    end

    # Logs every problem recorded by the previous run that this run no longer
    # sees. The percentage logged is the one stored by the previous run.
    def check_history
      load_history!

      (@last_run[:over_thresh] || {}).each do |disk, info|
        next if @this_run[:over_thresh].key?(disk)
        @log.debug("#{disk} (#{info[:mount]}) usage is no longer over threshold of #{@threshold}% at #{info[:used_pct]}%.")
      end

      (@last_run[:bad_disks] || []).each do |disk|
        next if @this_run[:bad_disks].include?(disk)
        @log.debug("#{disk} is now writable (or was administratively removed and not detected this run).")
      end
    end

    # Full cycle: check, alert, compare against history, store new history.
    def run
      do_checks
      do_alert
      check_history
      set_history!
      @log.debug("Ending DiskCheck run.")
    end

    private

    # Turns `df -P` style output into the hash returned by #disks.
    # Splitting is limited to six fields so a mount point containing spaces
    # stays in one piece; blank lines and non-/dev/ rows are skipped.
    def parse_df(output)
      output.each_line.with_object({}) do |line, found|
        device, size, used, avail, used_pct, mount = line.split(' ', 6)
        next unless device && device.start_with?('/dev/')

        found[device] = {
          :size     => size,
          :used     => used,
          :avail    => avail,
          :used_pct => used_pct.to_s.delete('%'),
          :mount    => mount && mount.strip
        }
      end
    end
  end
end
# Command-line entry point: parse options, then either stop a running daemon
# (--kill), start one (--daemonize), or perform a single check run.
opts = Trollop::options do
  banner <<-EOS
Checks if volumes are writable or full.
Usage:
diskcheck [options]
Use --help for options
EOS
  opt :daemonize, "Run as a daemon.", :default => false
  opt :kill, "Attempts to kill a running daemon.", :default => false
  opt :frequency, "If daemonized, the run frequency in seconds.", :default => 300
  opt :pid_file, "If daemonized, path to the pid file.", :default => '/var/tmp/diskcheck.pid'
  opt :hist_file, "Metrics history file location.", :default => '/var/tmp/diskcheck.dat'
  opt :log_file, "Log file location.", :default => '/var/tmp/diskcheck.log'
  opt :test_path, "Each volume needs a writable path to test.", :default => '/tmp/writetest'
  opt :alert_to, "Email address to send alerts to.", :default => '[email protected]'
  opt :alert_from, "Email address to send alerts from", :default => "alert@#{`hostname`.chomp}"
  opt :threshold, "Threshold to alert on percentage of disk space used", :default => 75
end

if opts[:kill]
  unless File.exist?(opts[:pid_file])
    puts "Pidfile doesn't exist, should be at #{opts[:pid_file]}."
    exit(-1)
  end

  pid = File.read(opts[:pid_file]).strip.to_i
  # to_i yields 0 for an empty or garbage pidfile, and Process.kill treats a
  # pid of 0 (or a negative pid) as a process *group* - which would include
  # this very process. Refuse anything that is not a positive pid.
  unless pid > 0
    puts "Pidfile #{opts[:pid_file]} does not contain a valid pid."
    exit(-1)
  end

  puts "Attempting to stop diskcheck."
  begin
    Process.kill('INT', pid)
  rescue Errno::ESRCH
    puts "Process not found, are you sure it is still running?"
  ensure
    # The pidfile is removed whether or not the signal could be delivered.
    File.delete(opts[:pid_file])
  end
  exit 0
end

if opts[:daemonize]
  puts "Daemonizing into the background."
  puts "Pidfile is at #{opts[:pid_file]}."
  puts "Logging to #{opts[:log_file]}"
  Dante::Runner.new('diskcheck').execute(
    :daemonize => true,
    :pid_path  => opts[:pid_file],
    :log_path  => opts[:log_file]
  ) do
    loop do
      # A fresh Runner per cycle so findings from one run never leak into the next.
      DiskCheck::Runner.new(opts).run
      sleep(opts[:frequency])
    end
  end
else
  DiskCheck::Runner.new(opts).run
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment