Skip to content

Instantly share code, notes, and snippets.

@nmilford
Last active October 10, 2015 21:05
Show Gist options
  • Save nmilford/726af34ef683d4ef2a79 to your computer and use it in GitHub Desktop.
Save nmilford/726af34ef683d4ef2a79 to your computer and use it in GitHub Desktop.
Simple Diskcheck Daemon
#!/usr/bin/env ruby
require 'rubygems'
require 'fileutils'
require 'trollop'
require 'pstore'
require 'logger'
require 'dante'
require 'pony'
require 'erb'
# Disk health checks: verifies that every mounted /dev/ volume is writable and
# below a usage threshold, e-mails an alert when it is not, and stores the
# outcome of each run so recoveries can be logged on the following run.
module DiskCheck
  # Performs one check / alert / history cycle.
  class Runner
    # ERB source for the alert e-mail body. Rendered against the Runner's
    # binding, so it reads @this_run and @threshold directly.
    ALERT_TEMPLATE = %{
<% if @this_run[:bad_disks].length > 0 %>
The following disks contain unwritable file systems, please investigate.
<% @this_run[:bad_disks].each do |d| %>
<%= d %>
<% end %>
<% end %>
<% if @this_run[:over_thresh].length > 0 %>
The following disks are over the threshold of <%= @threshold %>%, please investigate.
<% @this_run[:over_thresh].each do |k, v| %>
<%= k %> mounted at <%= v[:mount] %> is at <%= v[:used_pct] %>% utilization.
<% end %>
<% end %>
}

    # opts - Hash of settings, read with Symbol keys:
    #        :threshold  - usage percentage above which a disk is reported.
    #        :test_path  - path appended to each mount point for the write test.
    #        :hist_file  - PStore file holding the previous run's results.
    #        :alert_to   - alert recipient address.
    #        :alert_from - alert sender address.
    #        :log_file   - log file path (required; the logger is opened here).
    def initialize(opts = {})
      # Initial structure for storing disk problems.
      @last_run = {}
      @this_run = {}
      @this_run[:bad_disks] = []
      @this_run[:over_thresh] = {}
      @drama = false
      @threshold = opts[:threshold]
      @test_path = opts[:test_path]
      @hist_file = opts[:hist_file]
      @alert_to = opts[:alert_to]
      @alert_from = opts[:alert_from]
      # It's better than bad, it's good.
      @log_file = opts[:log_file]
      # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
      FileUtils.touch(@log_file) unless File.exist?(@log_file)
      @log = Logger.new(@log_file)
      @log.level = Logger::DEBUG
      @log.debug("Starting DiskCheck run.")
    end

    # Parses `df -Ph` output.
    #
    # Returns a Hash keyed by device name (only devices starting with "/dev/"),
    # each value a Hash with :size, :used, :avail, :used_pct (String without
    # the "%" sign) and :mount.
    def disks
      disks = {}
      `df -Ph`.each_line do |line|
        # Limit the split to six fields so a mount point that contains spaces
        # is kept whole instead of being cut at the first space.
        device, size, used, avail, used_pct, mount = line.strip.split(' ', 6)
        # Skips the header, pseudo file systems and blank lines.
        next unless device && device.start_with?('/dev/')
        disks[device] = {
          :size => size,
          :used => used,
          :avail => avail,
          :used_pct => used_pct.to_s.delete('%'),
          :mount => mount
        }
      end
      disks
    end

    # Checks a volume by actually writing to it: creates the file at path,
    # writes a marker, then removes the file again.
    #
    # We can go crazy inventing ways to see if a volume is healthy, or we
    # could just use it. In the end, if a volume is writable, it is viable.
    # We do, however, need a standard place to dump a file to test.
    #
    # Returns true when the file could be written and removed, false when any
    # step failed at the OS level (missing directory, permission denied,
    # read-only or full file system, I/O error, ...).
    def is_writable?(path)
      # Block form closes (and flushes) the handle even when the write fails.
      File.open(path, "w") { |file| file << "writetest" }
      return false unless File.exist?(path)
      File.delete(path)
      true
    rescue SystemCallError, IOError
      # Any Errno::* here means the volume could not be used, which is exactly
      # the condition this check exists to report.
      false
    end

    # Returns true when used_pct (String or Integer) is strictly greater than
    # the configured threshold.
    def above_thresh?(used_pct)
      @threshold < used_pct.to_i
    end

    # Persists this run's results as the :last entry of the history file.
    def set_history!
      @log.debug("Overwriting old metrics to #{@hist_file}.")
      history = PStore.new(@hist_file)
      history.transaction { history[:last] = @this_run }
    end

    # Loads the previous run's results into @last_run and returns them.
    # The pstore is empty on the initial run; an empty Hash is used then so
    # callers can index the result safely.
    def load_history!
      @log.debug("Loading metrics from previous run from #{@hist_file}.")
      history = PStore.new(@hist_file)
      @last_run = history.transaction { history[:last] } || {}
    end

    # Sends the alert e-mail when do_checks flagged a problem; otherwise only
    # logs that nothing was found.
    #
    # This could easily be hooked into an existing system to emit to NSCA
    # or send JSON to an endpoint somewhere.
    def do_alert
      unless @drama
        @log.debug("No problems detected.")
        return
      end

      # Subject and body are only built once we know a mail will be sent.
      subject = "ALERT: Open Disk issues on #{`hostname`.chomp}"
      body = ERB.new(ALERT_TEMPLATE).result(binding)
      Pony.mail(
        :to => @alert_to,
        :from => @alert_from,
        :subject => subject,
        :body => body
      )
      @log.debug("Alert triggered and sent to #{@alert_to}")
    end

    # Runs the write test and the usage test against every disk, recording
    # failures in @this_run and setting @drama when anything is wrong.
    def do_checks
      disks.each do |disk, info|
        if is_writable?("#{info[:mount]}#{@test_path}")
          @log.debug("#{disk} is writable.")
        else
          @log.error("#{disk} is unwritable.")
          @this_run[:bad_disks] << disk
          @drama = true
        end
        if above_thresh?(info[:used_pct])
          @this_run[:over_thresh][disk] = info
          @log.error("#{disk} (#{info[:mount]}) usage is above threshold of #{@threshold}% at #{info[:used_pct]}%.")
          @drama = true
        else
          @log.debug("#{disk} (#{info[:mount]}) usage is under threshold of #{@threshold}% at #{info[:used_pct]}%.")
        end
      end
    end

    # Compares the previous run with this one and logs every disk that was
    # reported last time but is not reported now.
    def check_history
      load_history!
      (@last_run[:over_thresh] || {}).each do |disk, info|
        next if @this_run[:over_thresh].include?(disk)
        # info comes from the previous run, so the percentage logged here is
        # the value recorded back then.
        @log.debug("#{disk} (#{info[:mount]}) usage is no longer over threshold of #{@threshold}% at #{info[:used_pct]}%.")
      end
      (@last_run[:bad_disks] || []).each do |disk|
        next if @this_run[:bad_disks].include?(disk)
        @log.debug("#{disk} is now writable (or was administratively removed and not detected this run).")
      end
    end

    # Executes one full cycle: check, alert, compare with history, save.
    def run
      do_checks
      do_alert
      check_history
      set_history!
      @log.debug("Ending DiskCheck run.")
    end
  end
end
# Command-line options. Every option carries a default, so the script can be
# started without any arguments.
local_hostname = `hostname`.chomp
opts = Trollop.options do
  banner <<-USAGE
Checks if volumes are writable or full.
Usage:
diskcheck [options]
Use --help for options
USAGE
  # Process control.
  opt(:daemonize, "Run as a daemon.", :default => false)
  opt(:kill, "Attempts to kill a running daemon.", :default => false)
  opt(:frequency, "If daemonized, the run frequency in seconds.", :default => 300)
  # File locations.
  opt(:pid_file, "If daemonized, path to the pid file.", :default => '/var/tmp/diskcheck.pid')
  opt(:hist_file, "Metrics history file location.", :default => '/var/tmp/diskcheck.dat')
  opt(:log_file, "Log file location.", :default => '/var/tmp/diskcheck.log')
  opt(:test_path, "Each volume needs a writable path to test.", :default => '/tmp/writetest')
  # Alerting.
  # NOTE(review): the :alert_to default below does not look like a deliverable
  # address (possibly a redaction placeholder) - confirm the intended recipient.
  opt(:alert_to, "Email address to send alerts to.", :default => '[email protected]')
  opt(:alert_from, "Email address to send alerts from", :default => "alert@#{local_hostname}")
  opt(:threshold, "Threshold to alert on percentage of disk space used", :default => 75)
end
# --kill: send SIGINT to the pid recorded in the pidfile, remove the pidfile,
# and exit. Exits with -1 when the pidfile is missing, 0 otherwise.
if opts[:kill]
  # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
  unless File.exist?(opts[:pid_file])
    puts "Pidfile doesn't exist, should be at #{opts[:pid_file]}."
    exit(-1)
  end

  pid = File.read(opts[:pid_file]).chomp.to_i
  puts "Attempting to stop diskcheck."
  begin
    if pid > 0
      Process.kill('INT', pid)
    else
      # An empty or corrupt pidfile yields 0 from to_i, and signalling pid 0
      # would hit this process's own process group instead of the daemon.
      puts "Pidfile does not contain a valid pid, not sending any signal."
    end
  rescue Errno::ESRCH
    puts "Process not found, are you sure it is still running?"
  ensure
    # The pidfile is removed whether or not the signal was delivered.
    File.delete(opts[:pid_file])
  end
  exit 0
end
# Entry point: with --daemonize, hand an endless check/sleep loop to Dante;
# without it, perform a single check in the foreground.
if opts[:daemonize]
  [
    "Daemonizing into the background.",
    "Pidfile is at #{opts[:pid_file]}.",
    "Logging to #{opts[:log_file]}"
  ].each { |message| puts message }

  dante_settings = {
    :daemonize => true,
    :pid_path => opts[:pid_file],
    :log_path => opts[:log_file]
  }
  Dante::Runner.new('diskcheck').execute(dante_settings) do
    loop do
      # A fresh Runner per iteration starts every cycle with empty results.
      DiskCheck::Runner.new(opts).run
      sleep(opts[:frequency])
    end
  end
else
  DiskCheck::Runner.new(opts).run
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment