Skip to content

Instantly share code, notes, and snippets.

@hoffrocket
Created May 23, 2012 05:03
Show Gist options
  • Save hoffrocket/2773364 to your computer and use it in GitHub Desktop.
Save hoffrocket/2773364 to your computer and use it in GitHub Desktop.
raid0 ebs disk health monitor
#!/usr/bin/env python
import atexit
import datetime
from datetime import timedelta
import logging
import os
import smtplib
import subprocess as sub
import sys
import time
from multiprocessing import Process, Queue
from Queue import Empty
# Node (host) name of this machine: the second field of os.uname().
# NOTE(review): nothing in the code shown here reads HOST -- presumably it was
# meant for alert messages; confirm before removing.
HOST = os.uname()[1]
# lifted this from http://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
class StreamToLogger(object):
    """
    Write-only stand-in for a file object that forwards text to a logger.

    Each call to ``write`` is split into lines and every line becomes one
    log record at the configured level.
    """

    def __init__(self, logger, log_level=logging.INFO):
        # Destination logger and the level used for every forwarded line.
        self.logger, self.log_level = logger, log_level
        # Not read anywhere in this class; kept so the attribute still exists.
        self.linebuf = ''

    def write(self, buf):
        """Log every line of *buf*, with trailing whitespace removed."""
        trimmed = buf.rstrip()
        for chunk in trimmed.splitlines():
            self.logger.log(self.log_level, chunk.rstrip())

    def flush(self):
        """Do nothing; provided because callers of a stream expect it."""
        return None
def time_write(file_path, q):
    """Time a set of synced writes to *file_path* and report the duration.

    If the file is missing or smaller than ``chunks * buffer_size`` bytes it
    is (re)created at that size, filled with ``'0'`` bytes.  Otherwise four
    bytes are overwritten at every ``buffer_size`` offset.  In both cases the
    data is flushed and fsync'ed before the clock is stopped.

    Args:
        file_path: path of the probe file to write.
        q: queue-like object; the elapsed time in whole milliseconds is
            passed to ``q.put``.
    """
    start_time = time.time()
    # raid0 chunk size smaller than 256kb
    buffer_size = 256 * 1024
    # 4 ebs volumes in the raid
    chunks = 4

    try:
        file_size = os.path.getsize(file_path)
    except OSError:
        # a missing (or unreadable) file is treated as empty
        file_size = 0

    if file_size < (chunks * buffer_size):
        # file missing or too small, so we (re)create it filled with '0'
        with open(file_path, 'wb') as touchfile:
            block = b'0' * buffer_size
            for _ in range(chunks):
                touchfile.write(block)
            # flush Python's own buffer first: fsync only covers data that
            # has already been handed to the operating system
            touchfile.flush()
            os.fsync(touchfile.fileno())
    else:
        # file exists, so we do sparse writes at every buffer_size offset
        with open(file_path, 'r+b') as touchfile:
            marker = b'1111'
            for index in range(chunks):
                touchfile.seek(index * buffer_size)
                touchfile.write(marker)
            touchfile.flush()
            os.fsync(touchfile.fileno())

    q.put(int((time.time() - start_time) * 1000))
def send_alert(subject, text):
    """Report an alert with the given subject and body.

    No delivery mechanism is configured in this file, so the alert is only
    written to the log at WARNING level.  Replace the body with real
    notification code (e-mail, pager, ...) as needed.

    Args:
        subject: short one-line summary of the alert.
        text: full alert body.
    """
    # tell someone?
    logging.warning("ALERT %s\n%s", subject, text)
def touch(kill_path, subject, message):
    """Write the kill file, then send an alert that includes iostat output.

    A failure to write the kill file is reported through ``send_alert`` but
    does not prevent the main alert from being sent.

    Args:
        kill_path: path of the kill file to (over)write with *message*.
        subject: subject line for the alert.
        message: reason text; written to the kill file and used as the
            first part of the alert body.
    """
    print("writing to killfile %s: %s" % (kill_path, message))
    try:
        with open(kill_path, 'w') as killfile:
            killfile.write(message)
    except Exception as e:
        # gracefully handle unwriteable kill file (deliberately best-effort)
        msg = "error writing to killfile %s" % kill_path
        print("%s: %s" % (msg, e))
        send_alert(msg, str(e))

    msg = message + "\n\n"
    # capture five seconds of iostat output
    try:
        p = sub.Popen(
            ['iostat', '-xm', '1', '5'],
            stdout=sub.PIPE,
            stderr=sub.PIPE,
            # get text rather than bytes so the %s formatting below is clean
            universal_newlines=True,
        )
        # communicate() waits for the process to exit, so no terminate needed
        output, errors = p.communicate()
        msg += "iostat:\n%s\n\n%s" % (output, errors)
    except Exception as e:
        # iostat missing or failing must not stop the alert from going out
        errmsg = "couldn't get iostat"
        sys.stderr.write("%s: %s\n" % (errmsg, e))
        msg += errmsg + " " + str(e)
    send_alert(subject, msg)
def start_loop(kill_path, file_path, max_time_ms):
    """Run the monitor loop forever, writing and removing the kill file.

    Once per second a child process runs ``time_write`` against *file_path*.
    A write slower than *max_time_ms* increments a counter and a faster one
    decrements it (never below zero).  The kill file is written through
    ``touch`` when the counter reaches the trigger value, or immediately when
    a single write fails to report within ten times *max_time_ms*.  A
    non-empty kill file is removed again once the counter is back at zero.

    Args:
        kill_path: path of the kill file to manage.
        file_path: path of the probe file passed to ``time_write``.
        max_time_ms: write duration in milliseconds above which a write
            counts as slow.
    """
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format='[%(asctime)s] %(message)s',
    )
    sys.stdout = StreamToLogger(logging.getLogger('STDOUT'), logging.INFO)
    sys.stderr = StreamToLogger(logging.getLogger('STDERR'), logging.ERROR)

    # 10x hard timeout in seconds. a single timeout of this length will write the kill file
    # (float division: integer division would turn any max_time_ms below 100
    # into a zero-second timeout, making every iteration a hard timeout)
    hard_timeout = (max_time_ms * 10) / 1000.0
    # keep track of timeouts we've seen. increment on timeout, decrement when ok
    timeout_count = 0
    # number of consecutive events to trigger state change
    timeout_count_trigger = 5
    # number of times killfile has been written
    killfile_count = 1
    # email subject for why touchfile was written
    touchfile_subject = ""
    should_touch = True
    # duration of the most recent completed write; None until one completes
    elapsed = None

    while True:
        # if the killfile is empty, it was most likely placed there for a reason
        is_killed = os.path.exists(kill_path) and os.path.getsize(kill_path) > 0
        is_hard_timeout = False
        q = Queue()
        # use process library in order to gracefully handle hung IO operations
        # this was tested with fusehalt, don't change without testing
        p = Process(target=time_write, args=(file_path, q))
        try:
            p.start()
            elapsed = q.get(timeout=hard_timeout)
            p.join()
            if elapsed > max_time_ms:
                timeout_count += 1
            else:
                timeout_count = max(0, timeout_count - 1)
        except os.error as e:
            sys.stderr.write("%s\n" % e)
        except Empty:
            # move timeout_count all the way up so that it takes
            # trigger good periods to undo kill
            timeout_count = timeout_count_trigger
            is_hard_timeout = True
            p.terminate()

        if is_hard_timeout:
            if should_touch:
                touchfile_subject = "Killfile #%d written (reason: hard timeout breached)" % killfile_count
                message = "hard timeout of %g breached at %s" % (hard_timeout, time.ctime())
                touch(kill_path, touchfile_subject, message)
                should_touch = False
                killfile_count += 1
        elif timeout_count >= timeout_count_trigger:
            if should_touch:
                touchfile_subject = "Killfile #%d written: (reason: write exceeded timeout count threshold)" % killfile_count
                message = "timeout_count is %d. last write took %s ms at %s" % (timeout_count, str(elapsed), str(time.ctime()))
                touch(kill_path, touchfile_subject, message)
                should_touch = False
                killfile_count += 1
        elif is_killed and timeout_count == 0:
            print("removing kill file at %s" % time.ctime())
            try:
                os.remove(kill_path)
                should_touch = True
                message = "Disk cleared up at %s\n\n" % str(time.ctime())
                send_alert(touchfile_subject, message)
            except os.error as e:
                # gracefully handle a kill file that cannot be removed
                msg = "failed to remove killfile %s" % kill_path
                sys.stderr.write("%s: %s\n" % (msg, e))
                send_alert(msg, str(e))
        time.sleep(1)
def cleanup(killfile):
    """Delete *killfile*, ignoring the error if it cannot be removed.

    Args:
        killfile: path of the kill file to delete.
    """
    # clean up the kill file if we're gracefully killed
    try:
        os.unlink(killfile)
    except OSError:
        # file already gone or not removable; nothing more to do
        pass
if __name__ == '__main__':
    # Entry point: expects killfile path, touchfile path and timeout in ms.
    if len(sys.argv) != 4:
        print("Usage: %s /pathto/killfile /dbpath/touchfile timeout_millis" % sys.argv[0])
        sys.exit(1)
    kill_path, touch_path = sys.argv[1], sys.argv[2]
    try:
        max_time_ms = int(sys.argv[3])
    except ValueError:
        # fail with a clear message before touching the filesystem
        print("timeout_millis must be an integer, got %r" % sys.argv[3])
        sys.exit(1)
    # ensure that the killfile path and touchfile path exist
    for path in (kill_path, touch_path):
        parent = os.path.dirname(path)
        if not parent:
            # bare filename: lives in the current directory, nothing to create
            continue
        try:
            os.makedirs(parent)
        except OSError:
            # usually "already exists"; any real problem surfaces when the
            # file itself is opened later
            pass
    atexit.register(cleanup, kill_path)
    start_loop(kill_path, touch_path, max_time_ms)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment