Created
May 23, 2012 05:03
-
-
Save hoffrocket/2773364 to your computer and use it in GitHub Desktop.
raid0 ebs disk health monitor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import atexit | |
import datetime | |
from datetime import timedelta | |
import logging | |
import os | |
import smtplib | |
import subprocess as sub | |
import sys | |
import time | |
from multiprocessing import Process, Queue | |
from Queue import Empty | |
# Node name (hostname) of the machine running this monitor, i.e. the second
# field of os.uname().
HOST = os.uname()[1]
# lifted this from http://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
class StreamToLogger(object):
    """Write-only, file-like object that hands everything written to a logger.

    Meant to be assigned to ``sys.stdout`` / ``sys.stderr`` so that ordinary
    ``print`` output is emitted through the logging system instead.
    """

    def __init__(self, logger, log_level=logging.INFO):
        # ``logger`` only needs a ``log(level, message)`` method.
        self.log_level = log_level
        self.logger = logger
        # Kept from the original recipe; nothing in this class reads it.
        self.linebuf = ''

    def write(self, buf):
        """Log every line contained in ``buf`` at ``self.log_level``.

        Trailing whitespace is stripped from the text as a whole and from each
        individual line, so writing a bare newline produces no log record.
        """
        emit, level = self.logger.log, self.log_level
        for text in buf.rstrip().splitlines():
            emit(level, text.rstrip())

    def flush(self):
        """Do nothing: each write is passed to the logger immediately."""
def time_write(file_path, q):
    """Time a synced write to ``file_path`` and report the duration on ``q``.

    If the file is missing or shorter than ``chunks * buffer_size`` bytes it is
    (re)created at that size, filled with ``'0'`` bytes.  Otherwise a 4-byte
    marker is written at every ``buffer_size`` offset of the existing file.
    Either way the data is flushed and fsync'ed before the clock is stopped.

    Args:
        file_path: path of the probe file to write.
        q: object with a ``put()`` method (a ``multiprocessing.Queue`` when
            run from ``start_loop``); receives the elapsed time in whole
            milliseconds as an ``int``.
    """
    start_time = time.time()

    # raid0 chunk size smaller than 256kb
    buffer_size = 256 * 1024
    # 4 ebs volumes in the raid
    chunks = 4

    # Ask for the size directly instead of exists() + getsize(): one call,
    # and no window in which the file can vanish between the two.
    try:
        file_size = os.path.getsize(file_path)
    except OSError:
        file_size = 0

    if file_size < chunks * buffer_size:
        # file doesn't exist (or is too short), so we create it with all '0'
        block = b'0' * buffer_size
        with open(file_path, 'wb') as touchfile:
            for _ in range(chunks):
                touchfile.write(block)
            # fsync only covers what has reached the OS, so push Python's own
            # buffer out first; otherwise the timing can miss the real write.
            touchfile.flush()
            os.fsync(touchfile.fileno())
    else:
        # file exists, so we do sparse writes at every buffer_size offset
        marker = b'1111'
        with open(file_path, 'r+b') as touchfile:
            for i in range(chunks):
                touchfile.seek(i * buffer_size)
                touchfile.write(marker)
            touchfile.flush()
            os.fsync(touchfile.fileno())

    q.put(int((time.time() - start_time) * 1000))
def send_alert(subject, text):
    """Report an alert with the given ``subject`` and body ``text``.

    No delivery mechanism (mail, pager, ...) is wired up here.  The alert is
    written to the log at WARNING level so it is not silently dropped; replace
    the body with a real notification to actually tell someone.

    Args:
        subject: short one-line summary of the alert.
        text: full alert body.
    """
    # tell someone?
    logging.getLogger('ALERT').warning("%s\n%s", subject, text)
def touch(kill_path, subject, message):
    """Write ``message`` to the kill file and send an alert with iostat output.

    Both steps are best-effort: a failure to write the kill file is reported
    through ``send_alert`` and does not prevent the main alert from being
    sent, and a failure to run ``iostat`` is noted in the alert body instead
    of raising.

    Args:
        kill_path: path of the kill file to (over)write.
        subject: subject line for the alert.
        message: text stored in the kill file and used to start the alert body.
    """
    print("writing to killfile %s: %s" % (kill_path, message))
    try:
        with open(kill_path, 'w') as killfile:
            killfile.write(message)
    except Exception as e:
        # gracefully handle unwriteable kill file
        msg = "error writing to killfile %s" % kill_path
        print("%s: %s" % (msg, e))
        send_alert(msg, str(e))

    msg = message + "\n\n"
    # capture five seconds of iostat output
    try:
        p = sub.Popen(['iostat', '-xm', '1', '5'],
                      stdout=sub.PIPE, stderr=sub.PIPE,
                      # text output on Python 3 as well, not bytes reprs
                      universal_newlines=True)
        # communicate() waits for iostat to exit, so no terminate() is needed.
        output, errors = p.communicate()
        msg += "iostat:\n%s\n\n%s" % (output, errors)
    except Exception as e:
        errmsg = "couldn't get iostat"
        sys.stderr.write("%s: %s\n" % (errmsg, e))
        msg += errmsg + " " + str(e)
    send_alert(subject, msg)
def start_loop(kill_path, file_path, max_time_ms):
    """Probe disk write latency forever, managing the kill file accordingly.

    Roughly once a second a child process runs ``time_write`` on
    ``file_path``.  The kill file at ``kill_path`` is written via ``touch``
    when either

    * a single probe does not report back within the hard timeout
      (ten times ``max_time_ms``), or
    * the slow-write counter reaches ``timeout_count_trigger``; the counter is
      incremented for every probe slower than ``max_time_ms`` and decremented
      (never below zero) for every probe that is fast enough.

    A non-empty kill file is removed again, and a "cleared" alert sent, once
    the counter has dropped back to zero.  An empty kill file is never
    removed.  This function does not return.

    Args:
        kill_path: path of the kill file to write / remove.
        file_path: path of the probe file handed to ``time_write``.
        max_time_ms: soft limit for one probe, in milliseconds.
    """
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format='[%(asctime)s] %(message)s',
    )
    sys.stdout = StreamToLogger(logging.getLogger('STDOUT'), logging.INFO)
    sys.stderr = StreamToLogger(logging.getLogger('STDERR'), logging.ERROR)

    # 10x hard timeout in seconds. a single timeout of this length will write the kill file
    # Float division: with integer division (Python 2) any max_time_ms below
    # 100 would yield a timeout of 0 and make every probe a hard timeout.
    hard_timeout = (max_time_ms * 10) / 1000.0

    # keep track of timeouts we've seen. increment on timeout, decrement when ok
    timeout_count = 0

    # number of consecutive events to trigger state change
    timeout_count_trigger = 5

    # number of times killfile has been written
    killfile_count = 1

    # email subject for why touchfile was written
    touchfile_subject = ""

    should_touch = True

    # duration of the most recent completed probe; None until one completes
    elapsed = None

    while True:
        # if the killfile is empty, it was most likely placed there for a reason
        is_killed = os.path.exists(kill_path) and os.path.getsize(kill_path) > 0
        is_hard_timeout = False

        q = Queue()
        # use process library in order to gracefully handle hung IO operations
        # this was tested with fusehalt, don't change without testing
        p = Process(target=time_write, args=(file_path, q))
        try:
            p.start()
            elapsed = q.get(timeout=hard_timeout)
            p.join()
            if elapsed > max_time_ms:
                timeout_count += 1
            else:
                timeout_count = max(0, timeout_count - 1)
        except os.error as e:
            sys.stderr.write("%s\n" % (e,))
        except Empty:
            # move timeout_count all the way up so that it takes
            # trigger good periods to undo kill
            timeout_count = timeout_count_trigger
            is_hard_timeout = True
            p.terminate()

        if is_hard_timeout:
            if should_touch:
                touchfile_subject = "Killfile #%d written (reason: hard timeout breached)" % killfile_count
                message = "hard timeout of %g breached at %s" % (hard_timeout, time.ctime())
                touch(kill_path, touchfile_subject, message)
                should_touch = False
                killfile_count += 1
        elif timeout_count >= timeout_count_trigger:
            if should_touch:
                touchfile_subject = "Killfile #%d written: (reason: write exceeded timeout count threshold)" % killfile_count
                message = "timeout_count is %d. last write took %s ms at %s" % (timeout_count, str(elapsed), str(time.ctime()))
                touch(kill_path, touchfile_subject, message)
                should_touch = False
                killfile_count += 1
        elif is_killed and timeout_count == 0:
            print("removing kill file at %s" % time.ctime())
            try:
                os.remove(kill_path)
                should_touch = True
                message = "Disk cleared up at %s\n\n" % str(time.ctime())
                send_alert(touchfile_subject, message)
            except os.error as e:
                # gracefully handle a kill file that cannot be removed
                msg = "failed to remove killfile %s" % kill_path
                sys.stderr.write("%s: %s\n" % (msg, e))
                send_alert(msg, str(e))

        time.sleep(1)
def cleanup(killfile):
    """Remove ``killfile``, ignoring the case where it cannot be removed.

    Registered with ``atexit`` so the kill file is cleaned up when the
    monitor is gracefully killed.

    Args:
        killfile: path of the kill file to delete.
    """
    try:
        os.unlink(killfile)
    except OSError:
        # Missing or undeletable kill file: nothing useful to do at exit.
        pass
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: %s /pathto/killfile /dbpath/touchfile timeout_millis" % sys.argv[0])
        sys.exit(1)

    # Validate the timeout before touching the filesystem or registering the
    # exit hook, so a typo cannot create directories or delete a kill file.
    try:
        timeout_millis = int(sys.argv[3])
    except ValueError:
        print("Usage: %s /pathto/killfile /dbpath/touchfile timeout_millis" % sys.argv[0])
        sys.exit(1)

    # ensure that the killfile path and touchfile path exist
    for path in sys.argv[1:3]:
        directory = os.path.dirname(path)
        if not directory:
            # bare filename: lives in the current directory, nothing to create
            continue
        try:
            os.makedirs(directory)
        except OSError as e:
            # An already existing directory is the normal case; only report
            # when the directory is really unavailable.
            if not os.path.isdir(directory):
                sys.stderr.write("could not create directory %s: %s\n" % (directory, e))

    atexit.register(cleanup, sys.argv[1])
    start_loop(sys.argv[1], sys.argv[2], timeout_millis)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment