hoffrocket · May 23, 2012 05:03
diff --git a/ebsdiskhealth.py b/ebsdiskhealth.py
 #!/usr/bin/env python

 import atexit
 import datetime
 from datetime import timedelta
 import logging
 import os
 import smtplib
 import subprocess as sub
 import sys
 import time
 from multiprocessing import Process, Queue
 from Queue import Empty

 HOST = os.uname()[1]

 # lifted this from http://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
 class StreamToLogger(object):
  """
  Write-only stand-in for a file object that forwards text to a logger.

  Each chunk handed to write() is split into lines and every line is
  emitted on `logger` at `log_level`.
  """
  def __init__(self, logger, log_level=logging.INFO):
    # linebuf is initialised here but nothing in this class reads it
    self.linebuf = ''
    self.log_level = log_level
    self.logger = logger

  def write(self, buf):
    """Log every line of buf with trailing whitespace removed."""
    trimmed = buf.rstrip()
    for raw_line in trimmed.splitlines():
      self.logger.log(self.log_level, raw_line.rstrip())

  def flush(self):
    """Do nothing; write() forwards each line immediately."""
    return None


 def time_write(file_path, q, buffer_size=256 * 1024, chunks=4):
  """
  Time a flushed-and-synced write to file_path and put the result on q.

  When the file is missing or shorter than chunks * buffer_size bytes it
  is rewritten from scratch as chunks * buffer_size '0' bytes.  Otherwise
  the four bytes '1111' are written at the start of each buffer_size
  chunk, leaving the rest of the file alone.

  file_path   -- file to write
  q           -- receives the elapsed time in whole milliseconds via q.put()
  buffer_size -- bytes per chunk; the default follows the original note
                 that the raid0 chunk size is smaller than 256kb
  chunks      -- chunks to touch; the default follows the original note
                 of 4 ebs volumes in the raid
  """
  start_time = time.time()

  try:
    file_size = os.path.getsize(file_path)
  except OSError:
    # no such file (or it cannot be stat'ed): treat it as empty so it is created
    file_size = 0

  if file_size < (chunks * buffer_size):
    # file is missing or too short, so we (re)create it filled with '0'
    with open(file_path, 'wb') as touchfile:
      payload = b'0' * buffer_size
      for _ in range(chunks):
        touchfile.write(payload)
      # fsync only covers data already handed to the OS, so drain
      # Python's own buffer first
      touchfile.flush()
      os.fsync(touchfile.fileno())
  else:
    # file exists, so we do sparse writes at every buffer_size offset
    with open(file_path, 'r+b') as touchfile:
      marker = b'1111'
      for i in range(chunks):
        touchfile.seek(i * buffer_size)
        touchfile.write(marker)
      # same as above: flush before fsync so the last marker is synced too
      touchfile.flush()
      os.fsync(touchfile.fileno())

  q.put(int((time.time() - start_time) * 1000))

 def send_alert(subject, text):
  """
  Notification hook for disk health events.

  No delivery mechanism is implemented in this function, so the alert is
  recorded through the logging module instead of being dropped.

  subject -- one-line summary of the event
  text    -- alert body with the details
  """
  # tell someone?
  logging.getLogger('ALERT').warning("%s\n%s", subject, text)

 def touch(kill_path, subject, message):
  """
  Write message into the kill file, then alert with iostat output attached.

  Failing to write the kill file is printed and alerted on but does not
  stop the function.  Failing to run iostat only adds a note to the
  alert body.

  kill_path -- kill file to (over)write
  subject   -- subject handed to send_alert
  message   -- kill file contents; also the start of the alert body
  """
  print("writing to killfile %s: %s" % (kill_path, message))
  try:
    with open(kill_path, 'w') as killfile:
      killfile.write(message)
  except Exception as e:
    # gracefully handle unwriteable kill file
    msg = "error writing to killfile %s" % kill_path
    print(msg, e)
    send_alert(msg, str(e))

  msg = message + "\n\n"
  # capture five seconds of iostat output
  try:
    p = sub.Popen(['iostat', '-xm', '1', '5'], stdout=sub.PIPE, stderr=sub.PIPE)
    # communicate() only returns once iostat has exited, so there is
    # never a live process left to terminate afterwards
    output, errors = p.communicate()
    msg += "iostat:\n%s\n\n%s" % (output, errors)
  except Exception as e:
    errmsg = "couldn't get iostat"
    # written with write() because the former `print >> sys.stderr (...)`
    # lacked a comma, called sys.stderr and raised TypeError right here
    sys.stderr.write("%s: %s\n" % (errmsg, e))
    msg += errmsg + " " + str(e)

  send_alert(subject, msg)

 def start_loop(kill_path, file_path, max_time_ms):
  """
  Monitor disk write latency forever, maintaining a kill file.

  Each pass runs time_write(file_path, ...) in a child process, then
  sleeps one second.  A write slower than max_time_ms raises
  timeout_count by one; a faster write lowers it by one (never below
  zero).  The kill file is written through touch() when timeout_count
  reaches timeout_count_trigger, or right away when the child reports
  nothing within the hard timeout (ten times max_time_ms).  Once
  timeout_count is back at zero, a non-empty kill file is removed and a
  "cleared up" alert is sent.

  kill_path   -- kill file to write / remove
  file_path   -- file handed to time_write for the timed writes
  max_time_ms -- write duration, in milliseconds, above which a write
                 counts as a timeout

  Does not return.  Replaces sys.stdout and sys.stderr with
  StreamToLogger instances.
  """
  # setup logging
  logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(message)s',
  )

  # from here on print output and error output go through logging
  sys.stdout = StreamToLogger(logging.getLogger('STDOUT'), logging.INFO)
  sys.stderr = StreamToLogger(logging.getLogger('STDERR'), logging.ERROR)

  # 10x hard timeout in seconds.  a single timeout of this length will write the kill file.
  # float division: integer division turned any max_time_ms below 100 into
  # a timeout of 0, which made every single probe count as a hard timeout
  hard_timeout = (max_time_ms * 10) / 1000.0
  # keep track of timeouts we've seen.  increment on timeout, decrement when ok
  timeout_count = 0
  # number of consecutive events to trigger state change
  timeout_count_trigger = 5
  # number of times killfile has been written
  killfile_count = 1

  # email subject for why touchfile was written
  touchfile_subject = ""

  should_touch = True

  # duration of the most recent completed write; None until one completes
  elapsed = None

  while True:
    # only a non-empty killfile counts as killed: if the killfile is empty,
    # it was most likely placed there for a reason
    is_killed = os.path.exists(kill_path) and os.path.getsize(kill_path) > 0
    is_hard_timeout = False
    q = Queue()
    # use process library in order to gracefully handle hung IO operations
    # this was tested with fusehalt, don't change without testing
    p = Process(target=time_write, args=(file_path,q))
    try:
      p.start()
      elapsed = q.get(timeout=hard_timeout)
      p.join()
      if elapsed > max_time_ms:
        timeout_count += 1
      else:
        timeout_count = max(0, timeout_count - 1)
    except os.error as e:
      # timeout_count is left untouched for this pass
      sys.stderr.write("%s\n" % e)
    except Empty:
      # move timeout_count all the way up so that it takes
      # trigger good periods to undo kill
      timeout_count = timeout_count_trigger
      is_hard_timeout = True
      p.terminate()

    if is_hard_timeout:
      if should_touch:
        touchfile_subject = "Killfile #%d written (reason: hard timeout breached)" % killfile_count
        # %g keeps whole-second timeouts printing as before (e.g. "15")
        message = "hard timeout of %g breached at %s" % (hard_timeout, time.ctime())
        touch(kill_path, touchfile_subject, message)
        should_touch = False
        killfile_count += 1
    elif timeout_count >= timeout_count_trigger:
      if should_touch:
        touchfile_subject = "Killfile #%d written: (reason: write exceeded timeout count threshold)" % killfile_count
        message = "timeout_count is %d. last write took %s ms at %s" % (timeout_count, str(elapsed), str(time.ctime()))
        touch(kill_path, touchfile_subject, message)
        should_touch = False
        killfile_count += 1
    elif is_killed and timeout_count == 0:
      print("removing kill file at %s" % time.ctime())
      try:
        os.remove(kill_path)
        should_touch = True
        message = "Disk cleared up at %s\n\n" % str(time.ctime())
        # reuse the subject of the alert that announced the kill
        send_alert(touchfile_subject, message)
      except os.error as e:
        # gracefully handle a kill file that cannot be removed
        msg = "failed to remove killfile %s" % kill_path
        sys.stderr.write("%s: %s\n" % (msg, e))
        send_alert(msg, str(e))

    time.sleep(1)

 def cleanup(killfile):
  """
  Best-effort removal of the kill file.

  killfile -- path to remove; a missing or unremovable file is ignored
  """
  # clean up the kill file if we're gracefully killed
  try:
    os.unlink(killfile)
  except OSError:
    # nothing to remove, or not removable: stay quiet, this is best effort
    pass

 if __name__ == '__main__':
  # usage: killfile path, touchfile path, slow-write threshold in milliseconds
  usage = "Usage: %s /pathto/killfile /dbpath/touchfile timeout_millis" % sys.argv[0]
  if len(sys.argv) != 4:
    print(usage)
    sys.exit(1)
  try:
    max_time_ms = int(sys.argv[3])
  except ValueError:
    # a non-numeric timeout gets the usage text instead of a traceback
    print(usage)
    sys.exit(1)
  # ensure that the killfile path and touchfile path exist
  for arg_index in (1, 2):
    parent = os.path.dirname(sys.argv[arg_index])
    # a bare filename has no parent directory to create
    if parent:
      try:
        os.makedirs(parent)
      except OSError:
        # typically the directory already exists; other failures are left
        # to show up when the files themselves are opened
        pass
  atexit.register(cleanup, sys.argv[1])
  start_loop(sys.argv[1], sys.argv[2], max_time_ms)
	#!/usr/bin/env python

	import atexit
	import datetime
	from datetime import timedelta
	import logging
	import os
	import smtplib
	import subprocess as sub
	import sys
	import time
	from multiprocessing import Process, Queue
	from Queue import Empty

	HOST = os.uname()[1]

	# lifted this from http://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
	class StreamToLogger(object):
	  """
	  Write-only stand-in for a file object that forwards text to a logger.

	  Each chunk handed to write() is split into lines and every line is
	  emitted on `logger` at `log_level`.
	  """
	  def __init__(self, logger, log_level=logging.INFO):
	    # linebuf is initialised here but nothing in this class reads it
	    self.linebuf = ''
	    self.log_level = log_level
	    self.logger = logger

	  def write(self, buf):
	    """Log every line of buf with trailing whitespace removed."""
	    trimmed = buf.rstrip()
	    for raw_line in trimmed.splitlines():
	      self.logger.log(self.log_level, raw_line.rstrip())

	  def flush(self):
	    """Do nothing; write() forwards each line immediately."""
	    return None


	def time_write(file_path, q, buffer_size=256 * 1024, chunks=4):
	  """
	  Time a flushed-and-synced write to file_path and put the result on q.

	  When the file is missing or shorter than chunks * buffer_size bytes it
	  is rewritten from scratch as chunks * buffer_size '0' bytes.  Otherwise
	  the four bytes '1111' are written at the start of each buffer_size
	  chunk, leaving the rest of the file alone.

	  file_path   -- file to write
	  q           -- receives the elapsed time in whole milliseconds via q.put()
	  buffer_size -- bytes per chunk; the default follows the original note
	                 that the raid0 chunk size is smaller than 256kb
	  chunks      -- chunks to touch; the default follows the original note
	                 of 4 ebs volumes in the raid
	  """
	  start_time = time.time()

	  try:
	    file_size = os.path.getsize(file_path)
	  except OSError:
	    # no such file (or it cannot be stat'ed): treat it as empty so it is created
	    file_size = 0

	  if file_size < (chunks * buffer_size):
	    # file is missing or too short, so we (re)create it filled with '0'
	    with open(file_path, 'wb') as touchfile:
	      payload = b'0' * buffer_size
	      for _ in range(chunks):
	        touchfile.write(payload)
	      # fsync only covers data already handed to the OS, so drain
	      # Python's own buffer first
	      touchfile.flush()
	      os.fsync(touchfile.fileno())
	  else:
	    # file exists, so we do sparse writes at every buffer_size offset
	    with open(file_path, 'r+b') as touchfile:
	      marker = b'1111'
	      for i in range(chunks):
	        touchfile.seek(i * buffer_size)
	        touchfile.write(marker)
	      # same as above: flush before fsync so the last marker is synced too
	      touchfile.flush()
	      os.fsync(touchfile.fileno())

	  q.put(int((time.time() - start_time) * 1000))

	def send_alert(subject, text):
	  """
	  Notification hook for disk health events.

	  No delivery mechanism is implemented in this function, so the alert is
	  recorded through the logging module instead of being dropped.

	  subject -- one-line summary of the event
	  text    -- alert body with the details
	  """
	  # tell someone?
	  logging.getLogger('ALERT').warning("%s\n%s", subject, text)

	def touch(kill_path, subject, message):
	  """
	  Write message into the kill file, then alert with iostat output attached.

	  Failing to write the kill file is printed and alerted on but does not
	  stop the function.  Failing to run iostat only adds a note to the
	  alert body.

	  kill_path -- kill file to (over)write
	  subject   -- subject handed to send_alert
	  message   -- kill file contents; also the start of the alert body
	  """
	  print("writing to killfile %s: %s" % (kill_path, message))
	  try:
	    with open(kill_path, 'w') as killfile:
	      killfile.write(message)
	  except Exception as e:
	    # gracefully handle unwriteable kill file
	    msg = "error writing to killfile %s" % kill_path
	    print(msg, e)
	    send_alert(msg, str(e))

	  msg = message + "\n\n"
	  # capture five seconds of iostat output
	  try:
	    p = sub.Popen(['iostat', '-xm', '1', '5'], stdout=sub.PIPE, stderr=sub.PIPE)
	    # communicate() only returns once iostat has exited, so there is
	    # never a live process left to terminate afterwards
	    output, errors = p.communicate()
	    msg += "iostat:\n%s\n\n%s" % (output, errors)
	  except Exception as e:
	    errmsg = "couldn't get iostat"
	    # written with write() because the former `print >> sys.stderr (...)`
	    # lacked a comma, called sys.stderr and raised TypeError right here
	    sys.stderr.write("%s: %s\n" % (errmsg, e))
	    msg += errmsg + " " + str(e)

	  send_alert(subject, msg)

	def start_loop(kill_path, file_path, max_time_ms):
	  """
	  Monitor disk write latency forever, maintaining a kill file.

	  Each pass runs time_write(file_path, ...) in a child process, then
	  sleeps one second.  A write slower than max_time_ms raises
	  timeout_count by one; a faster write lowers it by one (never below
	  zero).  The kill file is written through touch() when timeout_count
	  reaches timeout_count_trigger, or right away when the child reports
	  nothing within the hard timeout (ten times max_time_ms).  Once
	  timeout_count is back at zero, a non-empty kill file is removed and a
	  "cleared up" alert is sent.

	  kill_path   -- kill file to write / remove
	  file_path   -- file handed to time_write for the timed writes
	  max_time_ms -- write duration, in milliseconds, above which a write
	                 counts as a timeout

	  Does not return.  Replaces sys.stdout and sys.stderr with
	  StreamToLogger instances.
	  """
	  # setup logging
	  logging.basicConfig(
	    level=logging.DEBUG,
	    format='[%(asctime)s] %(message)s',
	  )

	  # from here on print output and error output go through logging
	  sys.stdout = StreamToLogger(logging.getLogger('STDOUT'), logging.INFO)
	  sys.stderr = StreamToLogger(logging.getLogger('STDERR'), logging.ERROR)

	  # 10x hard timeout in seconds. a single timeout of this length will write the kill file.
	  # float division: integer division turned any max_time_ms below 100 into
	  # a timeout of 0, which made every single probe count as a hard timeout
	  hard_timeout = (max_time_ms * 10) / 1000.0
	  # keep track of timeouts we've seen. increment on timeout, decrement when ok
	  timeout_count = 0
	  # number of consecutive events to trigger state change
	  timeout_count_trigger = 5
	  # number of times killfile has been written
	  killfile_count = 1

	  # email subject for why touchfile was written
	  touchfile_subject = ""

	  should_touch = True

	  # duration of the most recent completed write; None until one completes
	  elapsed = None

	  while True:
	    # only a non-empty killfile counts as killed: if the killfile is empty,
	    # it was most likely placed there for a reason
	    is_killed = os.path.exists(kill_path) and os.path.getsize(kill_path) > 0
	    is_hard_timeout = False
	    q = Queue()
	    # use process library in order to gracefully handle hung IO operations
	    # this was tested with fusehalt, don't change without testing
	    p = Process(target=time_write, args=(file_path,q))
	    try:
	      p.start()
	      elapsed = q.get(timeout=hard_timeout)
	      p.join()
	      if elapsed > max_time_ms:
	        timeout_count += 1
	      else:
	        timeout_count = max(0, timeout_count - 1)
	    except os.error as e:
	      # timeout_count is left untouched for this pass
	      sys.stderr.write("%s\n" % e)
	    except Empty:
	      # move timeout_count all the way up so that it takes
	      # trigger good periods to undo kill
	      timeout_count = timeout_count_trigger
	      is_hard_timeout = True
	      p.terminate()

	    if is_hard_timeout:
	      if should_touch:
	        touchfile_subject = "Killfile #%d written (reason: hard timeout breached)" % killfile_count
	        # %g keeps whole-second timeouts printing as before (e.g. "15")
	        message = "hard timeout of %g breached at %s" % (hard_timeout, time.ctime())
	        touch(kill_path, touchfile_subject, message)
	        should_touch = False
	        killfile_count += 1
	    elif timeout_count >= timeout_count_trigger:
	      if should_touch:
	        touchfile_subject = "Killfile #%d written: (reason: write exceeded timeout count threshold)" % killfile_count
	        message = "timeout_count is %d. last write took %s ms at %s" % (timeout_count, str(elapsed), str(time.ctime()))
	        touch(kill_path, touchfile_subject, message)
	        should_touch = False
	        killfile_count += 1
	    elif is_killed and timeout_count == 0:
	      print("removing kill file at %s" % time.ctime())
	      try:
	        os.remove(kill_path)
	        should_touch = True
	        message = "Disk cleared up at %s\n\n" % str(time.ctime())
	        # reuse the subject of the alert that announced the kill
	        send_alert(touchfile_subject, message)
	      except os.error as e:
	        # gracefully handle a kill file that cannot be removed
	        msg = "failed to remove killfile %s" % kill_path
	        sys.stderr.write("%s: %s\n" % (msg, e))
	        send_alert(msg, str(e))

	    time.sleep(1)

	def cleanup(killfile):
	  """
	  Best-effort removal of the kill file.

	  killfile -- path to remove; a missing or unremovable file is ignored
	  """
	  # clean up the kill file if we're gracefully killed
	  try:
	    os.unlink(killfile)
	  except OSError:
	    # nothing to remove, or not removable: stay quiet, this is best effort
	    pass

	if __name__ == '__main__':
	  # usage: killfile path, touchfile path, slow-write threshold in milliseconds
	  usage = "Usage: %s /pathto/killfile /dbpath/touchfile timeout_millis" % sys.argv[0]
	  if len(sys.argv) != 4:
	    print(usage)
	    sys.exit(1)
	  try:
	    max_time_ms = int(sys.argv[3])
	  except ValueError:
	    # a non-numeric timeout gets the usage text instead of a traceback
	    print(usage)
	    sys.exit(1)
	  # ensure that the killfile path and touchfile path exist
	  for arg_index in (1, 2):
	    parent = os.path.dirname(sys.argv[arg_index])
	    # a bare filename has no parent directory to create
	    if parent:
	      try:
	        os.makedirs(parent)
	      except OSError:
	        # typically the directory already exists; other failures are left
	        # to show up when the files themselves are opened
	        pass
	  atexit.register(cleanup, sys.argv[1])
	  start_loop(sys.argv[1], sys.argv[2], max_time_ms)