A quick script to download data on Google App Engine datastore downtime.
# this file is from the google-chartwrapper project (http://code.google.com/p/google-chartwrapper/)

coding = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
ecoding = coding + '-.'

codeset = {
    'simple': {
        'coding': coding,
        'max_value': 61,
        'char': ',',
        'dchar': '',
        'none': '_',
        'value': lambda x: coding[x]
    },
    'text': {
        'coding': '',
        'max_value': 100,
        'none': '-1',
        'char': '|',
        'dchar': ',',
        'value': lambda x: '%.1f' % float(x)
    },
    'extended': {
        'coding': ecoding,
        'max_value': 4095,
        'none': '__',
        'dchar': '',
        'char': ',',
        'value': lambda x: '%s%s' % \
            (ecoding[int(float(x) / 64)], ecoding[int(x % 64)])
    }
}
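
# A quick illustration (added here, not part of the upstream module) of what the
# per-encoding 'value' callables produce:
#   codeset['simple']['value'](0)      -> 'A'
#   codeset['text']['value'](42)       -> '42.0'
#   codeset['extended']['value'](4095) -> '..'   (two characters per value)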

class Encoder:
    """Data encoder that handles simple, text, and extended encodings
    Based on javascript encoding algorithm and pygooglecharts"""

    def __init__(self, encoding=None, scale=None, series=''):
        self.series = series or ''
        if encoding is None:
            encoding = 'text'
        assert encoding in ('simple', 'text', 'extended'), \
            'Unknown encoding: %s' % encoding
        self.encoding = encoding
        self.scale = scale
        self.codeset = codeset[encoding]

    def scalevalue(self, value):
        return value  # scaling is disabled for now ("one day..."); the code below is never reached
        if self.encoding != 'text' and self.scale and \
                (isinstance(value, int) or isinstance(value, float)):
            if type(self.scale) == type(()):
                lower, upper = self.scale
            else:
                lower, upper = 0, float(self.scale)
            value = int(round(float(value - lower) * \
                self.codeset['max_value'] / upper))
        return min(value, self.codeset['max_value'])
    def encode(self, *args, **kwargs):
        """Encode wrapper for a dataset with maximum value
        Datasets can be one or two dimensional
        Strings are ignored as ordinal encoding"""
        if isinstance(args[0], str):
            return self.encode([args[0]], **kwargs)
        elif isinstance(args[0], int) or isinstance(args[0], float):
            return self.encode([[args[0]]], **kwargs)
        if len(args) > 1:
            dataset = args
        else:
            dataset = args[0]
        typemap = list(map(type, dataset))
        code = self.encoding[0]
        if type('') in typemap:
            data = ','.join(map(str, dataset))
        elif type([]) in typemap or type(()) in typemap:
            data = self.codeset['char'].join(map(self.encodedata, dataset))
        elif len(dataset) == 1 and hasattr(dataset[0], '__iter__'):
            data = self.encodedata(dataset[0])
        else:
            try:
                data = self.encodedata(dataset)
            except ValueError:
                data = self.encodedata(','.join(map(unicode, dataset)))
        if not '.' in data and code == 't':
            code = 'e'
        return '%s%s:%s' % (code, self.series, data)
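
    # Examples of the dataset shapes accepted (an added note, not in the
    # upstream source); with Encoder('extended') and an empty series label:
    #   encode([1, 2])           -> 'e:ABAC'        (one-dimensional dataset)
    #   encode([[1, 2], [3, 4]]) -> 'e:ABAC,ADAE'   (two-dimensional dataset)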
    def encodedata(self, data):
        sub_data = []
        enc_size = len(self.codeset['coding'])
        for value in data:
            if value in (None, 'None'):
                sub_data.append(self.codeset['none'])
            elif isinstance(value, str):
                sub_data.append(value)
            elif value >= -1:
                try:
                    sub_data.append(self.codeset['value'](self.scalevalue(value)))
                except:
                    raise ValueError('cannot encode value: %s' % value)
        return self.codeset['dchar'].join(sub_data)
    def decode(self, astr):
        e = astr[0]
        dec_data = []
        for data in astr[2:].split(self.codeset['char']):
            sub_data = []
            if e == 't':
                sub_data.extend(map(float, data.split(',')))
            elif e == 'e':
                flag = 0
                index = self.codeset['coding'].index
                for i in range(len(data)):
                    if not flag:
                        this, next = index(data[i]), index(data[i + 1])
                        flag = 1
                        sub_data.append((64 * this) + next)
                    else:
                        flag = 0
            elif e == 's':
                sub_data.extend(map(self.codeset['coding'].index, data))
            dec_data.append(sub_data)
        return dec_data
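
# A minimal, hedged usage sketch (added for illustration; not part of the
# upstream module). It round-trips a small extended-encoded series.
if __name__ == '__main__':
    enc = Encoder('extended')
    encoded = enc.encode([0, 64, 4095])
    assert encoded == 'e:AABA..'
    assert enc.decode(encoded) == [[0, 64, 4095]]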

# ---- second gist file: the downtime script (imports the Encoder module above, saved as encoding.py) ----

import datetime
import gzip
import re
import StringIO
import urllib2

from encoding import Encoder  # from google-chartwrapper
from numpy import numarray
import pylab

DATASTORE_STATUS_URL = 'http://code.google.com/status/appengine/detail/datastore/%s'
DATE_FORMAT = '%Y/%m/%d'
ENC = Encoder('extended')
#ERROR_RATES = ('get', 'put', 'update', 'delete', 'query')
ERROR_RATES = ('put', 'update', 'delete')
NUM_BINS = 24 * 4  # bins are 15 minutes wide
RAW_MAX = 4095
X_MAX = 24.0
Y_MAX = 100.0
Y_THRESH = 0.0

RE_CHD = re.compile(r'&chd=([^&]*)')
def extract_data_string_from_url(url):
    m = RE_CHD.search(url)
    if m:
        return m.group(1)
    else:
        raise Exception("url does not contain any data - missing query parameter 'chd'")
RESTR_SET = r'[^,]*'
RE_SECOND_SET = re.compile(r'e:%s,%s' % (RESTR_SET, RESTR_SET))

def data_string_to_scaled_data(data_str):
    m = RE_SECOND_SET.search(data_str)
    if not m:
        raise Exception("data string does not contain data in the expected format")
    edata = m.group(0)
    if not edata:
        return []  # no downtime if there is only one data set
    xraw, yraw = ENC.decode(edata)
    xscl = [x * X_MAX / RAW_MAX for x in xraw]
    yscl = [y * Y_MAX / RAW_MAX for y in yraw]
    return zip(xscl, yscl)
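
# A worked example (added; the data string is made up): the extended-encoded
# pair 'e:AA..,AA..' decodes to xraw=[0, 4095] and yraw=[0, 4095], which scale to
#   data_string_to_scaled_data('e:AA..,AA..') -> [(0.0, 0.0), (24.0, 100.0)]
# i.e. hour of the day on x and an error-rate percentage on y.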
def bin_data(data_scaled, y_thresh=Y_THRESH):
    """Returns a list. Each element represents a period of time throughout the
    day (length of time determined by NUM_BINS - e.g., 12 bins => each element
    covers 24/12=2 hours). The element is True iff there was downtime greater
    than y_thresh.
    """
    prev_down_i = None
    ret = [False] * NUM_BINS
    for x, y in data_scaled:
        if y > y_thresh:
            i = min(int(x / X_MAX * NUM_BINS), NUM_BINS - 1)
            ret[i] = True
            # if the previous reading was downtime too, then mark all bins
            # between the previous reading and this reading as downtime
            if prev_down_i is not None:
                for j in xrange(prev_down_i, i):
                    ret[j] = True
            prev_down_i = i
        else:
            prev_down_i = None
    return ret
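
# For instance (added illustration), with NUM_BINS = 96 (15-minute bins) and
# y_thresh = 0.0, two consecutive downtime readings at x = 12.5 h and x = 13.3 h
# land in bins 50 and 53, and the gap-filling loop also marks bins 51 and 52:
#   bin_data([(12.5, 3.0), (13.3, 1.0)])  -> bins 50-53 are True, all others False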
def extract_data_for_specific_error_rate(html, name):
    m = re.search(r'"ae-trust-detail-datastore-%s-error_rate-link"[^(]*[(]([^)]*)[)]' % name, html, re.DOTALL)
    if not m:
        raise Exception("Missing data for %s" % name)
    chart_url = m.group(1)
    data_str = extract_data_string_from_url(chart_url)
    data_scaled = data_string_to_scaled_data(data_str)
    data_binned = bin_data(data_scaled)
    return data_binned
def extract_error_rate_data(url, error_rates=ERROR_RATES):
    """Extracts error rate data from `url` for each error rate specified in
    error_rates. The output is like that of bin_data(), except the results of
    each error_rate are OR'd together. Thus each element/bin in the returned
    list indicates whether ANY error rate was greater than Y_THRESH for the
    chunk of time it represents.
    """
    try:
        resp = urllib2.urlopen(url)
    except urllib2.HTTPError, e:
        raise Exception('HTTP failure: %s (%s)' % (e, url))
    except urllib2.URLError, e:
        raise Exception('Fetch failure: %s (%s)' % (e, url))
    html = resp.read()
    if resp.info().get('content-encoding') == 'gzip':
        html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
    datas = []
    for name in error_rates:
        datas.append(extract_data_for_specific_error_rate(html, name))
    # combine the channels - if any is True, mark the bin as True
    return [any(t) for t in zip(*datas)]
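
# The OR-combination in the return statement works like this (added example):
#   datas = [[True, False, False],     # e.g. 'put' error rate per bin
#            [False, False, True]]     # e.g. 'update' error rate per bin
#   [any(t) for t in zip(*datas)]  -> [True, False, True]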
def plot_bins(data, num_days, num_bins, yticksz=5, width=0.5):
    labels = []
    if num_bins >= 24:
        for i in xrange(24):
            labels += ['%02d:00' % i]
            labels += [''] * (num_bins / 24 - 1)
    else:
        for i in xrange(num_bins):
            labels += ['%d:00-%d:00' % (i * (24.0 / num_bins), (i + 1) * (24.0 / num_bins))]
    xlocs = numarray.array(range(len(data))) + width
    pylab.bar(xlocs, data, width=width)
    pylab.xticks(xlocs + width / 2.0, labels)
    pylab.xlabel("Time of Day")
    pylab.yticks(filter(lambda x: x % yticksz == 0, range(0, max(data) + yticksz)))
    pylab.ylabel("# of Days with Downtime")
    pylab.xlim(0, xlocs[-1] + width * 2)
    pylab.ylim(0, int((max(data) + yticksz) / yticksz) * yticksz + 1)
    pylab.title("# Days with Downtime in the Past %d Days" % num_days)
    pylab.gca().get_xaxis().tick_bottom()
    pylab.gca().get_yaxis().tick_left()
    pylab.show()
def main(start=datetime.datetime(2009, 7, 20), end=datetime.datetime.now()):
    """Time of day is divided up into NUM_BINS equi-sized chunks. Then downtime
    from each day from `start` to `end` is retrieved. Each chunk counts the
    number of days which experienced downtime greater than Y_THRESH during the
    time block allocated to the chunk.
    Data is printed to stdout and shown in a plot.
    """
    dt = start
    one_day = datetime.timedelta(days=1)
    counts = [0] * NUM_BINS
    num_days = 0
    while dt < end:
        dt_str = dt.strftime(DATE_FORMAT)
        print 'working on', dt_str
        datas_for_day = extract_error_rate_data(DATASTORE_STATUS_URL % dt_str)
        print '%s => %s' % (dt_str, datas_for_day)
        for i, b in enumerate(datas_for_day):
            if b:
                counts[i] += 1
        dt = dt + one_day
        num_days += 1
    print 'num days = %d' % num_days
    print 'chunk downtime counts =', counts
    plot_bins(counts, num_days, NUM_BINS)
if __name__ == '__main__':
    main()