Data processing script to calculate the relationship between request time and DOM load time.
#!/usr/bin/python
"""
Data processing script to calculate the relationship between request time and
DOM load time.

See: http://blog.decadecity.net/2012/09/15/how-long-does-an-http-request-take/
"""
import apachelog  # https://code.google.com/p/apachelog/
import re
import urlparse

def get_stats(log_data, format=None, max_time=6000):
    """
    Parses an Apache log file for two query string parameters ("dom" and "request").

    Returns a list of (request, dom) tuples of related request and DOM times.
    """
    if format is None:
        # Default to the standard combined log format.
        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
    p = apachelog.parser(format)
    log = []  # Stats collected from the log file.
    for line in log_data:
        parsed = p.parse(line)
        url = re.match(r'(.*) (.*) (.*)', parsed['%r'])  # Break out the request line components.
        parsed['method'] = url.group(1)
        parsed['url'] = url.group(2)
        parsed['protocol'] = url.group(3)
        parsed['urlparsed'] = urlparse.urlparse(url.group(2))
        parsed['params'] = urlparse.parse_qs(parsed['urlparsed'].query)
        if 'dom' in parsed['params'] and 'request' in parsed['params']:
            # We have the two params we need.
            log.append(parsed)
    result = []  # Output data.
    for entry in log:
        try:
            dom = int(entry['params']['dom'][0])
            request = int(entry['params']['request'][0])
            if max_time > request > 0 and max_time > dom > 0:
                # We have a pair of valid values so add them to the result.
                result.append((request, dom))
        except ValueError:
            # Skip entries whose timer values aren't integers.
            continue
    return result

if __name__ == '__main__':
    import sys
    import numpy
    import matplotlib.pyplot as plt

    data = sys.stdin.readlines()  # Take log data from stdin - quick and dirty.
    stats = get_stats(data)

    # Working out y = nx + c where y is the request time, x is the DOM time
    # and n is the fudge factor.
    x_list = []
    y_list = []
    for point in stats:
        y_list.append(point[0])
        x_list.append(point[1])
    average_request = int(sum(y_list)) / len(y_list)

    # This is the mechanics of solving y = nx + c using numpy.
    x = numpy.array(x_list)
    y = numpy.array(y_list)
    A = numpy.vstack([x, numpy.ones(len(x))]).T
    # Linear regression using least squares:
    # https://en.wikipedia.org/wiki/Ordinary_least_squares
    n, c = numpy.linalg.lstsq(A, y)[0]

    # Now draw the result as a graph.
    plt.axhline(y=average_request, label='Avg request (%d)' % (average_request), color='g')
    plt.plot(x, y, 'o', label='Original data points (%d)' % (len(x)), markersize=2.5)
    plt.plot(x, n * x + c, 'r', label='Fitted line: y = %sx + %s' % (round(n, 3), int(c)))
    plt.legend(loc='upper left')
    plt.xlabel('DOM timer (ms)')
    plt.ylabel('Request timer (ms)')
    plt.show()
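
A minimal sketch of exercising get_stats() directly, assuming the script is saved as request_vs_dom.py so it can be imported; the module name and the combined-format log line below are fabricated for illustration only.

# Quick check of get_stats() against a fabricated combined-format log line.
# The module name (request_vs_dom) is an assumption for illustration.
from request_vs_dom import get_stats

sample = [
    '127.0.0.1 - - [29/Jul/2012:11:46:00 +0100] '
    '"GET /t.gif?dom=1200&request=350 HTTP/1.1" 200 43 "-" "Mozilla/5.0"'
]
print get_stats(sample)  # [(350, 1200)] - a (request, dom) pair in milliseconds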