#!/usr/bin/python
"""
Data processing script to calculate the relationship between request time and
DOM load time.
See: http://blog.decadecity.net/2012/09/15/how-long-does-an-http-request-take/
"""
import apachelog # https://code.google.com/p/apachelog/
import re
import urlparse
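# NB: this is Python 2 code - urlparse moved to urllib.parse in Python 3.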
def get_stats(log_data, format=None, max_time=6000):
    """
    Parses an Apache log file for two query parameters ("dom" and "request").
    Returns a list of tuples of related request and DOM times.
    """
    if format is None:
        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
    p = apachelog.parser(format)
    log = []  # Stats collected from the log file.
    for line in log_data:
        parsed = p.parse(line)
        url = re.match(r'(.*) (.*) (.*)', parsed['%r'])  # Break out the request line components.
        if url is None:
            continue  # Malformed request line - skip it.
        parsed['method'] = url.group(1)
        parsed['url'] = url.group(2)
        parsed['protocol'] = url.group(3)
        parsed['urlparsed'] = urlparse.urlparse(url.group(2))
        parsed['params'] = urlparse.parse_qs(parsed['urlparsed'].query)
        if 'dom' in parsed['params'] and 'request' in parsed['params']:
            # We have the two params we need.
            log.append(parsed)
    result = []  # Output data.
    for entry in log:
        try:
            dom = int(entry['params']['dom'][0])
            request = int(entry['params']['request'][0])
        except ValueError:
            continue  # Non-numeric timer value - skip this entry.
        if max_time > request > 0 and max_time > dom > 0:
            # We have a pair of valid values so add them to the result.
            result.append((request, dom))
    return result
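# A minimal sketch of what get_stats() expects, assuming a hypothetical
# combined-format log line whose beacon URL carries the two timers in ms:
#
#   1.2.3.4 - - [29/Jul/2012:11:46:00 +0000] "GET /t.gif?dom=250&request=120 HTTP/1.1" 200 43 "-" "Mozilla/5.0"
#
# For input of that single line, get_stats() returns [(120, 250)], i.e. a
# list of (request, dom) tuples; "/t.gif" is an invented beacon path.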
if __name__ == '__main__':
    import sys
    import numpy
    import matplotlib.pyplot as plt
    data = sys.stdin.readlines()  # Take log data from stdin - quick and dirty.
    stats = get_stats(data)
    # Working out y = nx + c where y is the request time, x is the DOM time
    # and n is the fudge factor.
    x_list = []
    y_list = []
    for point in stats:
        y_list.append(point[0])
        x_list.append(point[1])
    average_request = sum(y_list) / len(y_list)  # Integer division (Python 2).
    # This is the mechanics of solving y = nx + c using numpy.
    x = numpy.array(x_list)
    y = numpy.array(y_list)
    A = numpy.vstack([x, numpy.ones(len(x))]).T
    # Linear regression using least squares: https://en.wikipedia.org/wiki/Ordinary_least_squares
    n, c = numpy.linalg.lstsq(A, y)[0]
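    # A sketch of why this works: each row of A is [x_i, 1], so the
    # least-squares solve of A . [n, c]^T ~= y minimises
    # sum((n * x_i + c - y_i) ** 2), i.e. it fits the straight line.
    # Worked example: the points (1, 3), (2, 5), (3, 7) give n = 2, c = 1.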
    # Now draw the result as a graph.
    plt.axhline(y=average_request, label='Avg request (%d)' % average_request, color='g')
    plt.plot(x, y, 'o', label='Original data points (%d)' % len(x), markersize=2.5)
    plt.plot(x, n * x + c, 'r', label='Fitted line: y = %sx + %s' % (round(n, 3), int(c)))
    plt.legend(loc='upper left')
    plt.xlabel('DOM timer (ms)')
    plt.ylabel('Request timer (ms)')
    plt.show()
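# Usage sketch, assuming combined-format Apache access logs on stdin:
#   cat access.log | python request_dom_times.py
# ("request_dom_times.py" is a hypothetical name for this file.)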