Data processing script to calculate the relationship between request time and DOM load time.
#!/usr/bin/python
"""
Data processing script to calculate the relationship between request time and
DOM load time.

See: http://blog.decadecity.net/2012/09/15/how-long-does-an-http-request-take/
"""
import apachelog  # https://code.google.com/p/apachelog/
import re
import urlparse

def get_stats(log_data, format=None, max_time=6000):
    """
    Parses an Apache log file for two query string parameters ("dom" and "request").

    Returns a list of (request, dom) tuples of related request and DOM times.
    """
    if format is None:
        # Default to the standard combined log format.
        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
    p = apachelog.parser(format)
    log = []  # Stats collected from the log file.
    for line in log_data:
        parsed = p.parse(line)
        url = re.match(r'(.*) (.*) (.*)', parsed['%r'])  # Break out the request line components.
        parsed['method'] = url.group(1)
        parsed['url'] = url.group(2)
        parsed['protocol'] = url.group(3)
        parsed['urlparsed'] = urlparse.urlparse(url.group(2))
        parsed['params'] = urlparse.parse_qs(parsed['urlparsed'].query)
        if 'dom' in parsed['params'] and 'request' in parsed['params']:
            # We have the two params we need.
            log.append(parsed)
    result = []  # Output data.
    for entry in log:
        try:
            dom = int(entry['params']['dom'][0])
            request = int(entry['params']['request'][0])
            if max_time > request > 0 and max_time > dom > 0:
                # We have a pair of valid values so add them to the result.
                result.append((request, dom))
        except ValueError:
            # Skip entries whose timer values aren't integers.
            continue
    return result

if __name__ == '__main__':
    import sys
    import numpy
    import matplotlib.pyplot as plt

    data = sys.stdin.readlines()  # Take log data from stdin - quick and dirty.
    stats = get_stats(data)

    # Working out y = nx + c where y is the request time, x is the DOM time
    # and n is the fudge factor.
    x_list = []
    y_list = []
    for point in stats:
        y_list.append(point[0])
        x_list.append(point[1])
    average_request = int(sum(y_list)) / len(y_list)

    # This is the mechanics of solving y = nx + c using numpy.
    x = numpy.array(x_list)
    y = numpy.array(y_list)
    A = numpy.vstack([x, numpy.ones(len(x))]).T
    # Linear regression using least squares:
    # https://en.wikipedia.org/wiki/Ordinary_least_squares
    n, c = numpy.linalg.lstsq(A, y)[0]

    # Now draw the result as a graph.
    plt.axhline(y=average_request, label='Avg request (%d)' % (average_request), color='g')
    plt.plot(x, y, 'o', label='Original data points (%d)' % (len(x)), markersize=2.5)
    plt.plot(x, n * x + c, 'r', label='Fitted line: y = %sx + %s' % (round(n, 3), int(c)))
    plt.legend(loc='upper left')
    plt.xlabel('DOM timer (ms)')
    plt.ylabel('Request timer (ms)')
    plt.show()
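
A minimal sketch of exercising get_stats() directly, assuming the script is saved as request_vs_dom.py so it can be imported; the module name and the combined-format log line below are fabricated for illustration only.

# Quick check of get_stats() against a fabricated combined-format log line.
# The module name (request_vs_dom) is an assumption for illustration.
from request_vs_dom import get_stats

sample = [
    '127.0.0.1 - - [29/Jul/2012:11:46:00 +0100] '
    '"GET /t.gif?dom=1200&request=350 HTTP/1.1" 200 43 "-" "Mozilla/5.0"'
]
print get_stats(sample)  # [(350, 1200)] - a (request, dom) pair in milliseconds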