readability_benchmarks.py
"""Quick and dirty benchmarking for readability functions.
"""
import re, time, os, json
from urllib import urlopen

# the two content-extraction functions under test (see run_benchmarks)
from hn import grabContent
from lxml_readability import extract

import socket
socket.setdefaulttimeout(30)  # don't hang forever on a slow link page
FILES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'files')
if not os.path.exists(FILES_DIR):
    os.mkdir(FILES_DIR)

def get_test_pages(num_hn_pages=2, max_links=99, sleep_time=3):
    """Fetch the pages linked from the first few Hacker News front
    pages and save them to FILES_DIR as numbered .html files.
    """
    url = 'http://news.ycombinator.com/'
    num_links = 0
    for i in range(num_hn_pages):
        print 'Fetching %s...' % url
        hn_html = urlopen(url).read()
        for match in re.findall(r'<a href="([^"]+?)">[^<]+?</a><span class="comhead">', hn_html):
            print 'Link page: %s' % match
            # fetch link page; skip it if it can't be retrieved
            try:
                link_html = urlopen(match).read()
            except IOError:
                continue
            open(os.path.join(FILES_DIR, '%s.html' % num_links), 'w').write(link_html)
            num_links += 1
            if num_links >= max_links:
                return
        next_page = re.findall(r'<a href="([^"]+?)" rel="nofollow">More</a>', hn_html)[0]
        url = 'http://news.ycombinator.com' + next_page
        # be nice
        time.sleep(sleep_time)

def run_benchmarks(funcs=[grabContent, extract]):
    """Time each extraction function over every saved page and dump the
    raw per-page timings (one list per function) to speeds.json.
    """
    contents = []
    all_results = []
    for filename in os.listdir(FILES_DIR):
        contents.append(open(os.path.join(FILES_DIR, filename)).read())
    for func in funcs:
        results = []
        for content in contents:
            print '.'
            t1 = time.time()
            # use a dummy URL (example.com); only the page content matters here
            excerpt = func('http://example.com/', content)
            t2 = time.time()
            results.append(t2 - t1)
        all_results.append(results)
    open('speeds.json', 'w').write(json.dumps(all_results))

if __name__ == '__main__':
    get_test_pages(3)
    run_benchmarks()
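
run_benchmarks() writes speeds.json as a list of per-page timing lists, one list per function, in the same order as funcs. A minimal sketch for eyeballing those numbers (not part of the original gist; the labels below simply mirror the default funcs order):

summarize_speeds.py

import json

# load the timings dumped by run_benchmarks() above
all_results = json.loads(open('speeds.json').read())
labels = ['grabContent', 'extract']  # assumed: mirrors the default funcs order
for label, results in zip(labels, all_results):
    print '%s: %d pages, mean %.3fs, max %.3fs' % (
        label, len(results), sum(results) / len(results), max(results))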