dbrgn · December 20, 2011 14:20 · dbrgn · Dec 20, 2011
diff --git a/fetch.py b/fetch.py
 # WARNING: Ugly hack.

 import re
 import requests
 from BeautifulSoup import BeautifulSoup
 import imdb


 # Step 1: download the Wikipedia article that contains the film list.
 print('Getting data from Wikipedia...')

 wiki_url = 'http://en.wikipedia.org/wiki/List_of_films_that_most_frequently_use_the_word_%22fuck%22'
 r = requests.get(wiki_url)


 # Step 2: parse the downloaded markup, pick the table whose class attribute
 # is "wikitable sortable" and collect all of its <tr> elements.
 print('Parsing HTML data...')

 soup = BeautifulSoup(r.content)
 table_attrs = {'class': 'wikitable sortable'}
 table = soup.find('table', table_attrs)
 rows = table.findAll('tr')

 # Parse HTML table
 #
 # Turn every data row of the table into a dict and collect the dicts in
 # `movies`. `keys` maps the position of a <td> cell within its row to the
 # dict key under which the cell text is stored.
 keys = {
    0: 'film',
    1: 'year',
    2: 'fuckcount',
    3: 'minutes',
    4: 'uses_per_minute',
    5: 'source',
    6: 'ref'
 }
 # Keeps only the leading run of digits of the count cell; the rest of that
 # line is dropped. Text that does not start with a digit is left unchanged.
 count_re = re.compile(r'^([0-9]+).*')
 movies = []
 # rows[0] is skipped (presumably the header row -- it is not parsed here).
 for row in rows[1:]:
    cells = row.findAll('td')
    if not cells:
        # A row without <td> cells would yield an empty dict, and the later
        # processing steps index movie['film'] unconditionally.
        continue
    rowmap = {}
    for i, field in enumerate(cells):
        key = keys.get(i)
        if key is None:
            # More cells than named columns: ignore the extras instead of
            # failing with a KeyError.
            continue
        text = field.text
        if i == 2:
            text = count_re.sub(r'\1', text)
        rowmap[key] = text
    movies.append(rowmap)

 # Fetch imdb data
 #
 # For every parsed film: search IMDb by title, take the first search result
 # whose 'year' equals the year parsed from the table, load its details and
 # store the rating in movie['imdb_rating']. The rating is None whenever no
 # result matches or the matched entry has no rating.
 ia = imdb.IMDb()
 for movie in movies:
    search_result = ia.search_movie(movie['film'])
    print('Fetching IMDB rating for movie %s...' % movie['film'])
    try:
        # Convert the year once per film rather than once per search result.
        # A missing or non-numeric year cell (KeyError / ValueError) is
        # treated as "no match" below instead of aborting the whole run.
        year = int(movie['year'])
        # .get() lets results without a 'year' entry simply fail the
        # comparison; indexing them would raise KeyError and discard later
        # results that do match.
        match = [x for x in search_result if x.get('year') == year][0]
        ia.update(match)
        print('Found match "%s (%s)" with rating %s.' % (match['title'], match['year'], match['rating']))
        movie['imdb_rating'] = match['rating']
    except (IndexError, KeyError, ValueError):
        # IndexError: no search result for that year.
        # KeyError: missing 'year' cell, or no 'title'/'rating' on the match.
        movie['imdb_rating'] = None
        print('Found no match.')


 # Export the collected data as a minimal HTML page written to data.html,
 # then print the (not yet implemented) JSON export notice.
 import io

 print('Exporting to HTML...')

 # Unicode templates keep the whole document a text string even when there
 # are no rows; io.open() in text mode only accepts text, not byte strings.
 row_tpl = u'<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n'
 rows = [
    row_tpl % (movie['film'], movie['fuckcount'], movie['uses_per_minute'], movie['imdb_rating'])
    for movie in movies
 ]
 table = u'<table><tr><th>Movie</th><th>"Fuck" count</th><th>Uses/Minute</th><th>IMDB Rating</th></tr>%s</table>' % u''.join(rows)
 # The charset declaration matches the encoding the file is written in.
 html = u'<html><head><meta charset="utf-8"><title>FCPM IMDB Ratings</title></head><body>%s</body></html>' % table

 # Encode explicitly as UTF-8: a plain open()/write() of the unicode document
 # fails on non-ASCII titles under Python 2. The with-block closes the file
 # even if the write raises.
 with io.open('data.html', 'w', encoding='utf-8') as outfile:
    outfile.write(html)

 print('...done.')


 print('Exporting to JSON...')

 print('...not yet implemented.')


 print('Done.')
	# WARNING: Ugly hack.

	import re
	import requests
	from BeautifulSoup import BeautifulSoup
	import imdb


	# Step 1: download the Wikipedia article that contains the film list.
	print('Getting data from Wikipedia...')

	wiki_url = 'http://en.wikipedia.org/wiki/List_of_films_that_most_frequently_use_the_word_%22fuck%22'
	r = requests.get(wiki_url)


	# Step 2: parse the downloaded markup, pick the table whose class attribute
	# is "wikitable sortable" and collect all of its <tr> elements.
	print('Parsing HTML data...')

	soup = BeautifulSoup(r.content)
	table_attrs = {'class': 'wikitable sortable'}
	table = soup.find('table', table_attrs)
	rows = table.findAll('tr')

	# Parse HTML table
	#
	# Turn every data row of the table into a dict and collect the dicts in
	# `movies`. `keys` maps the position of a <td> cell within its row to the
	# dict key under which the cell text is stored.
	keys = {
	    0: 'film',
	    1: 'year',
	    2: 'fuckcount',
	    3: 'minutes',
	    4: 'uses_per_minute',
	    5: 'source',
	    6: 'ref'
	}
	# Keeps only the leading run of digits of the count cell; the rest of that
	# line is dropped. Text that does not start with a digit is left unchanged.
	count_re = re.compile(r'^([0-9]+).*')
	movies = []
	# rows[0] is skipped (presumably the header row -- it is not parsed here).
	for row in rows[1:]:
	    cells = row.findAll('td')
	    if not cells:
	        # A row without <td> cells would yield an empty dict, and the later
	        # processing steps index movie['film'] unconditionally.
	        continue
	    rowmap = {}
	    for i, field in enumerate(cells):
	        key = keys.get(i)
	        if key is None:
	            # More cells than named columns: ignore the extras instead of
	            # failing with a KeyError.
	            continue
	        text = field.text
	        if i == 2:
	            text = count_re.sub(r'\1', text)
	        rowmap[key] = text
	    movies.append(rowmap)

	# Fetch imdb data
	#
	# For every parsed film: search IMDb by title, take the first search result
	# whose 'year' equals the year parsed from the table, load its details and
	# store the rating in movie['imdb_rating']. The rating is None whenever no
	# result matches or the matched entry has no rating.
	ia = imdb.IMDb()
	for movie in movies:
	    search_result = ia.search_movie(movie['film'])
	    print('Fetching IMDB rating for movie %s...' % movie['film'])
	    try:
	        # Convert the year once per film rather than once per search result.
	        # A missing or non-numeric year cell (KeyError / ValueError) is
	        # treated as "no match" below instead of aborting the whole run.
	        year = int(movie['year'])
	        # .get() lets results without a 'year' entry simply fail the
	        # comparison; indexing them would raise KeyError and discard later
	        # results that do match.
	        match = [x for x in search_result if x.get('year') == year][0]
	        ia.update(match)
	        print('Found match "%s (%s)" with rating %s.' % (match['title'], match['year'], match['rating']))
	        movie['imdb_rating'] = match['rating']
	    except (IndexError, KeyError, ValueError):
	        # IndexError: no search result for that year.
	        # KeyError: missing 'year' cell, or no 'title'/'rating' on the match.
	        movie['imdb_rating'] = None
	        print('Found no match.')


	# Export the collected data as a minimal HTML page written to data.html,
	# then print the (not yet implemented) JSON export notice.
	import io

	print('Exporting to HTML...')

	# Unicode templates keep the whole document a text string even when there
	# are no rows; io.open() in text mode only accepts text, not byte strings.
	row_tpl = u'<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n'
	rows = [
	    row_tpl % (movie['film'], movie['fuckcount'], movie['uses_per_minute'], movie['imdb_rating'])
	    for movie in movies
	]
	table = u'<table><tr><th>Movie</th><th>"Fuck" count</th><th>Uses/Minute</th><th>IMDB Rating</th></tr>%s</table>' % u''.join(rows)
	# The charset declaration matches the encoding the file is written in.
	html = u'<html><head><meta charset="utf-8"><title>FCPM IMDB Ratings</title></head><body>%s</body></html>' % table

	# Encode explicitly as UTF-8: a plain open()/write() of the unicode document
	# fails on non-ASCII titles under Python 2. The with-block closes the file
	# even if the write raises.
	with io.open('data.html', 'w', encoding='utf-8') as outfile:
	    outfile.write(html)

	print('...done.')


	print('Exporting to JSON...')

	print('...not yet implemented.')


	print('Done.')