Skip to content

Instantly share code, notes, and snippets.

@dbrgn
Created December 20, 2011 14:20
Show Gist options
  • Save dbrgn/1501715 to your computer and use it in GitHub Desktop.
Save dbrgn/1501715 to your computer and use it in GitHub Desktop.
Get list of films that most frequently use the word "fuck" and fetch the corresponding IMDB ratings.
# WARNING: Ugly hack.
import io
import re
from xml.sax.saxutils import escape

import imdb
import requests
from BeautifulSoup import BeautifulSoup
# Scrape Wikipedia's table of films that use the word "fuck" most often,
# look up each film's IMDb rating and export the result as data.html.

WIKIPEDIA_URL = 'http://en.wikipedia.org/wiki/List_of_films_that_most_frequently_use_the_word_%22fuck%22'

# Column order of the Wikipedia table: cell i of a row is stored under FIELDS[i].
FIELDS = ('film', 'year', 'fuckcount', 'minutes', 'uses_per_minute', 'source', 'ref')

# Columns written to the HTML report, in output order.
EXPORT_FIELDS = ('film', 'fuckcount', 'uses_per_minute', 'imdb_rating')

# Unicode literals so the rendered page is always text, even for an empty
# movie list; io.open() in text mode rejects byte strings on Python 2.
ROW_TPL = u'<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n'
TABLE_TPL = (u'<table><tr><th>Movie</th><th>"Fuck" count</th>'
             u'<th>Uses/Minute</th><th>IMDB Rating</th></tr>%s</table>')
PAGE_TPL = (u'<html><head><meta charset="utf-8">'
            u'<title>FCPM IMDB Ratings</title></head><body>%s</body></html>')


def leading_count(text):
    """Drop whatever follows a leading number, e.g. '428[3]' -> '428'.

    Text that does not start with a digit is returned unchanged.
    """
    return re.sub(r'^([0-9]+).*', r'\1', text)


def extract_rows(content):
    """Return all <tr> elements of the 'wikitable sortable' table in *content*.

    Raises RuntimeError if the page contains no such table.
    """
    soup = BeautifulSoup(content)
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError('No "wikitable sortable" table found in the page.')
    return table.findAll('tr')


def parse_rows(rows):
    """Convert table rows into a list of dicts keyed by FIELDS.

    The first row is skipped (presumably the header row), as are rows
    without any <td> cell. Cells beyond the known columns are ignored.
    """
    movies = []
    for row in rows[1:]:
        cells = row.findAll('td')
        if not cells:
            # No data cells: nothing to record, and a dict without 'film'
            # would break the IMDb lookup later on.
            continue
        movie = {}
        for key, cell in zip(FIELDS, cells):
            if key == 'fuckcount':
                movie[key] = leading_count(cell.text)
            else:
                movie[key] = cell.text
        movies.append(movie)
    return movies


def find_match(ia, movie):
    """Return the first IMDb search hit from the movie's year, or None.

    *ia* is the IMDb access object, *movie* a dict produced by parse_rows().
    """
    try:
        year = int(movie['year'])
    except (KeyError, ValueError):
        # Year cell missing or not purely numeric: nothing to compare with.
        return None
    for candidate in ia.search_movie(movie['film']):
        try:
            if candidate['year'] == year:
                return candidate
        except KeyError:
            # A hit without a year must not abort the whole search.
            continue
    return None


def add_imdb_ratings(movies):
    """Store each movie's IMDb rating (or None) under 'imdb_rating', in place."""
    ia = imdb.IMDb()
    for movie in movies:
        print('Fetching IMDB rating for movie %s...' % movie['film'])
        movie['imdb_rating'] = None
        message = 'Found no match.'
        match = find_match(ia, movie)
        if match is not None:
            # The rating is read only after update(), as in the original flow.
            ia.update(match)
            try:
                rating = match['rating']
                message = 'Found match "%s (%s)" with rating %s.' % (
                    match['title'], match['year'], rating)
                movie['imdb_rating'] = rating
            except KeyError:
                # Matched entry lacks a rating (or title): treat as no match.
                pass
        print(message)


def render_html(movies):
    """Return the complete HTML report for *movies* as a text string.

    Cell values are HTML-escaped; a missing key renders as an empty cell.
    """
    body_rows = []
    for movie in movies:
        cells = tuple(escape(u'%s' % movie.get(key, u'')) for key in EXPORT_FIELDS)
        body_rows.append(ROW_TPL % cells)
    return PAGE_TPL % (TABLE_TPL % u''.join(body_rows))


def write_html(html, path='data.html'):
    """Write *html* to *path* as UTF-8, closing the file even on error."""
    with io.open(path, 'w', encoding='utf-8') as outfile:
        outfile.write(html)


def main():
    """Run the scrape -> IMDb lookup -> HTML export pipeline."""
    print('Getting data from Wikipedia...')
    r = requests.get(WIKIPEDIA_URL)
    print('Parsing HTML data...')
    movies = parse_rows(extract_rows(r.content))
    add_imdb_ratings(movies)
    print('Exporting to HTML...')
    write_html(render_html(movies))
    print('...done.')
    print('Exporting to JSON...')
    print('...not yet implemented.')
    print('Done.')


if __name__ == '__main__':
    main()
@dbrgn
Copy link
Author

dbrgn commented Dec 20, 2011

Note: Some of the years on the Wikipedia page seem to be wrong, so the script finds no IMDb entry with a matching year for those films and leaves their ratings empty.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment