Created
March 1, 2011 22:21
-
-
Save jbeluch/850010 to your computer and use it in GitHub Desktop.
Scrapes movies listed on classiccinemaonline.com and creates a histogram of movies per year.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
'''
Jonathan Beluch - [email protected]
Scrapes movie titles from http://www.classiccinemaonline.com and
creates a histogram showing the distribution of titles by year.
Requires BeautifulSoup, matplotlib and numpy.
'''
import io
import re
import urllib2
from itertools import chain
from urllib import urlencode
from urlparse import urljoin, parse_qs

from BeautifulSoup import BeautifulSoup as BS
# Root of the site being scraped; hrefs found on the homepage are joined
# against it.
BASE_URL = 'http://www.classiccinemaonline.com'
# Page that is fetched to discover the genre links.
HOMEPAGE_URL = BASE_URL + '/1/index.php'
# Relative output paths for the saved histogram and the saved title list.
CHART_FN = 'histogram.png'
TITLES_FN = 'movie_titles'
def get_page(url, data=None):
    '''Fetch `url` and return the raw response body.

    url  -- address to request.
    data -- optional url-encoded request body, handed straight to
            urllib2.urlopen (which sends a POST when data is supplied).

    Any error raised by urllib2.urlopen or by reading the response
    propagates to the caller.
    '''
    response = urllib2.urlopen(url, data)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails part-way through.
        response.close()
def get_genre_urls(url):
    '''Return absolute URLs for the links listed in the 'rightcol' div.

    The page at `url` is downloaded, two known pieces of malformed markup
    are patched, and every <a> inside the div's <ul> is resolved against
    BASE_URL.
    '''
    markup = get_page(url)
    # Repair the broken html up front so BeautifulSoup doesn't choke on it
    repairs = (
        ('</font color>', '</font>'),
        (r'<ol class=\"latestnews \">', '<ol class="latestnews">'),
    )
    for broken, fixed in repairs:
        markup = markup.replace(broken, fixed)
    soup = BS(markup)
    # The <a> class names are inconsistent, so locate the 'rightcol' div
    # and collect the links beneath it instead
    sidebar = soup.find('div', {'id': 'rightcol'})
    genre_urls = []
    for link in sidebar.ul.findAll('a'):
        genre_urls.append(urljoin(BASE_URL, link['href']))
    return genre_urls
def parse_movies(url):
    '''Return the stripped link text of every listing row on the page at `url`.

    `url` must contain a query string with an 'id' parameter; that id is
    posted back to the same url together with limit=0.
    '''
    _, query_string = url.split('?', 1)
    listing_id = parse_qs(query_string)['id'][0]
    # NOTE(review): limit=0 presumably asks the site for the full,
    # unpaginated listing -- confirm against the site.
    post_body = urlencode({'id': listing_id, 'limit': '0'})
    soup = BS(get_page(url, post_body))
    page_title = soup.title.string.strip()
    print("Parsing movies from '%s'" % page_title)
    # Listing rows are the <tr> tags whose class matches 'sectiontableentry'
    row_class = re.compile('sectiontableentry')
    movie_titles = []
    for row in soup.findAll('tr', {'class': row_class}):
        movie_titles.append(row.a.string.strip())
    return movie_titles
# Matches exactly four digits wrapped in parentheses, e.g. "(1939)"
p = re.compile(r'\((\d{4})\)')


def get_year(title):
    '''Return the first parenthesised four-digit year in `title` as an int.

    Prints a warning and returns None when `title` contains no such year.
    '''
    found = p.search(title)
    if found is None:
        print("Warning: No year match for '%s'." % title)
        return None
    return int(found.group(1))
def print_to_file(items, fn):
    '''Write each item to the file `fn`, one per line, encoded as UTF-8.

    items -- iterable of values to write. Unicode text is written as-is,
             byte strings are assumed to already hold UTF-8, and anything
             else is converted to text with '%s' formatting.
    fn    -- path of the file to create or overwrite.
    '''
    def as_text(item):
        # Calling str() on unicode text with non-ASCII characters raises
        # UnicodeEncodeError on Python 2, so build text explicitly instead.
        if isinstance(item, bytes):
            return item.decode('utf-8')
        return u'%s' % (item,)

    with io.open(fn, 'w', encoding='utf-8') as f:
        f.writelines(as_text(item) + u'\n' for item in items)
def get_from_file(fn):
    '''Return the lines of the file `fn` with surrounding whitespace removed.'''
    with open(fn) as f:
        return [line.strip() for line in f]
def create_plot(years):
    '''Plot a histogram of `years`, one bar per year, and save it to CHART_FN.

    years -- non-empty iterable of integer years.

    Raises ValueError when `years` is empty.
    '''
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MultipleLocator

    years = list(years)
    if not years:
        raise ValueError('create_plot() needs at least one year to plot')

    # Set up the figure and main plot
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Give every year its own bin by passing explicit edges
    # min, min + 1, ..., max + 1. A bin *count* of (max - min) would leave
    # the last bin closed on both sides, lumping the final two years
    # together, and would be zero when all years are the same.
    bin_edges = np.arange(min(years), max(years) + 2)
    # align='left' centres each bar on its left edge, i.e. on the year
    ax.hist(years, bins=bin_edges, align='left')

    # Change major axes tickers to multiples of 5 instead of default 10
    ax.xaxis.set_major_locator(MultipleLocator(5))
    ax.yaxis.set_major_locator(MultipleLocator(5))
    ax.grid(True)

    # Set up axes labels and plot title
    ax.set_ylabel('Number of Movies + TV Shows')
    ax.set_xlabel('Year')
    ax.set_title('Number of Movies and TV Shows per Year')

    # Enlarge the output figure so things aren't as cramped
    # (the default is (8, 6))
    fig.set_size_inches(12, 9)

    # Save output
    print("Saving histogram to '%s' ..." % CHART_FN)
    fig.savefig(CHART_FN)
if __name__ == '__main__':
    print('Downloading genres...')
    genre_urls = get_genre_urls(HOMEPAGE_URL)

    print('Downloading movie titles...')
    # Collect the titles of every genre page into one flat list
    titles = []
    for genre_url in genre_urls:
        titles.extend(parse_movies(genre_url))

    # Keep a copy of the titles on disk so a later run can skip the
    # download
    print("Saving movie titles to '%s' ..." % TITLES_FN)
    print_to_file(titles, TITLES_FN)

    # To rerun without downloading everything, comment out the lines above
    # and load the saved titles instead:
    #titles = get_from_file(TITLES_FN)

    # Pull the year out of each title, dropping titles without one
    print('Parsing years...')
    years = [year for year in (get_year(title) for title in titles) if year]

    # Build and save the graph
    create_plot(years)
    print('Done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment