Last active
August 29, 2015 14:13
-
-
Save rhiever/b4806dfa7304df741a0e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the weekly box-office charts from boxofficemojo.com and flatten them
# into a single tab-separated file.
#
# Step 1 downloads one raw HTML page per (year, week) into PAGES_DIR.
# Step 2 parses every downloaded page and writes one row per movie per week
# to OUTPUT_TSV.
#
# NOTE: this is Python 2 code (urllib2, BeautifulSoup 3).
import os
import time
import urllib2
from contextlib import closing
from glob import glob

from BeautifulSoup import BeautifulSoup

# Directory holding one raw HTML file per chart page, named "<year>-<week>.txt".
PAGES_DIR = "pages"
# Destination of the parsed table.
OUTPUT_TSV = "top-movies-by-week.tsv"
# Weekly chart page; filled in with (year, week).
CHART_URL = "http://boxofficemojo.com/weekly/chart/?yr=%d&wk=%d&p=.htm"

# Make a directory to store all of the HTML pages.
# Done without shelling out, and silently when the directory already exists
# so the script can be rerun.
if not os.path.isdir(PAGES_DIR):
    os.makedirs(PAGES_DIR)

# Download the raw HTML of all pages
for year in range(1982, 2015):
    for week in range(1, 53):
        try:
            # closing() guarantees the HTTP response is released even if
            # read() fails part-way through.
            with closing(urllib2.urlopen(CHART_URL % (year, week))) as response:
                page_text = response.read()
            page_path = os.path.join(PAGES_DIR, "%d-%d.txt" % (year, week))
            with open(page_path, "wb") as out_file:
                out_file.write(page_text)
        except Exception as err:
            # Best effort: report the failed week and keep going.
            # `Exception` (not a bare except) lets Ctrl-C stop the crawl.
            print("Error with week %d of %d: %s" % (week, year, err))
        # Pause after every request, including failed ones, so a run of
        # errors does not turn into a burst of back-to-back requests.
        time.sleep(1)

# Parse all of the HTML into a tsv
header = ["year", "week", "rank_this_week", "rank_last_week", "title", "studio", "weekly_gross",
          "pct_change", "theater_count", "theater_count_change", "average_gross_per_theater",
          "total_gross_so_far", "budget", "weeks_running"]

# Collect (year, week, path) so pages are processed in chronological order;
# sorting the raw filenames would put "1982-10" before "1982-2".
page_files = []
for filename in glob(os.path.join(PAGES_DIR, "*.txt")):
    date = os.path.splitext(os.path.basename(filename))[0]
    try:
        year_text, week_text = date.split("-")
        page_files.append((int(year_text), int(week_text), filename))
    except ValueError:
        # Not a "<year>-<week>.txt" file written by the download step.
        print("Error: %s, unexpected file name" % filename)
page_files.sort()

with open(OUTPUT_TSV, "wb") as out_file:
    out_file.write("\t".join(header))

    for year, week, filename in page_files:
        with open(filename, "rb") as in_file:
            page_text = in_file.read()

        soup = BeautifulSoup(page_text)
        tables = soup.findAll("table")
        # The chart is read from the fourth <table> on the page; a page
        # without it (e.g. an error page) is reported and skipped instead of
        # aborting the whole parse.
        if len(tables) < 4:
            print("Error: %s, no chart table found" % filename)
            continue
        movie_table = tables[3]
        # Drop the first three and last two rows of the table
        # (presumably header and summary rows -- verify against a saved page).
        movie_entries = movie_table.findAll("tr")[3:-2]

        for movie_entry in movie_entries:
            movie_lines = movie_entry.findAll("td")
            # A lone "-" cell is stored as "0"; everything else is written
            # as UTF-8 bytes.
            entries = ["0" if x.text == "-" else x.text.encode("utf-8", "replace") for x in movie_lines]
            entries = [str(year), str(week)] + entries

            if len(entries) != len(header):
                # Malformed rows are reported but still written, as before.
                first_cell = entries[2] if len(entries) > 2 else ""
                print("Error: %s, %s" % (filename, first_cell))

            out_file.write("\n" + "\t".join(entries))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment