Last active
August 29, 2015 14:08
-
-
Save gcr/061fbf19b9f7e15a633c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip
import re
def load_all_movies(filename, min_year=1930, max_year=2014):
    """
    Load and parse 'plot.list.gz'. Yields each consecutive movie as a dictionary:

        {"title": "The movie's title",
         "year": The decade of the movie, like 1950 or 1980,
         "identifier": Full key of IMDB's text string,
         "episode": Text following the "(year...)" group on the MV line,
         "summary": "The movie's plot summary"
        }

    Only "MV" lines (start of an entry) and "PL: " lines (plot text) are
    used; every other line is ignored. The plot lines of an entry are
    joined with a single newline, without their line terminators.

    Entries whose "MV" line does not match the expected pattern, or whose
    year lies outside ``min_year``..``max_year`` (inclusive), are dropped
    together with their plot lines. The number of dropped entries is
    printed as "Skipped N" once the whole file has been consumed.

    filename -- path to the gzip-compressed list; must contain
                "plot.list.gz".
    min_year -- smallest accepted release year (default 1930).
    max_year -- largest accepted release year (default 2014).

    You can download `plot.list.gz` from http://www.imdb.com/interfaces
    """
    assert "plot.list.gz" in filename  # Or whatever you called it

    # NOTE(review): the greedy ".*\)" runs to the LAST ")" on the line, so
    # for entries with several parenthesised groups "episode" only holds
    # what follows the final ")". Kept as-is to preserve the parsed fields.
    movie_regexp = re.compile(r"MV: ((.*?) \(([0-9]+).*\)(.*))")
    marker = "PL: "
    current_movie = None
    skipped = 0

    def finish(movie):
        # Collapse the collected plot lines into the final summary string.
        movie['summary'] = "\n".join(movie['summary'])
        return movie

    with gzip.open(filename) as handle:
        for line in handle:
            if not isinstance(line, str):
                # Python 3 yields bytes here. latin-1 maps every byte to a
                # character, so decoding cannot fail (the IMDB lists are
                # presumably ISO-8859-1 -- TODO confirm).
                line = line.decode("latin-1")
            line = line.rstrip("\r\n")

            if line.startswith("MV"):
                # A new entry begins: emit the one collected so far.
                if current_movie:
                    yield finish(current_movie)
                current_movie = None

                match = movie_regexp.match(line)
                if match is None:
                    skipped += 1
                    continue
                identifier, title, year, episode = match.groups()
                year = int(year)
                if year < min_year or year > max_year:
                    # Something went wrong here
                    skipped += 1
                    continue
                current_movie = {"title": title,
                                 "year": (year // 10) * 10,
                                 'identifier': identifier,
                                 'episode': episode,
                                 "summary": []}
            elif current_movie and line.startswith(marker):
                # Drop only the leading marker; the plot text itself may
                # legitimately contain "PL: ".
                current_movie['summary'].append(line[len(marker):])

    # The last entry has no following "MV" line to trigger its emission.
    if current_movie:
        yield finish(current_movie)

    print("Skipped %d" % skipped)
# Parse the whole plot list eagerly so every movie dict is held in memory.
all_movies = list(
    load_all_movies("/home/michael/tmp/plot.list.gz")
)
# Interactive-style check of how many movies were loaded; the value below is
# the output recorded alongside the original snippet.
len(all_movies)
# => 379451
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment