Skip to content

Instantly share code, notes, and snippets.

@gcr
Last active August 29, 2015 14:08
Show Gist options
  • Save gcr/061fbf19b9f7e15a633c to your computer and use it in GitHub Desktop.
Save gcr/061fbf19b9f7e15a633c to your computer and use it in GitHub Desktop.
import gzip
import re
def load_all_movies(filename, min_year=1930, max_year=2014, encoding="latin-1"):
    """
    Load and parse 'plot.list.gz'. Yields each consecutive movie as a dictionary:

        {"title": "The movie's title",
         "year": The decade of the movie, like 1950 or 1980,
         "identifier": Full key of IMDB's text string (everything after "MV: "),
         "episode": Whatever text follows the last ")" on the title line,
         "summary": "The movie's plot summary"
        }

    You can download `plot.list.gz` from http://www.imdb.com/interfaces

    Parameters:
        filename: path to the gzip-compressed plot list.
        min_year, max_year: inclusive range of accepted release years. Title
            lines whose year falls outside it (or that cannot be parsed at
            all) are counted as skipped, and their plot lines are dropped.
        encoding: codec used to decode each line. The default, latin-1, can
            decode any byte sequence, so decoding itself never fails.

    Yields:
        One dictionary per accepted movie, in file order. "summary" holds the
        movie's "PL: " lines with that prefix and the line terminator removed,
        joined with single newlines.

    Side effects:
        Prints "Skipped <n>" once the whole file has been consumed.
    """
    assert "plot.list.gz" in filename  # Or whatever you called it

    # NOTE(review): ".*\)" is greedy, so the "episode" group only receives the
    # text after the *last* ")" on the line. Kept as-is to preserve output.
    movie_regexp = re.compile(r"MV: ((.*?) \(([0-9]+).*\)(.*))")
    plot_prefix = "PL: "

    def finish(movie):
        # Collapse the collected plot lines into a single string.
        movie["summary"] = "\n".join(movie["summary"])
        return movie

    current_movie = None
    skipped = 0

    # Read bytes and decode explicitly so that the string comparisons below
    # behave the same on Python 2 and Python 3; the context manager makes
    # sure the file is closed even if the generator is abandoned early.
    with gzip.open(filename, "rb") as handle:
        for raw_line in handle:
            line = raw_line.decode(encoding).rstrip("\r\n")

            if line.startswith("MV"):
                # A new title line ends the previous record.
                if current_movie is not None:
                    yield finish(current_movie)
                    current_movie = None

                match = movie_regexp.match(line)
                if match is None:
                    skipped += 1
                    continue

                identifier, title, year_text, episode = match.groups()
                try:
                    year = int(year_text)
                except ValueError:
                    # Only reachable for absurdly long digit runs; treat the
                    # line like any other unparseable title.
                    skipped += 1
                    continue
                if year < min_year or year > max_year:
                    # Something went wrong here (probably not a real year).
                    skipped += 1
                    continue

                current_movie = {
                    "title": title,
                    "year": year // 10 * 10,  # round down to the decade
                    "identifier": identifier,
                    "episode": episode,
                    "summary": [],
                }
            elif line.startswith(plot_prefix) and current_movie is not None:
                # Strip only the leading marker; a later "PL: " inside the
                # plot text itself must be kept.
                current_movie["summary"].append(line[len(plot_prefix):])

    # The final record has no following "MV" line to trigger its emission.
    if current_movie is not None:
        yield finish(current_movie)

    print("Skipped %d" % skipped)
# Example session: drain the generator into a list so that every movie is
# held in memory at once (the path is specific to the author's machine).
all_movies = []
all_movies.extend(load_all_movies("/home/michael/tmp/plot.list.gz"))
len(all_movies)
# => 379451  (count reported by the original author for their copy of the file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment