gcr · August 29, 2015 14:08
diff --git a/parse_movies_example.py b/parse_movies_example.py
 import gzip
 import re
 def load_all_movies(filename):
    """
    Load and parse 'plot.list.gz'. Yields each consecutive movie as a dictionary:
        {"title": "The movie's title",
         "year": The decade of the movie, like 1950 or 1980,
         "identifier": Full key of IMDB's text string,
         "summary": "The movie's plot summary"
        }
    You can download `plot.list.gz` from http://www.imdb.com/interfaces
    """
    assert "plot.list.gz" in filename # Or whatever you called it
    current_movie = None
    movie_regexp = re.compile("MV: ((.*?) \(([0-9]+).*\)(.*))")
    skipped = 0
    for line in gzip.open(filename):
        if line.startswith("MV"):
            if current_movie:
                # Fix up description and send it on
                current_movie['summary'] = "\n".join(current_movie['summary'] )
                yield current_movie
            current_movie = None
            try:
                identifier, title, year, episode = movie_regexp.match(line).groups()
                if int(year) < 1930 or int(year) > 2014:
                    # Something went wrong here
                    raise ValueError(identifier)
                current_movie = {"title": title,
                                 "year": 10*int(int(year)/10),
                                 'identifier': identifier,
                                 'episode': episode,
                                 "summary": []}
            except:
                skipped += 1
        if line.startswith("PL: ") and current_movie:
            # Add to the current movie's description
            current_movie['summary'].append(line.replace("PL: ",""))
    print "Skipped",skipped


 all_movies = list(load_all_movies("/home/michael/tmp/plot.list.gz"))

 len(all_movies)
 # => 379451
	import gzip
	import re
	def load_all_movies(filename):
	"""
	Load and parse 'plot.list.gz'. Yields each consecutive movie as a dictionary:
	{"title": "The movie's title",
	"year": The decade of the movie, like 1950 or 1980,
	"identifier": Full key of IMDB's text string,
	"summary": "The movie's plot summary"
	}
	You can download `plot.list.gz` from http://www.imdb.com/interfaces
	"""
	assert "plot.list.gz" in filename # Or whatever you called it
	current_movie = None
	movie_regexp = re.compile("MV: ((.?) \(([0-9]+).\)(.*))")
	skipped = 0
	for line in gzip.open(filename):
	if line.startswith("MV"):
	if current_movie:
	# Fix up description and send it on
	current_movie['summary'] = "\n".join(current_movie['summary'] )
	yield current_movie
	current_movie = None
	try:
	identifier, title, year, episode = movie_regexp.match(line).groups()
	if int(year) < 1930 or int(year) > 2014:
	# Something went wrong here
	raise ValueError(identifier)
	current_movie = {"title": title,
	"year": 10*int(int(year)/10),
	'identifier': identifier,
	'episode': episode,
	"summary": []}
	except:
	skipped += 1
	if line.startswith("PL: ") and current_movie:
	# Add to the current movie's description
	current_movie['summary'].append(line.replace("PL: ",""))
	print "Skipped",skipped


	all_movies = list(load_all_movies("/home/michael/tmp/plot.list.gz"))

	len(all_movies)
	# => 379451