The script connects to several IMDB pages to gather the whole list of episodes as JSON; each entry includes the cast as returned by IMDB. A great dataset for starting a mongodb lab for my students.
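Each line of the resulting friends-full.json file is one self-contained episode document. The sketch below shows the field layout the script builds; the values are illustrative placeholders, not real scraped data:

{
    "position": {"season": 1, "episode": 1},
    "number": 1,
    "airdate": "22 Sep. 1994",
    "title": "...",
    "description": "...",
    "cast": [{"actor": "...", "character": "..."}],
    "ratingValue": 8.3,
    "ratingCount": 1234,
    "directors": ["..."],
    "writers": ["..."]
}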
# This script connects to several IMDB pages to gather the whole list of episodes as JSON; each entry includes the
# cast as returned by IMDB. Great dataset for beginning a mongodb lab for my students.

from bs4 import BeautifulSoup
import json
import locale
import logging
import re
import requests

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# IMDB renders numbers with US formatting (e.g. a rating count of "1,234"),
# which locale.atof/locale.atoi below rely on to parse.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Output file: one JSON document per line (JSON Lines).
file = open("friends-full.json", 'w')

episode_counter = 0

def seasons_url_generator():
    # tt0108778 is the IMDB id of the "Friends" series; it ran for 10 seasons.
    for season in range(1, 11):
        yield ("http://www.imdb.com/title/tt0108778/episodes?season=" + str(season), season)

def seasons_extract(season, entry):
    global episode_counter
    episode_counter += 1
    data = {
        "position": {
            "season": season,
            "episode": int(entry.meta["content"])
        },
        "number": episode_counter,
        "airdate": entry.find("div", {"class": "airdate"}).contents[0].strip(),
        "title": entry.strong.a["title"],
        "description": entry.find("div", {"itemprop": "description"}).contents[0].strip(),
        "cast": []
    }
    # Drop the query string to get the canonical episode URL.
    url = "http://www.imdb.com" + entry.strong.a["href"]
    url = url.split('?')[0]
    episodes_extract(url, data)
    json.dump(data, file)
    file.write("\n")  # newline-delimit documents so the file is valid JSON Lines

def episodes_extract(url, episode_data):
    logger.info('Get URL for episode ' + str(episode_data['position']['episode']) +
                " of season " + str(episode_data['position']['season']))
    logger.debug('URL: ' + url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    episode_data["ratingValue"] = locale.atof(soup.find("span", {"itemprop": "ratingValue"}).contents[0].strip())
    episode_data["ratingCount"] = locale.atoi(soup.find("span", {"itemprop": "ratingCount"}).contents[0].strip())
    fullcast_extract(url + "fullcredits", episode_data)

def fullcast_extract(url, episode_data):
    logger.info('Get URL for fullcredits of episode ' + str(episode_data['position']['episode']) +
                " of season " + str(episode_data['position']['season']))
    logger.debug('URL: ' + url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    # The fullcredits page alternates <h4> section headings with <table> blocks,
    # so credits[0]/credits[2] are headings and credits[1]/credits[3] their tables.
    credits = soup.find("div", {"id": "fullcredits_content"}).findAll(["table", "h4"])
    # director(s)
    assert "Directed by" in credits[0].contents[0]
    episode_data["directors"] = []
    for directors in credits[1].tbody.findAll("tr"):
        director = directors.find("td", {"class": "name"})
        if director is None:
            continue
        elif director.find('a') is None:
            episode_data["directors"].append(director.contents[0].strip())
        else:
            episode_data["directors"].append(director.find('a').contents[0].strip())
    assert len(episode_data["directors"]) > 0
    # writers
    assert "Writing Credits" in credits[2].contents[0]
    episode_data["writers"] = []
    for writers in credits[3].tbody.findAll("tr"):
        writer = writers.find("td", {"class": "name"})
        if writer is None:
            continue
        elif writer.find('a') is None:
            episode_data["writers"].append(writer.contents[0].strip())
        else:
            episode_data["writers"].append(writer.find('a').contents[0].strip())
    # A writer can be credited several times (story, teleplay, ...), so deduplicate.
    episode_data["writers"] = sorted(set(episode_data["writers"]))
    assert len(episode_data["writers"]) > 0
    # cast
    episode_data["cast"] = []
    actor_entries = soup.find("table", {"class": "cast_list"}).findAll("tr", {"class": ["odd", "even"]})
    for entry in actor_entries:
        actor_extract(entry, episode_data)
    assert len(episode_data["cast"]) > 0

def actor_extract(entry, episode_data):
    data = {
        "actor": entry.find("td", {"itemprop": "actor"}).text.strip(),
        # Collapse the newlines and extra spaces IMDB puts around character names.
        "character": re.sub(r"\s+", " ", entry.find("td", {"class": "character"}).text).strip()
    }
    episode_data["cast"].append(data)

for (url, season) in seasons_url_generator():
    logger.info('Get URL for season ' + str(season))
    logger.debug('URL: ' + url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    episode_entries = soup.findAll("div", {"class": "info", "itemprop": "episodes"})
    for entry in episode_entries:
        seasons_extract(season, entry)

file.close()
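To load the result into MongoDB for the lab, mongoimport handles the JSON Lines file directly (mongoimport --db friends --collection episodes --file friends-full.json). Below is a minimal pymongo sketch doing the same; it assumes a local mongod and uses the hypothetical database/collection names friends/episodes:

import json
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
episodes = client["friends"]["episodes"]  # hypothetical names, pick your own

# Read one JSON document per line and bulk-insert them.
with open("friends-full.json") as f:
    episodes.insert_many(json.loads(line) for line in f if line.strip())
print(episodes.count_documents({}), "episodes loaded")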