Skip to content

Instantly share code, notes, and snippets.

@SohierDane
Created September 27, 2017 21:51
Show Gist options
  • Save SohierDane/4a84cb96d220fc4791f52562be37968b to your computer and use it in GitHub Desktop.
Save SohierDane/4a84cb96d220fc4791f52562be37968b to your computer and use it in GitHub Desktop.
"""
Pull movie metadata from the https://www.themoviedb.org API.
Requires an API key stored in a .config file: a JSON object with an "api_key" entry.
The code is currently restricted to the movie category. To get it to run with
other categories, update the constants
(CATEGORY_SPECIFIC_CALLS, JSON_COLUMNS, KEYS_TO_DROP)
and delete the movie specific section of the export_data() function.
Please see attribution requirements here before posting data:
https://www.themoviedb.org/faq/api
API documentation:
https://developers.themoviedb.org/3/getting-started
https://www.themoviedb.org/documentation/api
"""
import gzip
import json
import os
import pandas as pd
import requests
from io import BytesIO
from time import sleep
# URL template for a single-entry detail request; the placeholders are filled
# in by make_detail_request().
BASE_API_CALL = (
    'https://api.themoviedb.org/3/{category}/{entry_id}'
    '?api_key={api_key}{category_specifics}'
)

# API categories whose entries get downloaded by download_all_data().
CATEGORIES = ['movie']

# Successful downloads are buffered in memory and flushed to disk in batches
# of this size.
DOWNLOADS_PER_DISK_WRITE = 40

# Request pacing: make_request() sleeps 1 / MAX_DOWNLOADS_PER_SECOND seconds
# after each call.
MAX_DOWNLOADS_PER_SECOND = 4

# make_request() gives up on a URL after this many attempts.
MAX_ATTEMPTS = 3

# Extra pause applied when the API answers with the rate-limit status code.
RATE_LIMITER_DELAY_SECONDS = 10

# HTTP status codes the script distinguishes.
RATE_LIMIT_EXCEEDED_STATUS_CODE = 429
SUCCESSFUL_CALL_STATUS_CODE = 200

# Per-category query-string suffix appended to the detail request URL.
CATEGORY_SPECIFIC_CALLS = {
    'movie': '&append_to_response=credits,keywords',
}

# Columns that are serialised with json.dumps before being written to CSV.
JSON_COLUMNS = {
    'genres',
    'keywords',
    'production_countries',
    'production_companies',
    'spoken_languages',
}

# Response fields that are removed before the data is exported.
KEYS_TO_DROP = {
    'adult',
    'backdrop_path',
    'belongs_to_collection',
    'imdb_id',
    'poster_path',
    'profile_path',
    'video',
}
def was_successful(response):
    """Return True when *response* has the status code of a successful call."""
    status = response.status_code
    return status == SUCCESSFUL_CALL_STATUS_CODE
def was_rate_limited(response):
    """Return True when *response* has the rate-limit-exceeded status code."""
    status = response.status_code
    return status == RATE_LIMIT_EXCEEDED_STATUS_CODE
def make_request(call_url, prior_attempts=0, timeout=30):
    """Fetch *call_url* and return the decoded JSON body, or None on failure.

    The URL is tried until it succeeds or MAX_ATTEMPTS attempts (including
    *prior_attempts*) have been used up. Connection errors, timeouts,
    unsuccessful status codes and bodies that are not valid JSON all count as
    a failed attempt.

    Args:
        call_url: Fully formatted URL to request.
        prior_attempts: Number of attempts already spent on this URL.
        timeout: Seconds to wait for the server before abandoning an attempt.

    Returns:
        The parsed JSON response, or None once the attempts are exhausted.
    """
    for _ in range(prior_attempts, MAX_ATTEMPTS):
        try:
            response = requests.get(call_url, timeout=timeout)
        except requests.RequestException:
            # A dropped connection or timeout must not abort a long-running
            # download; treat it like any other failed attempt.
            sleep(1)
            continue
        if was_rate_limited(response):
            sleep(RATE_LIMITER_DELAY_SECONDS)
        # Pace every call so the request rate stays within the budget.
        sleep(1 / MAX_DOWNLOADS_PER_SECOND)
        if was_successful(response):
            try:
                return response.json()
            except ValueError:
                # Body was not valid JSON: fall through and retry.
                pass
        sleep(1)  # attempt to sleep through any intermittent issues
    return None
def make_detail_request(category, entry_id):
    """Request the details of one entry and return the result of make_request().

    The URL is built from BASE_API_CALL using the module-level API_KEY plus
    any category-specific query suffix from CATEGORY_SPECIFIC_CALLS (empty
    when the category has none).
    """
    extra_query = CATEGORY_SPECIFIC_CALLS.get(category, '')
    url_parts = {
        'category': category,
        'entry_id': entry_id,
        'api_key': API_KEY,
        'category_specifics': extra_query,
    }
    return make_request(BASE_API_CALL.format(**url_parts))
def load_api_key(config_path='.config'):
    """Return the API key stored in the JSON config file.

    Args:
        config_path: Path of a JSON file holding an object with an
            ``api_key`` entry. Defaults to ``.config`` in the working
            directory.

    Returns:
        The value stored under ``api_key``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the file has no ``api_key`` entry.
    """
    # The context manager closes the handle; the bare open() used to leak it.
    with open(config_path, encoding='utf-8') as config_file:
        return json.load(config_file)['api_key']
def make_category_id_url_suffix(category, extension='json', today=None):
    """Build the file name of yesterday's daily id export for *category*.

    The name has the form ``<category>_ids_MM_DD_YYYY.<extension>`` and uses
    the day before *today*.

    Args:
        category: API category, e.g. ``'movie'``.
        extension: File extension without the leading dot.
        today: Reference date (``datetime.date`` or ``datetime.datetime``);
            defaults to the current local date.

    Returns:
        The export file name as a string.
    """
    # Local import keeps this function self-contained; pd.datetime, which was
    # used here before, no longer exists in current pandas releases.
    from datetime import date, timedelta

    if today is None:
        today = date.today()
    # Subtract a real day instead of decrementing the day number, which
    # produced day "00" on the first of every month.
    export_day = today - timedelta(days=1)
    return '_'.join([category, 'ids', export_day.strftime('%m_%d_%Y')]) + '.' + extension
def download_id_list_as_csv(category):
    """Download the daily id export for *category* and save it as a CSV.

    The gzipped export is fetched, parsed line by line, filtered, and written
    to ``<category>_ids.csv`` in the working directory.

    Raises:
        requests.HTTPError: If the export file could not be downloaded.
    """
    # see daily file export list docs at:
    # https://developers.themoviedb.org/3/getting-started/daily-file-exports
    print(f'Downloading list of ids for {category}')
    id_list_name = make_category_id_url_suffix(category)
    id_list_url = 'http://files.tmdb.org/p/exports/{0}.gz'.format(id_list_name)
    response = requests.get(id_list_url)
    # Fail loudly here rather than trying to gunzip an error page.
    response.raise_for_status()
    # Text mode decodes the stream once; calling .decode() on the list that
    # readlines() returns raised AttributeError.
    with gzip.open(BytesIO(response.content), 'rt', encoding='utf-8') as f_open:
        # original 'json' is malformed, is actually one dict per line
        records = [json.loads(line) for line in f_open if line.strip()]
    ids = pd.DataFrame(records)
    # some entries in the movie id list appear to be collections rather than movies
    if 'original_title' in ids.columns:
        ids.original_title = ids.original_title.apply(str)
        ids = ids[~ids.original_title.str.endswith(' Collection')].copy()
    # You have to drop adult films if you want to post any new data to Kaggle.
    if 'adult' in ids.columns:
        ids = ids[~ids['adult']].copy()
    ids.to_csv(category + '_ids.csv', index=False)
def load_id_list(category):
    """Return the ids stored in ``<category>_ids.csv`` as a list.

    The CSV is downloaded first when it does not exist yet.
    """
    csv_path = category + '_ids.csv'
    already_cached = os.path.exists(csv_path)
    if not already_cached:
        download_id_list_as_csv(category)
    id_table = pd.read_csv(csv_path)
    return id_table['id'].values.tolist()
def unpack_credits(df):
    """Split the ``credits`` column of *df* off into its own DataFrame.

    Credits are downloaded together with the movie details to cut down on the
    total number of requests, but are stored separately.

    Args:
        df: DataFrame with ``credits``, ``id`` and ``title`` columns, where
            each ``credits`` value is a mapping that may hold ``cast`` and
            ``crew`` lists of dicts.

    Returns:
        A ``(df, credits)`` tuple. *df* is the same object, modified in place
        to no longer contain ``credits``. *credits* has the columns
        ``movie_id``, ``title``, ``cast`` and ``crew``; the last two are JSON
        strings with every ``profile_path`` key removed.
    """
    def _serialise(raw_credits, role):
        # A missing role is exported as an empty list.
        people = raw_credits[role] if role in raw_credits else []
        trimmed = [
            {key: value for key, value in person.items() if key != 'profile_path'}
            for person in people
        ]
        return json.dumps(trimmed)

    credits = pd.DataFrame(df[['credits', 'id', 'title']])
    credits.rename(columns={'id': 'movie_id'}, inplace=True)
    for role in ('cast', 'crew'):
        # Bind role as a default so each lambda keeps its own value.
        credits[role] = credits['credits'].apply(
            lambda raw, role=role: _serialise(raw, role))
    del credits['credits']
    del df['credits']
    return df, credits
def _normalise_entry_id(value):
    """Return *value* as an int id, or None if it is not a whole, non-negative number."""
    if isinstance(value, float) and value.is_integer():
        # pandas stores the ids as floats (550.0) as soon as one entry of the
        # batch has no id, so undo that before the digit check.
        value = int(value)
    text = str(value)
    return int(text) if text.isdecimal() else None


def _append_to_csv(frame, path):
    """Append *frame* to *path*, writing the header only when the file is new."""
    frame.to_csv(path, index=False, mode='a+', header=not os.path.exists(path))


def export_data(category, all_entries):
    """Append a batch of downloaded entries to the category's CSV files.

    Entries go to ``<category>_data.csv`` and their credits to
    ``<category>_credits.csv``. Keys listed in KEYS_TO_DROP are removed,
    entries without a usable numeric id are discarded, and the columns in
    JSON_COLUMNS are stored as JSON strings.

    Args:
        category: API category the entries belong to.
        all_entries: List of response dicts; an empty list is a no-op.

    Returns:
        None.
    """
    if not all_entries:
        return None
    df = pd.DataFrame(all_entries)
    df = df[[x for x in df.columns if x not in KEYS_TO_DROP]].copy()
    missing_ids = int(df['id'].isnull().sum())
    if missing_ids > 0:
        print(f'Dropping {missing_ids} entries without ids')
    # Normalise before filtering: a single missing id used to turn the column
    # into floats, and str(550.0).isnumeric() then rejected every valid row.
    entry_ids = df['id'].apply(_normalise_entry_id)
    keep = entry_ids.notnull()
    # Copy so the assignments below never write into a slice of another frame.
    df = df[keep].copy()
    if df.empty:
        return None
    df['id'] = entry_ids[keep].astype('int64')
    # this section about credits is specific to the movie category
    df, credits = unpack_credits(df)
    df['keywords'] = df['keywords'].apply(
        lambda x: x['keywords'] if isinstance(x, dict) and 'keywords' in x else [])
    for column in JSON_COLUMNS:
        df[column] = df[column].apply(json.dumps)
    # Each file decides on its own whether it still needs a header row.
    # NOTE(review): appended batches are assumed to share one column order.
    _append_to_csv(df, category + '_data.csv')
    _append_to_csv(credits, category + '_credits.csv')
def download_ids(category, id_list):
    """Download the details of every id in *id_list* and export them in batches.

    Ids already present in ``<category>_data.csv`` are skipped, so an
    interrupted run can be resumed. Successful downloads are flushed to disk
    every DOWNLOADS_PER_DISK_WRITE entries and once more at the end.

    Args:
        category: API category to download from.
        id_list: Iterable of entry ids.
    """
    data_path = category + '_data.csv'
    if os.path.exists(data_path):
        existing = pd.read_csv(data_path, usecols=['id'], dtype=object)
        # Keep the set: it used to be thrown away, and the membership test ran
        # against the DataFrame (its column labels), so nothing was skipped.
        existing_ids = set(existing['id'].dropna())
        id_list = [x for x in id_list if str(x) not in existing_ids]
    counter = 0
    all_entries = []
    print(f'Downloading details for {category}')
    for movie_id in id_list:
        current_data = make_detail_request(category, movie_id)
        if not current_data:
            print(f'Failed on id # {movie_id}')
            continue
        counter += 1
        all_entries.append(current_data)
        if counter % DOWNLOADS_PER_DISK_WRITE == 0:
            print(f'Finished downloading {counter} entries for {category}')
            export_data(category, all_entries)
            all_entries = []
    # Flush whatever is left over from the last partial batch.
    export_data(category, all_entries)
def download_all_data():
    """Download the details of every listed id for each category in CATEGORIES."""
    for category in CATEGORIES:
        ids_to_fetch = load_id_list(category)
        download_ids(category, ids_to_fetch)
# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # API_KEY becomes a module-level global here; make_detail_request() reads it.
    API_KEY = load_api_key()
    download_all_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment