Last active
August 24, 2024 20:07
-
-
Save edgartanaka/a1bc5a0d7bb843f19b62669cd9bb3f8e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import urllib.request | |
from tqdm import tqdm | |
from multiprocessing.pool import ThreadPool | |
import os.path | |
#
# This gist downloads all TV series from TMDB. It can easily be modified to download all movies instead.
# It uses threads to parallelize the downloads and speed up the process.
# Requires Python 3. Tested on Anaconda.
# Steps:
# 1. Download http://files.tmdb.org/p/exports/tv_series_ids_07_05_2020.json.gz and uncompress it in the local directory.
# 2. Create an API key on the TMDB site.
# 3. Set your API key in the script.
# 4. Install the Python libraries: pandas, tqdm.
# 5. Run the script.
#
# TODO: add your api key here
api_key = ''

# The daily ID export has one JSON object per line, hence lines=True.
# Ref: https://developers.themoviedb.org/3/getting-started/daily-file-exports
df = pd.read_json('tv_series_ids_07_05_2020.json', lines=True)
ids = list(df['id'])

# One (local file name, request URL) pair per series id.
urls = [
    (
        f"series_{series_id}.json",
        f"https://api.themoviedb.org/3/tv/{series_id}?api_key={api_key}",
    )
    for series_id in ids
]
def log_failed(uri):
    """Record a URI whose download failed in ``series_failed.txt``.

    Each call appends one line to the file in the current working
    directory, so the file accumulates every failure of a run.

    Args:
        uri: The URL that could not be downloaded.
    """
    # Append rather than truncate: opening with 'w' would erase all earlier
    # failures and leave only the most recent one in the file.
    with open('series_failed.txt', 'a', encoding='utf-8') as writer:
        # One URI per line so the file can be read back entry by entry.
        writer.write(f"{uri}\n")
def is_file_exists(path):
    """Report whether ``path`` names an existing regular file.

    Args:
        path: File system path to check.

    Returns:
        The result of ``os.path.isfile(path)``.
    """
    found = os.path.isfile(path)
    return found
def fetch_url(entry):
    """Download one URL to a local file unless that file already exists.

    Args:
        entry: A ``(path, uri)`` pair: the destination file name and the
            URL to fetch.

    Returns:
        ``path`` when the file already existed or was downloaded
        successfully; ``None`` when the download failed, in which case the
        URI is recorded through ``log_failed``.
    """
    # Unpack before the ``try`` so ``uri`` is always bound when the failure
    # handler runs.
    path, uri = entry
    if is_file_exists(path):
        return path

    # Download under a temporary name and rename only on success. Otherwise
    # an interrupted transfer would leave a truncated file at ``path`` that
    # later runs would mistake for a completed download and skip.
    partial_path = f"{path}.part"
    try:
        urllib.request.urlretrieve(uri, partial_path)
        os.replace(partial_path, path)
    except Exception:
        # Best effort: one failed series must not abort the whole run.
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing was written, or the partial file is already gone.
            pass
        log_failed(uri)
        return None
    return path
# Download with 8 worker threads. The ``with`` block shuts the pool down once
# every result has been consumed, so worker threads are not left running.
with ThreadPool(8) as pool:
    results = pool.imap_unordered(fetch_url, urls)
    # ``imap_unordered`` returns an iterator with no length, so pass the
    # total explicitly to let tqdm show a percentage and an ETA.
    for path in tqdm(results, total=len(urls)):
        pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I have also published the downloaded datasets on Kaggle: https://www.kaggle.com/edgartanaka1/tmdb-movies-and-series