Last active
August 24, 2024 19:09
-
-
Save galli-leo/6398f9128ffc20af70c6c7eedfeb0a65 to your computer and use it in GitHub Desktop.
Dumps the JSON for every movie in the TMDB database into a directory. Usage: python3 tmdbdump.py [START_ID] (START_ID defaults to 1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import time | |
import json | |
import sys | |
import threading | |
import queue | |
from datetime import timedelta | |
# Personal TMDB API key, sent as the "api_key" query parameter on every request.
API_KEY = "YOUR_TMDB_APIKEY"
# Change this if you do not want all the data.
append_to_response = "images,alternative_titles,videos,credits,keywords,release_dates,similar_movies,recommendations"
# Root URL that every API path is appended to.
BASE = "https://api.themoviedb.org/3"
# Directory where json files will be stored. Must exist.
DIR = "TMDBDUMP"
class Worker(threading.Thread):
    """Thread that pulls TMDB ids from a shared queue and hands each to ``do_work``.

    The thread stops by itself once the queue has stayed empty for 3 seconds.
    """

    def __init__(self, q, *args, **kwargs):
        """Remember the work queue.

        Args:
            q: Queue-like object providing ``get(timeout=...)`` and ``task_done()``.
            *args: Forwarded unchanged to ``threading.Thread``.
            **kwargs: Forwarded unchanged to ``threading.Thread``.
        """
        self.q = q
        super().__init__(*args, **kwargs)

    def run(self):
        """Process queued items until the queue runs dry."""
        while True:
            try:
                work = self.q.get(timeout=3)  # 3s timeout
            except queue.Empty:
                return
            try:
                do_work(work)
            except Exception as exc:
                # One failed item must not kill the worker: report it and keep
                # draining the queue. The failed item is not retried.
                print("Failed to process TMDBID {0}: {1!r}".format(work, exc))
            finally:
                # Every get() needs a matching task_done(); without it q.join()
                # in the main thread would block forever after an error.
                self.q.task_done()
class TMDBNotfoundException(Exception):
    """Raised when the TMDB API answers a request with HTTP 404."""

    def __init__(self):
        # Takes no arguments: the exception carries no message or payload.
        super().__init__()
def get_from_tmdb(path, query, timeout_pause=5):
    """Send a GET request to the TMDB API and return the raw response body.

    Args:
        path: API path appended to ``BASE``, e.g. ``"/movie/latest"``.
        query: Mapping of query-string parameters. It is copied before the
            API key is added, so the caller's mapping is left untouched.
        timeout_pause: Seconds to sleep when throttling is applied.

    Returns:
        The response body as text (``r.text``).

    Raises:
        TMDBNotfoundException: If the API responds with HTTP 404.
    """
    # Work on a copy so the API key is never written into the caller's dict.
    params = dict(query)
    params["api_key"] = API_KEY
    r = requests.get(BASE + path, params=params, allow_redirects=True)
    if r.status_code == 404:
        raise TMDBNotfoundException()
    if "X-RateLimit-Remaining" in r.headers:
        # Only back off when the remaining request budget is nearly used up.
        if int(r.headers["X-RateLimit-Remaining"]) <= 2:
            print("Request limit almost reached. Sleeping.")
            time.sleep(timeout_pause)
    else:
        # No rate-limit header in the response: always pause.
        # NOTE(review): this else is read as pairing with the header check,
        # not with the "<= 2" test -- confirm against the intended behavior.
        time.sleep(timeout_pause)
    return r.text
def get_movie(tmdbid):
    """Return the raw response text for movie ``tmdbid``.

    The request asks for the sub-resources listed in ``append_to_response``
    and restricts language to ``en-US`` and image language to ``en``.
    """
    params = {
        "append_to_response": append_to_response,
        "language": "en-US",
        "include_image_language": "en",
    }
    endpoint = "/movie/{0}".format(tmdbid)
    return get_from_tmdb(endpoint, params)
def get_latest_id():
    """Return the ``"id"`` field of the ``/movie/latest`` response.

    The request is repeated, with a one-second pause between attempts, until
    the decoded response contains an ``"id"`` key.

    Returns:
        The value stored under ``"id"`` in the decoded JSON response.
    """
    # Retry in a loop rather than recursively: a long streak of responses
    # without an "id" must not end in RecursionError.
    while True:
        jres = json.loads(get_from_tmdb("/movie/latest", {}))
        if "id" in jres:
            return jres["id"]
        time.sleep(1)


# Highest movie id known so far. Fetched once when the module is loaded and
# refreshed afterwards by do_update_latest_id().
LATEST_ID = get_latest_id()
def do_update_latest_id():
    """Keep ``LATEST_ID`` current while the work queue still holds items.

    Every 200 seconds the latest id is fetched again. When it differs from
    ``LATEST_ID``, the ids after the old value up to and including the new one
    are queued and ``LATEST_ID`` is replaced. The loop ends once the queue is
    found empty.
    """
    global LATEST_ID
    while not q.empty():
        newest = get_latest_id()
        if newest != LATEST_ID:
            print("Updating latest id from {0} to {1}".format(LATEST_ID, newest))
            first_new = LATEST_ID + 1
            for new_id in range(first_new, newest + 1):
                q.put_nowait(new_id)
            LATEST_ID = newest
        # Wait between polls regardless of whether anything changed.
        time.sleep(200)
def do_work(tmdbid):
    """Download one movie and store its JSON as ``<DIR>/<tmdbid>.json``.

    A response that lacks an ``"id"`` key is retried after a one-second pause.
    A 404 from the API is reported and the id is skipped. After a successful
    write, progress and an estimated remaining time are printed.

    Args:
        tmdbid: Integer TMDB movie id to download.
    """
    try:
        res = get_movie(tmdbid)
        jres = json.loads(res)
        # Retry in a loop instead of recursing so a long run of bad responses
        # cannot exhaust the call stack.
        while "id" not in jres:
            time.sleep(1)
            res = get_movie(tmdbid)
            jres = json.loads(res)
    except TMDBNotfoundException:
        print("No movie with TMDBID {0} found".format(tmdbid))
        return

    # "w" rather than "a": re-running the dump must replace the file, not
    # append a second JSON document to it. The explicit UTF-8 encoding avoids
    # UnicodeEncodeError on platforms whose default encoding is not UTF-8.
    with open("{0}/{1}.json".format(DIR, tmdbid), "w", encoding="utf-8") as f:
        f.write(res)

    # Average seconds per id since the run started, projected onto the ids
    # that are still ahead of this one.
    elapsed = time.time() - start
    position = tmdbid - fromId + 1
    seconds_per_id = float(elapsed) / float(position)
    eta = (LATEST_ID - tmdbid) * seconds_per_id
    print("Downloaded data for movie {0} ({1}).".format(jres["original_title"], tmdbid))
    print("\t {0}/{1} (Total: {2}/{3}), {4} left.".format(position, LATEST_ID - fromId + 1, tmdbid, LATEST_ID, str(timedelta(seconds=eta))))
def download_all_json(num_workers=10):
    """Queue every id from ``fromId`` to ``LATEST_ID`` and download them in parallel.

    Sets the module globals ``start`` (time the run began) and ``q`` (the work
    queue), starts the worker threads plus one thread running
    ``do_update_latest_id``, and blocks until the queue has been processed.

    Args:
        num_workers: Number of ``Worker`` threads to start.

    Raises:
        BaseException: Whatever interrupts the wait (including
            ``KeyboardInterrupt``) is re-raised after the queue is emptied.
    """
    global start, q
    start = time.time()
    q = queue.Queue()
    for tmdbid in range(fromId, LATEST_ID + 1):
        q.put_nowait(tmdbid)
    for _ in range(num_workers):
        Worker(q).start()
    try:
        threading.Thread(target=do_update_latest_id).start()
        q.join()
    except BaseException:
        # BaseException, not Exception, so that Ctrl-C also empties the queue;
        # the workers then run dry and stop instead of continuing the dump.
        with q.mutex:
            q.queue.clear()
        raise
if __name__ == "__main__":
    # The optional first command-line argument is the id to start from;
    # without it the dump starts at id 1.
    fromId = int(sys.argv[1]) if len(sys.argv) >= 2 else 1
    download_all_json()
Hey, love the script. I was wondering why there is no failsafe for when a movie has already been downloaded. I think something like this would be useful for those times when the script is run multiple times in the same folder:
```python
    global LATEST_ID, start, fromId
    if (os.path.isfile("TMDBDUMP/{0}.json".format(tmdbid))):
        print("Skipping TMDBID {0}: movie already present".format(tmdbid))
        return
```
Doing a latest id update every 3 minutes is wasteful.
This script doesn't save series info - only movies.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi -
Using the script on Windows 10, I was getting a Unicode error (UnicodeEncodeError: 'charmap' codec can't encode characters in position xxx). I fixed it by modifying line 89 to:
with open("TMDBDUMP/{0}.json".format(tmdbid), "a", encoding="utf-8") as f: