Created
July 15, 2024 18:41
-
-
Save Verpous/e254e8c56aadd400d50f861906458292 to your computer and use it in GitHub Desktop.
Download IMDb lists as CSV automatically
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! python | |
| # USAGE: run mcsv.py <list-id> <downloads-dir> and it will download the list CSV using Firefox, and print the name of the downloaded file. | |
| import sys | |
| import argparse | |
| import os | |
| import time | |
| import glob | |
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| class Timeout: | |
| def __init__(self, timeout_secs=float('inf'), operation='N/A'): | |
| self._timeout_secs = timeout_secs | |
| self._operation = operation | |
| self._enter_time = float('nan') | |
| def tick(self): | |
| if time.time() - self._enter_time > self._timeout_secs: | |
| raise TimeoutError(f"Operation: {self._operation} timed out after: {self._timeout_secs} seconds.") | |
| def __enter__(self): | |
| self._enter_time = time.time() | |
| return self | |
| def __exit__(self, exc_type, exc_value, traceback) -> None: | |
| self._enter_time = float('nan') | |
| def get_latest_in_downloads(): | |
| files_in_dowloads = glob.glob(os.path.join(downloads_dir, '*.csv')) | |
| latest_file = max(files_in_dowloads, key=os.path.getctime, default=None) | |
| return latest_file | |
| def do_with_retries(action, num_retries=10, sleep_between_retries=1): | |
| for i in range(num_retries): | |
| try: | |
| return action() | |
| except: | |
| if i == num_retries - 1: | |
| raise | |
| time.sleep(sleep_between_retries) | |
| list_id = sys.argv[1] | |
| downloads_dir = sys.argv[2] | |
| with webdriver.Firefox() as driver: | |
| driver.get(f'https://www.imdb.com/list/ls{list_id}') | |
| # Annoying popup that asks you to sign in hides the export button. | |
| close_popup_button = do_with_retries( | |
| lambda: driver.find_element(By.XPATH, "//button[@aria-label='Close']")) | |
| do_with_retries(close_popup_button.click) | |
| # Begin exporting. | |
| export_button = do_with_retries( | |
| lambda: driver.find_element(By.XPATH, "//button[@aria-label='Export']")) | |
| do_with_retries(export_button.click) | |
| # Go to exports page once the popup tells us. | |
| exports_page_link = do_with_retries( | |
| lambda: driver.find_element(By.XPATH, "//a[@aria-label='Open exports page']")) | |
| do_with_retries(exports_page_link.click) | |
| # Hit the download button once the list is ready. | |
| def get_download_button(): | |
| driver.refresh() | |
| return driver.find_element(By.XPATH, "//button[contains(@aria-label, 'Start download for')]") | |
| download_button = do_with_retries(get_download_button) | |
| # Wait for the file to be downloaded before calling it a wrap. | |
| with Timeout(20, 'download CSV') as timeout: | |
| latest_file_before = get_latest_in_downloads() | |
| do_with_retries(download_button.click) | |
| # We hit a button in the browser that should download a CSV. Now we'll monitor the downloads directory until the latest CSV there is different than what it was before. | |
| # When that happens, we will have the CSV that was downloaded. | |
| while (latest_file := get_latest_in_downloads()) == latest_file_before or latest_file is None: | |
| timeout.tick() | |
| # When the file is created it's sometimes empty for a bit. At some point it jumps to being fully written, without any inbetween. | |
| # So this waits for the file size to not be zero. | |
| while True: | |
| # This sleep is meant to alleviate a problem that is way too much to explain so just see my question about it on StackOverflow: | |
| # https://stackoverflow.com/questions/78300917/checking-the-size-of-a-file-thats-being-downloaded-by-the-browser-causes-it-to | |
| time.sleep(1) | |
| if os.path.getsize(latest_file) != 0: | |
| break | |
| try: | |
| timeout.tick() | |
| except TimeoutError: | |
| # For the same reason as the sleep above, we do a trick: we'll check if a different file with a similar name has appeared, and return it. | |
| latest_file_last_minute = get_latest_in_downloads() | |
| if (latest_file_last_minute is not None | |
| and latest_file_last_minute != latest_file | |
| and latest_file.removesuffix('.csv') in latest_file_last_minute | |
| and os.path.getsize(latest_file_last_minute) != 0): | |
| os.remove(latest_file) | |
| latest_file = latest_file_last_minute | |
| break | |
| raise | |
| print(os.path.basename(latest_file)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment