Last active
July 8, 2022 02:08
-
-
Save KokoseiJ/e7ce7b636183eb3c2ec27e4b44afd2f1 to your computer and use it in GitHub Desktop.
Batch download files from archive.org collections
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import subprocess
import sys
import threading
from subprocess import DEVNULL
from urllib.parse import unquote, urljoin

import requests
def download_file(url, semaphore):
    """Download *url* with ``wget`` and then release *semaphore*.

    Intended to run as a thread target: the caller acquires one slot of
    *semaphore* before starting the thread, and this function gives the
    slot back when the download attempt is over.

    Args:
        url: Absolute URL of the file to fetch; passed to ``wget`` as-is.
        semaphore: Semaphore-like object whose ``release()`` is called
            exactly once, whether or not the download succeeded.
    """
    try:
        # wget's stderr (progress output) is discarded; stdout is inherited.
        result = subprocess.run(["wget", url], stderr=DEVNULL)
    except OSError as exc:
        # wget missing or not executable: report it instead of killing the
        # thread with a traceback.
        print(f"[!] Could not run wget for <{unquote(url)}>: {exc}")
    else:
        if result.returncode == 0:
            print(f"[*] *** Finished downloading <{unquote(url)}>! ***")
        else:
            print(
                f"[!] wget exited with status {result.returncode} "
                f"for <{unquote(url)}>"
            )
    finally:
        # Always hand the slot back, otherwise the dispatching loop would
        # block forever once every slot has been leaked.
        semaphore.release()
def find_links(html, ext):
    """Return the ``href`` targets in *html* that end with ``.<ext>``.

    Args:
        html: Text of the page to scan.
        ext: File extension without the leading dot. It is matched
            literally, so regex metacharacters in it have no special meaning.

    Returns:
        The matching ``href`` values in order of first appearance, with
        duplicates removed.
    """
    # [^"]* keeps the match inside a single attribute value; ".*?" could run
    # across several href attributes that share a line.
    pattern = re.compile(rf'href="([^"]*\.{re.escape(ext)})"')
    # dict.fromkeys drops repeated links while preserving order.
    return list(dict.fromkeys(pattern.findall(html)))


def main(argv=None):
    """Download every file with a given extension linked from a page.

    Command line: ``url [extension] [threads]``. ``extension`` defaults to
    ``zip`` and ``threads`` (maximum number of simultaneous downloads)
    defaults to 5.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    script = argv[0] if argv else ""
    usage = f"Usage: {sys.executable} {script} url [extension] [threads]"

    if len(argv) < 2:
        print(usage)
        sys.exit(1)

    url = argv[1]
    ext = argv[2] if len(argv) > 2 else "zip"

    try:
        threads = int(argv[3]) if len(argv) > 3 else 5
    except ValueError:
        threads = 0
    if threads < 1:
        # Zero slots would block forever on the first acquire() below.
        print("[!] threads must be a positive integer")
        print(usage)
        sys.exit(1)

    # Strip trailing slashes so the last path segment is never empty.
    print(f"[*] Downloading {url.rstrip('/').rsplit('/', 1)[-1]}...\n")

    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f"[!] Could not fetch <{url}>: {exc}")
        sys.exit(1)

    zips = find_links(r.text, ext)

    print(f"[*] Searching extension: <{ext}>")
    print(f"[*] Amount of files to download: {len(zips)}")
    print("[*] Starting download now.\n")
    print("================================================\n")

    # A trailing slash makes urljoin treat the page as a directory, so
    # relative links resolve underneath it.
    baseurl = url if url.endswith("/") else f"{url}/"
    semaphore = threading.BoundedSemaphore(threads)

    workers = []
    for filename in zips:
        # Blocks while `threads` downloads are already running; the worker
        # releases the slot when it finishes.
        semaphore.acquire()
        print(f"[*] Downloading <{unquote(filename)}>...")
        file_url = urljoin(baseurl, filename)
        worker = threading.Thread(
            target=download_file, args=(file_url, semaphore)
        )
        worker.start()
        workers.append(worker)

    # Wait explicitly for the downloads that are still in flight.
    for worker in workers:
        worker.join()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment