KokoseiJ · July 8, 2022 02:08
diff --git a/archiveorg_batchdl.py b/archiveorg_batchdl.py
 import re
 import sys
 import requests
 import threading
 import subprocess
 from subprocess import DEVNULL
 from urllib.parse import unquote, urljoin


 def download_file(url, semaphore):
    """Download ``url`` with ``wget`` and free one slot of ``semaphore``.

    Intended to run in a worker thread. The caller acquires ``semaphore``
    before starting the thread; this function always releases it again,
    even when ``wget`` cannot be started, so the dispatching loop never
    stalls waiting for a slot that will not come back.

    Args:
        url: Address of the file to fetch; passed to ``wget`` unchanged.
        semaphore: Semaphore already acquired on behalf of this download.
    """
    try:
        # wget writes its progress output to stderr; keep the console clean.
        result = subprocess.run(["wget", url], stderr=DEVNULL)
    except OSError as exc:
        # Raised when the wget executable is missing or cannot be executed.
        print(f"[!] Could not run wget for <{unquote(url)}>: {exc}")
    else:
        if result.returncode == 0:
            print(f"[*] *** Finished downloading <{unquote(url)}>! ***")
        else:
            print(
                f"[!] wget exited with status {result.returncode} "
                f"for <{unquote(url)}>"
            )
    finally:
        semaphore.release()


 # --- Command line: url [extension] [threads] ---
 if len(sys.argv) < 2:
    print(f"Usage: {sys.executable} {sys.argv[0]} url [extension] [threads]")
    sys.exit(1)

 # Address of the page whose links are scanned.
 url = sys.argv[1]

 # File extension to look for, without the dot. A leading dot is tolerated
 # so that ".zip" and "zip" select the same files.
 ext = sys.argv[2].lstrip(".") if len(sys.argv) > 2 else "zip"

 # Maximum number of simultaneous downloads.
 if len(sys.argv) > 3:
    try:
        threads = int(sys.argv[3])
    except ValueError:
        print(f"[!] threads must be an integer, got {sys.argv[3]!r}")
        sys.exit(1)
    # A semaphore created with 0 slots would block the first download
    # forever, and a negative value is rejected by the semaphore itself.
    if threads < 1:
        print(f"[!] threads must be at least 1, got {threads}")
        sys.exit(1)
 else:
    threads = 5

 print(f"[*] Downloading {url.rsplit('/', 1)[-1]}...\n")

 # Fetch the listing page. Stop with a readable message instead of a
 # traceback (or scraping an HTTP error page) when the request fails.
 try:
    r = requests.get(url)
    r.raise_for_status()
 except requests.RequestException as exc:
    print(f"[!] Could not fetch <{url}>: {exc}")
    sys.exit(1)

 # [^"]* keeps a match inside a single href="..." attribute; the previous
 # ".*?" could start at an earlier link on the same line and swallow the
 # markup in between. re.escape stops regex metacharacters in the
 # extension from changing the pattern.
 link_pattern = rf'href="([^"]*\.{re.escape(ext)})"'
 # dict.fromkeys drops repeated links while keeping their page order, so the
 # same file is not downloaded twice.
 zips = list(dict.fromkeys(re.findall(link_pattern, r.text)))
 # zips = sorted(zips, key=lambda x: "Korea" not in x)

 print(f"[*] Searching extension: <{ext}>")
 print(f"[*] Amount of files to download: {len(zips)}")
 print("[*] Starting download now.\n")
 print("================================================\n")

 # urljoin replaces the last path segment unless the base ends with "/".
 baseurl = f"{url}/" if not url.endswith("/") else url

 # Each running download holds one slot; acquire() below blocks while all
 # slots are taken, and download_file() hands the slot back when it is done.
 semaphore = threading.BoundedSemaphore(threads)

 for filename in zips:
    semaphore.acquire()
    print(f"[*] Downloading <{unquote(filename)}>...")
    # Use a separate name so the page address in `url` is not overwritten.
    file_url = urljoin(baseurl, filename)

    thread = threading.Thread(target=download_file, args=(file_url, semaphore))
    thread.start()
	import re
	import sys
	import requests
	import threading
	import subprocess
	from subprocess import DEVNULL
	from urllib.parse import unquote, urljoin


	def download_file(url, semaphore):
	    """Download ``url`` with ``wget`` and free one slot of ``semaphore``.

	    Intended to run in a worker thread. The caller acquires ``semaphore``
	    before starting the thread; this function always releases it again,
	    even when ``wget`` cannot be started, so the dispatching loop never
	    stalls waiting for a slot that will not come back.

	    Args:
	        url: Address of the file to fetch; passed to ``wget`` unchanged.
	        semaphore: Semaphore already acquired on behalf of this download.
	    """
	    try:
	        # wget writes its progress output to stderr; keep the console clean.
	        result = subprocess.run(["wget", url], stderr=DEVNULL)
	    except OSError as exc:
	        # Raised when the wget executable is missing or cannot be executed.
	        print(f"[!] Could not run wget for <{unquote(url)}>: {exc}")
	    else:
	        if result.returncode == 0:
	            print(f"[*] *** Finished downloading <{unquote(url)}>! ***")
	        else:
	            print(
	                f"[!] wget exited with status {result.returncode} "
	                f"for <{unquote(url)}>"
	            )
	    finally:
	        semaphore.release()


	# --- Command line: url [extension] [threads] ---
	if len(sys.argv) < 2:
	    print(f"Usage: {sys.executable} {sys.argv[0]} url [extension] [threads]")
	    sys.exit(1)

	# Address of the page whose links are scanned.
	url = sys.argv[1]

	# File extension to look for, without the dot. A leading dot is tolerated
	# so that ".zip" and "zip" select the same files.
	ext = sys.argv[2].lstrip(".") if len(sys.argv) > 2 else "zip"

	# Maximum number of simultaneous downloads.
	if len(sys.argv) > 3:
	    try:
	        threads = int(sys.argv[3])
	    except ValueError:
	        print(f"[!] threads must be an integer, got {sys.argv[3]!r}")
	        sys.exit(1)
	    # A semaphore created with 0 slots would block the first download
	    # forever, and a negative value is rejected by the semaphore itself.
	    if threads < 1:
	        print(f"[!] threads must be at least 1, got {threads}")
	        sys.exit(1)
	else:
	    threads = 5

	print(f"[*] Downloading {url.rsplit('/', 1)[-1]}...\n")

	# Fetch the listing page. Stop with a readable message instead of a
	# traceback (or scraping an HTTP error page) when the request fails.
	try:
	    r = requests.get(url)
	    r.raise_for_status()
	except requests.RequestException as exc:
	    print(f"[!] Could not fetch <{url}>: {exc}")
	    sys.exit(1)

	# [^"]* keeps a match inside a single href="..." attribute; the previous
	# ".*?" could start at an earlier link on the same line and swallow the
	# markup in between. re.escape stops regex metacharacters in the
	# extension from changing the pattern.
	link_pattern = rf'href="([^"]*\.{re.escape(ext)})"'
	# dict.fromkeys drops repeated links while keeping their page order, so the
	# same file is not downloaded twice.
	zips = list(dict.fromkeys(re.findall(link_pattern, r.text)))
	# zips = sorted(zips, key=lambda x: "Korea" not in x)

	print(f"[*] Searching extension: <{ext}>")
	print(f"[*] Amount of files to download: {len(zips)}")
	print("[*] Starting download now.\n")
	print("================================================\n")

	# urljoin replaces the last path segment unless the base ends with "/".
	baseurl = f"{url}/" if not url.endswith("/") else url

	# Each running download holds one slot; acquire() below blocks while all
	# slots are taken, and download_file() hands the slot back when it is done.
	semaphore = threading.BoundedSemaphore(threads)

	for filename in zips:
	    semaphore.acquire()
	    print(f"[*] Downloading <{unquote(filename)}>...")
	    # Use a separate name so the page address in `url` is not overwritten.
	    file_url = urljoin(baseurl, filename)

	    thread = threading.Thread(target=download_file, args=(file_url, semaphore))
	    thread.start()