Last active
August 6, 2025 14:32
-
-
Save themorgantown/1b98b33a13b50078010a50e3fa0e1461 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Parallel IA downloader - creates individual download commands for each file
and runs them concurrently using the ia command line tool.

USAGE:
First install the Internet Archive command line tool:
    curl -LOs https://archive.org/download/ia-pex/ia
    chmod +x ia
    # install concurrent futures if not already installed for concurrent downloads
    # pip install futures
    # on a mac, run:
    ./ia configure

Place the parallel_ia_download.py script in the same directory as the ia tool
(on a drive with lots of space!)

Next, run this to execute, replacing jillem-full-archive with your collection
name. Use concurrency to control how many files to download at once. The
timeout is 1 hour for files. This will automatically retry failed downloads,
resume if interrupted, and show live progress in the console:

    python parallel_ia_download.py jillem-full-archive --destdir ./jillem_zips --concurrency 10 --progress-timeout 3600

Examples:
    # Download all .zip files to ./jillem_zips directory
    python parallel_ia_download.py jillem-full-archive --destdir ./jillem_zips

    # Download with custom concurrency and timeouts
    python parallel_ia_download.py jillem-full-archive --destdir ./downloads --concurrency 6 --progress-timeout 300

    # Download specific file patterns
    python parallel_ia_download.py jillem-full-archive --destdir ./music --glob "*.flac" --concurrency 3
"""
import subprocess | |
import sys | |
import os | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
import argparse | |
def download_single_file(identifier: str, filename: str, destdir: str, retries: int = 5,
                         progress_timeout: int = 300, max_timeout: int = 3600) -> tuple[str, bool, str]:
    """Download one file from an Internet Archive item via the `ia` CLI.

    Runs `./ia download` as a subprocess while a background thread watches the
    output file's size, showing a live progress line and terminating stalled
    transfers.

    Args:
        identifier: IA item identifier (e.g. "jillem-full-archive").
        filename: Name of the file within the item to download.
        destdir: Destination directory passed to `ia --destdir`.
        retries: Retry count forwarded to the `ia` tool.
        progress_timeout: Seconds without file growth before the download is killed.
        max_timeout: Absolute ceiling in seconds for a single download.

    Returns:
        A (filename, success, message) tuple; message is a human-readable summary.
    """
    import time
    import threading
    import os

    try:
        cmd = [
            './ia', 'download', identifier, filename,
            '--destdir', destdir,
            '--checksum',
            '--retries', str(retries)
        ]

        # `ia download` normally nests files under destdir/<identifier>/;
        # also watch destdir/<filename> in case a flat layout is used.
        # TODO(review): confirm against the installed ia version's behavior.
        candidate_paths = (
            os.path.join(destdir, identifier, filename),
            os.path.join(destdir, filename),
        )

        def current_file_size() -> int:
            """Size of the first candidate path that exists, else 0."""
            for path in candidate_paths:
                if os.path.exists(path):
                    try:
                        return os.path.getsize(path)
                    except OSError:
                        # File may be locked mid-write; treat as unknown this tick.
                        return 0
            return 0

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )

        start_time = time.time()
        last_size = 0                    # largest size seen so far (stall detection)
        last_progress_time = start_time  # last moment the file grew
        last_display_time = start_time
        last_display_size = 0            # size at the previous display (speed calc)
        display_interval = 2.0           # seconds between progress-line refreshes

        def format_size(bytes_size):
            """Convert a byte count to a human-readable string."""
            for unit in ['B', 'KB', 'MB', 'GB']:
                if bytes_size < 1024.0:
                    return f"{bytes_size:.1f} {unit}"
                bytes_size /= 1024.0
            return f"{bytes_size:.1f} TB"

        def format_speed(bytes_per_second):
            """Convert bytes per second to a human-readable rate."""
            return f"{format_size(bytes_per_second)}/s"

        def display_progress(current_size, speed, elapsed_time):
            """Render a single-line spinner / size / speed progress indicator."""
            speed_str = format_speed(speed) if speed > 0 else "starting..."
            size_str = format_size(current_size)
            spinner_chars = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
            spinner = spinner_chars[int(elapsed_time * 2) % len(spinner_chars)]
            # Format: ⠋ filename.zip - 15.2 MB @ 850.3 KB/s (2m 15s)
            elapsed_str = f"{int(elapsed_time//60)}m {int(elapsed_time%60)}s" if elapsed_time > 60 else f"{int(elapsed_time)}s"
            progress_line = f"{spinner} {filename[:40]:<40} {size_str:>8} @ {speed_str:>12} ({elapsed_str})"
            # Carriage return + pad to overwrite the previous line in place.
            print(f"\r{progress_line:<80}", end="", flush=True)

        def monitor_progress():
            """Poll the output file each second; kill the process on stall/timeout."""
            nonlocal last_size, last_progress_time, last_display_time, last_display_size
            while process.poll() is None:
                time.sleep(1)
                current_time = time.time()

                current_size = current_file_size()
                if current_size > last_size:
                    # File is growing — reset the stall timer.
                    last_size = current_size
                    last_progress_time = current_time

                if current_time - last_display_time >= display_interval and current_size > 0:
                    # Measure speed against the size at the PREVIOUS display, not
                    # last_size (which was just updated above and would always
                    # yield a speed of zero).
                    interval = current_time - last_display_time
                    speed = (current_size - last_display_size) / interval if interval > 0 else 0
                    display_progress(current_size, speed, current_time - start_time)
                    last_display_time = current_time
                    last_display_size = current_size

                # Timeout checks run every tick; they are cheap and a modulo gate
                # on wall-clock seconds could skip or double-fire with sleep drift.
                if current_time - last_progress_time > progress_timeout:
                    print(f"\n⏰ {filename}: No progress for {progress_timeout/60:.1f} minutes, terminating", file=sys.stderr)
                    process.terminate()
                    break
                elif current_time - start_time > max_timeout:
                    print(f"\n⏰ {filename}: Maximum timeout ({max_timeout/60:.1f} minutes) reached, terminating", file=sys.stderr)
                    process.terminate()
                    break

        monitor_thread = threading.Thread(target=monitor_progress, daemon=True)
        monitor_thread.start()

        # Block until the ia subprocess exits (or the monitor terminates it).
        stdout, stderr = process.communicate()

        print(f"\r{' ' * 80}", end="", flush=True)  # clear the progress line

        if process.returncode == 0:
            final_size = current_file_size()
            if final_size > 0:
                elapsed = time.time() - start_time
                avg_speed = final_size / elapsed if elapsed > 0 else 0
                return (filename, True, f"✅ {filename} - {format_size(final_size)} @ {format_speed(avg_speed)} avg")
            return (filename, True, f"✅ {filename} - Downloaded successfully")
        elif process.returncode == -15:  # SIGTERM sent by our monitor thread
            return (filename, False, "Timed out - no progress detected")
        else:
            error_msg = stderr.strip() or stdout.strip() or "Unknown error"
            return (filename, False, f"Command failed (code {process.returncode}): {error_msg}")
    except Exception as e:
        # Broad catch is intentional: one bad file must not crash the worker pool.
        return (filename, False, f"Exception: {str(e)}")
def get_file_list(identifier: str, glob_pattern: str) -> list[str]:
    """Return the item's file names matching *glob_pattern* via `ia list`.

    Args:
        identifier: IA item identifier.
        glob_pattern: Shell-style pattern forwarded to `ia list --glob`.

    Returns:
        Matching file names, one per stdout line; an empty list (after printing
        the error to stderr) when the `ia` tool is missing or exits non-zero.
    """
    cmd = ['./ia', 'list', identifier, '--glob', glob_pattern]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError: the ./ia binary is not in the working directory —
        # report it instead of crashing with a traceback.
        print(f"Error getting file list: {e}", file=sys.stderr)
        return []
    return [line.strip() for line in result.stdout.split('\n') if line.strip()]
def main():
    """Parse CLI arguments, list matching files, and download them concurrently."""
    parser = argparse.ArgumentParser(description="Concurrent IA downloads using ia command")
    parser.add_argument("identifier", help="IA item identifier")
    parser.add_argument("--destdir", default="./downloads", help="Destination directory")
    parser.add_argument("--glob", default="*.zip", help="File pattern to match")
    parser.add_argument("--concurrency", "-j", type=int, default=6, help="Number of concurrent downloads")
    parser.add_argument("--retries", type=int, default=5, help="Retries per file")
    parser.add_argument("--progress-timeout", type=int, default=300,
                        help="Timeout in seconds if no progress detected (default: 300=5min)")
    parser.add_argument("--max-timeout", type=int, default=3600,
                        help="Maximum timeout per file in seconds (default: 3600=1hour)")
    args = parser.parse_args()

    # Create destination directory up front so the ia tool never has to.
    os.makedirs(args.destdir, exist_ok=True)

    print(f"🔍 Getting file list for {args.identifier} matching {args.glob}...")
    files = get_file_list(args.identifier, args.glob)
    if not files:
        print("❌ No files found matching pattern")
        sys.exit(1)

    print(f"📋 Found {len(files)} files to download")
    print(f"⚡ Using {args.concurrency} concurrent downloads")
    print(f"⏰ Timeout: {args.progress_timeout/60:.1f} minutes without progress, {args.max_timeout/60:.1f} minutes maximum per file")
    print("\n🚀 Starting downloads... (live progress shown below)")
    print("-" * 80)

    completed = 0
    failures = []
    with ThreadPoolExecutor(max_workers=args.concurrency) as executor:
        # Submit one download task per file.
        future_to_file = {
            executor.submit(download_single_file, args.identifier, filename, args.destdir,
                            args.retries, args.progress_timeout, args.max_timeout): filename
            for filename in files
        }
        # Report each download as it finishes, in completion order.
        for future in as_completed(future_to_file):
            filename, success, message = future.result()
            completed += 1
            progress = f"({completed}/{len(files)})"
            if success:
                print(f"\n{message} {progress}")
            else:
                print(f"\n❌ {progress} {filename}: {message}")
                failures.append(filename)

    print()
    print("-" * 80)
    print("📊 Download Summary:")
    print(f"✅ Successful: {len(files) - len(failures)}")
    print(f"❌ Failed: {len(failures)}")

    def format_size(bytes_size):
        """Convert a byte count to a human-readable string."""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if bytes_size < 1024.0:
                return f"{bytes_size:.1f} {unit}"
            bytes_size /= 1024.0
        return f"{bytes_size:.1f} TB"

    # Sum the sizes of successfully downloaded files. `ia download` usually
    # nests files under destdir/<identifier>/; also try the flat layout.
    total_size = 0
    for filename in files:
        if filename in failures:
            continue
        for path in (os.path.join(args.destdir, args.identifier, filename),
                     os.path.join(args.destdir, filename)):
            try:
                total_size += os.path.getsize(path)
                break
            except OSError:
                # Path absent or unreadable — try the next candidate layout.
                continue
    if total_size > 0:
        print(f"📦 Total downloaded: {format_size(total_size)}")

    if failures:
        print("\n❌ Failed files:")
        for filename in failures:
            print(f"  - {filename}")
        # Persist the failed names so they can be retried later.
        with open("ia_failures.txt", "w") as f:
            for filename in failures:
                f.write(f"{filename}\n")
        print(f"\n🔄 To retry failures:")
        # Include --glob so a retry with a non-default pattern lists the same files.
        print(f"python {sys.argv[0]} {args.identifier} --destdir {args.destdir} --glob {args.glob} --retries 10")
        sys.exit(1)
    else:
        print("\n🎉 All downloads completed successfully!")
        sys.exit(0)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment