#!/usr/bin/env python3
"""
Parallel IA downloader - creates an individual download command for each file
and runs them concurrently using the ia command line tool.

USAGE:

First install the Internet Archive command line tool and configure your
archive.org credentials:

curl -LOs https://archive.org/download/ia-pex/ia
chmod +x ia
./ia configure

(concurrent.futures is part of the Python 3 standard library, so no extra
packages are needed.)

Place the parallel_ia_download.py script in the same directory as the ia tool,
on a drive with plenty of space. Then run the command below, replacing
jillem-full-archive with your item identifier. --concurrency controls how many
files download at once; --progress-timeout 3600 allows up to an hour without
progress before a file is abandoned. The script automatically retries failed
downloads, resumes if interrupted (--checksum makes ia skip files that already
match), and shows live progress in the console:

python parallel_ia_download.py jillem-full-archive --destdir ./jillem_zips --concurrency 10 --progress-timeout 3600
Examples:
# Download all .zip files to ./jillem_zips directory
python parallel_ia_download.py jillem-full-archive --destdir ./jillem_zips
# Download with custom concurrency and timeouts
python parallel_ia_download.py jillem-full-archive --destdir ./downloads --concurrency 6 --progress-timeout 300
# Download specific file patterns
python parallel_ia_download.py jillem-full-archive --destdir ./music --glob "*.flac" --concurrency 3
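# Preview which files a pattern will match before downloading (this is the
# same ia list call the script runs internally)
./ia list jillem-full-archive --glob "*.zip"
# Retry after failures: re-run with more retries; already-downloaded files
# are skipped thanks to --checksum (failed filenames are also recorded in
# ia_failures.txt)
python parallel_ia_download.py jillem-full-archive --destdir ./jillem_zips --retries 10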
"""
import subprocess
import sys
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import argparse
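
# Flow: main() asks `ia list` for the filenames matching the glob, then fans
# the downloads out across a ThreadPoolExecutor. Each worker shells out to
# `ia download` for a single file while a background thread watches the file
# size on disk to drive the live display and the stall/absolute timeouts.
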
def download_single_file(identifier: str, filename: str, destdir: str, retries: int = 5,
                         progress_timeout: int = 300, max_timeout: int = 3600) -> tuple[str, bool, str]:
    """Download a single file using the ia command line tool."""
    import time
    import threading

    try:
        # Build the ia download command (--checksum skips already-complete files)
        cmd = [
            './ia', 'download', identifier, filename,
            '--destdir', destdir,
            '--checksum',
            '--retries', str(retries)
        ]
        # Expected file path, used to monitor progress. ia nests downloads in
        # an <identifier>/ directory inside destdir.
        expected_path = os.path.join(destdir, identifier, filename)

        # Start the download process
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )

        # Monitor the download with stall detection and a live progress display
        start_time = time.time()
        last_size = 0                 # largest size seen so far (stall detection)
        last_progress_time = start_time
        last_display_time = start_time
        last_display_size = 0         # size at the last display update (speed calc)
        display_interval = 2.0        # update progress every 2 seconds
        def format_size(bytes_size):
            """Convert bytes to a human-readable string."""
            for unit in ['B', 'KB', 'MB', 'GB']:
                if bytes_size < 1024.0:
                    return f"{bytes_size:.1f} {unit}"
                bytes_size /= 1024.0
            return f"{bytes_size:.1f} TB"

        def format_speed(bytes_per_second):
            """Convert bytes per second to a human-readable string."""
            return f"{format_size(bytes_per_second)}/s"

        def display_progress(current_size, speed, elapsed_time):
            """Display live progress in Napster-style format, e.g.
            ⠋ filename.zip - 15.2 MB @ 850.3 KB/s (2m 15s)"""
            # Total size isn't known here, so show current size and speed only
            speed_str = format_speed(speed) if speed > 0 else "starting..."
            size_str = format_size(current_size)
            # Animated progress indicator
            spinner_chars = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
            spinner = spinner_chars[int(elapsed_time * 2) % len(spinner_chars)]
            elapsed_str = (f"{int(elapsed_time // 60)}m {int(elapsed_time % 60)}s"
                           if elapsed_time > 60 else f"{int(elapsed_time)}s")
            progress_line = f"{spinner} {filename[:40]:<40} {size_str:>8} @ {speed_str:>12} ({elapsed_str})"
            # Overwrite the current console line in place; concurrent workers
            # share this line, so their updates interleave
            print(f"\r{progress_line:<80}", end="", flush=True)
        def monitor_progress():
            nonlocal last_size, last_progress_time, last_display_time, last_display_size
            while process.poll() is None:
                time.sleep(1)  # check every second for a responsive display
                current_time = time.time()

                # Check whether the file exists and is growing
                current_size = 0
                if os.path.exists(expected_path):
                    try:
                        current_size = os.path.getsize(expected_path)
                        if current_size > last_size:
                            # File is growing: reset the no-progress timer
                            last_size = current_size
                            last_progress_time = current_time
                    except OSError:
                        # File may be locked/in use; normal during a download
                        pass

                # Update the display every few seconds
                if current_time - last_display_time >= display_interval and current_size > 0:
                    elapsed_since_display = current_time - last_display_time
                    # Measure speed against the size at the previous display
                    # update (last_size was just reset above, so it can't be used)
                    speed = (current_size - last_display_size) / elapsed_since_display
                    display_progress(current_size, speed, current_time - start_time)
                    last_display_time = current_time
                    last_display_size = current_size

                # Check timeouts (roughly every 10 seconds to avoid spamming stderr)
                if int(current_time) % 10 == 0:
                    time_since_progress = current_time - last_progress_time
                    total_time = current_time - start_time
                    # Kill the download on stall or on the absolute timeout
                    if time_since_progress > progress_timeout:
                        print(f"\n⏰ {filename}: No progress for {progress_timeout / 60:.1f} minutes, terminating",
                              file=sys.stderr)
                        process.terminate()
                        break
                    elif total_time > max_timeout:
                        print(f"\n⏰ {filename}: Maximum timeout ({max_timeout / 60:.1f} minutes) reached, terminating",
                              file=sys.stderr)
                        process.terminate()
                        break

        # Start monitoring in a background thread
        monitor_thread = threading.Thread(target=monitor_progress, daemon=True)
        monitor_thread.start()
        # Wait for the process to complete
        stdout, stderr = process.communicate()

        # Clear the progress line before printing the final result
        print(f"\r{' ' * 80}", end="", flush=True)

        if process.returncode == 0:
            # Show the final size and average speed
            if os.path.exists(expected_path):
                final_size = os.path.getsize(expected_path)
                elapsed = time.time() - start_time
                avg_speed = final_size / elapsed if elapsed > 0 else 0
                return (filename, True, f"✅ {filename} - {format_size(final_size)} @ {format_speed(avg_speed)} avg")
            else:
                return (filename, True, f"✅ {filename} - Downloaded successfully")
        elif process.returncode == -15:  # SIGTERM (our timeout)
            return (filename, False, "Timed out - no progress detected")
        else:
            error_msg = stderr.strip() or stdout.strip() or "Unknown error"
            return (filename, False, f"Command failed (code {process.returncode}): {error_msg}")

    except Exception as e:
        return (filename, False, f"Exception: {e}")


def get_file_list(identifier: str, glob_pattern: str) -> list[str]:
    """Get the list of files in the item matching the glob pattern."""
    try:
        cmd = ['./ia', 'list', identifier, '--glob', glob_pattern]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return [line.strip() for line in result.stdout.split('\n') if line.strip()]
    except subprocess.CalledProcessError as e:
        print(f"Error getting file list: {e}", file=sys.stderr)
        return []


def main():
    parser = argparse.ArgumentParser(description="Concurrent IA downloads using the ia command")
    parser.add_argument("identifier", help="IA item identifier")
    parser.add_argument("--destdir", default="./downloads", help="Destination directory")
    parser.add_argument("--glob", default="*.zip", help="File pattern to match")
    parser.add_argument("--concurrency", "-j", type=int, default=6, help="Number of concurrent downloads")
    parser.add_argument("--retries", type=int, default=5, help="Retries per file")
    parser.add_argument("--progress-timeout", type=int, default=300,
                        help="Timeout in seconds if no progress detected (default: 300 = 5 min)")
    parser.add_argument("--max-timeout", type=int, default=3600,
                        help="Maximum timeout per file in seconds (default: 3600 = 1 hour)")
    args = parser.parse_args()

    # Create the destination directory
    os.makedirs(args.destdir, exist_ok=True)

    print(f"🔍 Getting file list for {args.identifier} matching {args.glob}...")
    files = get_file_list(args.identifier, args.glob)
    if not files:
        print("❌ No files found matching pattern")
        sys.exit(1)

    print(f"📋 Found {len(files)} files to download")
    print(f"⚡ Using {args.concurrency} concurrent downloads")
    print(f"⏰ Timeout: {args.progress_timeout / 60:.1f} minutes without progress, "
          f"{args.max_timeout / 60:.1f} minutes maximum per file")
    print("\n🚀 Starting downloads... (live progress shown below)")
    print("-" * 80)
    # Download files concurrently
    completed = 0
    failures = []
    with ThreadPoolExecutor(max_workers=args.concurrency) as executor:
        # Submit all download tasks
        future_to_file = {
            executor.submit(download_single_file, args.identifier, filename, args.destdir,
                            args.retries, args.progress_timeout, args.max_timeout): filename
            for filename in files
        }

        # Report downloads as they complete
        for future in as_completed(future_to_file):
            filename, success, message = future.result()
            completed += 1
            progress = f"({completed}/{len(files)})"
            if success:
                print(f"\n{message} {progress}")
            else:
                print(f"\n❌ {progress} {filename}: {message}")
                failures.append(filename)
    # Summary
    print()
    print("-" * 80)
    print("📊 Download Summary:")
    print(f"✅ Successful: {len(files) - len(failures)}")
    print(f"❌ Failed: {len(failures)}")

    # Calculate the total downloaded size
    total_size = 0
    if len(files) - len(failures) > 0:
        for filename in files:
            if filename in failures:
                continue
            # ia nests downloads under the item identifier inside destdir
            file_path = os.path.join(args.destdir, args.identifier, filename)
            try:
                if os.path.exists(file_path):
                    total_size += os.path.getsize(file_path)
            except OSError:
                pass
        if total_size > 0:
            def format_size(bytes_size):
                for unit in ['B', 'KB', 'MB', 'GB']:
                    if bytes_size < 1024.0:
                        return f"{bytes_size:.1f} {unit}"
                    bytes_size /= 1024.0
                return f"{bytes_size:.1f} TB"
            print(f"📦 Total downloaded: {format_size(total_size)}")
    if failures:
        print("\n❌ Failed files:")
        for filename in failures:
            print(f" - {filename}")
        # Record failures so a follow-up run can retry them
        with open("ia_failures.txt", "w") as f:
            for filename in failures:
                f.write(f"{filename}\n")
        print("\n🔄 To retry failures:")
        print(f"python {sys.argv[0]} {args.identifier} --destdir {args.destdir} --retries 10")
        sys.exit(1)
    else:
        print("\n🎉 All downloads completed successfully!")
        sys.exit(0)


if __name__ == "__main__":
    main()