Last active
August 6, 2025 14:32
-
-
Save themorgantown/1b98b33a13b50078010a50e3fa0e1461 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Parallel IA downloader - creates individual download commands for each file
and runs them concurrently using the ia command line tool.

USAGE:
First install the Internet Archive command line tool:
    curl -LOs https://archive.org/download/ia-pex/ia
    chmod +x ia
    # install concurrent futures if not already installed for concurrent downloads
    # pip install futures
    # on a mac, run:
    ./ia configure

Place the parallel_ia_download.py script in the same directory as the ia tool
(on a drive with lots of space!)

Next, run this to execute, replacing jillem-full-archive with your collection
name. Use concurrency to control how many files to download at once. The
timeout is 1 hour for files. This will automatically retry failed downloads,
resume if interrupted, and show live progress in the console:

    python parallel_ia_download.py jillem-full-archive --destdir ./jillem_zips --concurrency 10 --progress-timeout 3600

Examples:
    # Download all .zip files to ./jillem_zips directory
    python parallel_ia_download.py jillem-full-archive --destdir ./jillem_zips

    # Download with custom concurrency and timeouts
    python parallel_ia_download.py jillem-full-archive --destdir ./downloads --concurrency 6 --progress-timeout 300

    # Download specific file patterns
    python parallel_ia_download.py jillem-full-archive --destdir ./music --glob "*.flac" --concurrency 3
"""
import subprocess | |
import sys | |
import os | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
import argparse | |
def download_single_file(identifier: str, filename: str, destdir: str, retries: int = 5,
                         progress_timeout: int = 300, max_timeout: int = 3600) -> tuple[str, bool, str]:
    """Download one file from an Internet Archive item via the `ia` CLI.

    Runs `./ia download` as a subprocess while a background thread watches the
    output file's size, showing a live progress line and terminating stalled
    transfers.

    Args:
        identifier: IA item identifier (e.g. "jillem-full-archive").
        filename: Name of the file within the item to download.
        destdir: Destination directory passed to `ia --destdir`.
        retries: Retry count forwarded to the `ia` tool.
        progress_timeout: Seconds without file growth before the download is killed.
        max_timeout: Absolute ceiling in seconds for a single download.

    Returns:
        A (filename, success, message) tuple; message is a human-readable summary.
    """
    import time
    import threading
    import os

    try:
        cmd = [
            './ia', 'download', identifier, filename,
            '--destdir', destdir,
            '--checksum',
            '--retries', str(retries)
        ]

        # `ia download` normally nests files under destdir/<identifier>/;
        # also watch destdir/<filename> in case a flat layout is used.
        # TODO(review): confirm against the installed ia version's behavior.
        candidate_paths = (
            os.path.join(destdir, identifier, filename),
            os.path.join(destdir, filename),
        )

        def current_file_size() -> int:
            """Size of the first candidate path that exists, else 0."""
            for path in candidate_paths:
                if os.path.exists(path):
                    try:
                        return os.path.getsize(path)
                    except OSError:
                        # File may be locked mid-write; treat as unknown this tick.
                        return 0
            return 0

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )

        start_time = time.time()
        last_size = 0                    # largest size seen so far (stall detection)
        last_progress_time = start_time  # last moment the file grew
        last_display_time = start_time
        last_display_size = 0            # size at the previous display (speed calc)
        display_interval = 2.0           # seconds between progress-line refreshes

        def format_size(bytes_size):
            """Convert a byte count to a human-readable string."""
            for unit in ['B', 'KB', 'MB', 'GB']:
                if bytes_size < 1024.0:
                    return f"{bytes_size:.1f} {unit}"
                bytes_size /= 1024.0
            return f"{bytes_size:.1f} TB"

        def format_speed(bytes_per_second):
            """Convert bytes per second to a human-readable rate."""
            return f"{format_size(bytes_per_second)}/s"

        def display_progress(current_size, speed, elapsed_time):
            """Render a single-line spinner / size / speed progress indicator."""
            speed_str = format_speed(speed) if speed > 0 else "starting..."
            size_str = format_size(current_size)
            spinner_chars = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
            spinner = spinner_chars[int(elapsed_time * 2) % len(spinner_chars)]
            # Format: ⠋ filename.zip - 15.2 MB @ 850.3 KB/s (2m 15s)
            elapsed_str = f"{int(elapsed_time//60)}m {int(elapsed_time%60)}s" if elapsed_time > 60 else f"{int(elapsed_time)}s"
            progress_line = f"{spinner} {filename[:40]:<40} {size_str:>8} @ {speed_str:>12} ({elapsed_str})"
            # Carriage return + pad to overwrite the previous line in place.
            print(f"\r{progress_line:<80}", end="", flush=True)

        def monitor_progress():
            """Poll the output file each second; kill the process on stall/timeout."""
            nonlocal last_size, last_progress_time, last_display_time, last_display_size
            while process.poll() is None:
                time.sleep(1)
                current_time = time.time()

                current_size = current_file_size()
                if current_size > last_size:
                    # File is growing — reset the stall timer.
                    last_size = current_size
                    last_progress_time = current_time

                if current_time - last_display_time >= display_interval and current_size > 0:
                    # Measure speed against the size at the PREVIOUS display, not
                    # last_size (which was just updated above and would always
                    # yield a speed of zero).
                    interval = current_time - last_display_time
                    speed = (current_size - last_display_size) / interval if interval > 0 else 0
                    display_progress(current_size, speed, current_time - start_time)
                    last_display_time = current_time
                    last_display_size = current_size

                # Timeout checks run every tick; they are cheap and a modulo gate
                # on wall-clock seconds could skip or double-fire with sleep drift.
                if current_time - last_progress_time > progress_timeout:
                    print(f"\n⏰ {filename}: No progress for {progress_timeout/60:.1f} minutes, terminating", file=sys.stderr)
                    process.terminate()
                    break
                elif current_time - start_time > max_timeout:
                    print(f"\n⏰ {filename}: Maximum timeout ({max_timeout/60:.1f} minutes) reached, terminating", file=sys.stderr)
                    process.terminate()
                    break

        monitor_thread = threading.Thread(target=monitor_progress, daemon=True)
        monitor_thread.start()

        # Block until the ia subprocess exits (or the monitor terminates it).
        stdout, stderr = process.communicate()

        print(f"\r{' ' * 80}", end="", flush=True)  # clear the progress line

        if process.returncode == 0:
            final_size = current_file_size()
            if final_size > 0:
                elapsed = time.time() - start_time
                avg_speed = final_size / elapsed if elapsed > 0 else 0
                return (filename, True, f"✅ {filename} - {format_size(final_size)} @ {format_speed(avg_speed)} avg")
            return (filename, True, f"✅ {filename} - Downloaded successfully")
        elif process.returncode == -15:  # SIGTERM sent by our monitor thread
            return (filename, False, "Timed out - no progress detected")
        else:
            error_msg = stderr.strip() or stdout.strip() or "Unknown error"
            return (filename, False, f"Command failed (code {process.returncode}): {error_msg}")
    except Exception as e:
        # Broad catch is intentional: one bad file must not crash the worker pool.
        return (filename, False, f"Exception: {str(e)}")
def get_file_list(identifier: str, glob_pattern: str) -> list[str]:
    """Return the item's file names matching *glob_pattern* via `ia list`.

    Args:
        identifier: IA item identifier.
        glob_pattern: Shell-style pattern forwarded to `ia list --glob`.

    Returns:
        Matching file names, one per stdout line; an empty list (after printing
        the error to stderr) when the `ia` tool is missing or exits non-zero.
    """
    cmd = ['./ia', 'list', identifier, '--glob', glob_pattern]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError: the ./ia binary is not in the working directory —
        # report it instead of crashing with a traceback.
        print(f"Error getting file list: {e}", file=sys.stderr)
        return []
    return [line.strip() for line in result.stdout.split('\n') if line.strip()]
def main():
    """Parse CLI arguments, list matching files, and download them concurrently."""
    parser = argparse.ArgumentParser(description="Concurrent IA downloads using ia command")
    parser.add_argument("identifier", help="IA item identifier")
    parser.add_argument("--destdir", default="./downloads", help="Destination directory")
    parser.add_argument("--glob", default="*.zip", help="File pattern to match")
    parser.add_argument("--concurrency", "-j", type=int, default=6, help="Number of concurrent downloads")
    parser.add_argument("--retries", type=int, default=5, help="Retries per file")
    parser.add_argument("--progress-timeout", type=int, default=300,
                        help="Timeout in seconds if no progress detected (default: 300=5min)")
    parser.add_argument("--max-timeout", type=int, default=3600,
                        help="Maximum timeout per file in seconds (default: 3600=1hour)")
    args = parser.parse_args()

    # Create destination directory up front so the ia tool never has to.
    os.makedirs(args.destdir, exist_ok=True)

    print(f"🔍 Getting file list for {args.identifier} matching {args.glob}...")
    files = get_file_list(args.identifier, args.glob)
    if not files:
        print("❌ No files found matching pattern")
        sys.exit(1)

    print(f"📋 Found {len(files)} files to download")
    print(f"⚡ Using {args.concurrency} concurrent downloads")
    print(f"⏰ Timeout: {args.progress_timeout/60:.1f} minutes without progress, {args.max_timeout/60:.1f} minutes maximum per file")
    print("\n🚀 Starting downloads... (live progress shown below)")
    print("-" * 80)

    completed = 0
    failures = []
    with ThreadPoolExecutor(max_workers=args.concurrency) as executor:
        # Submit one download task per file.
        future_to_file = {
            executor.submit(download_single_file, args.identifier, filename, args.destdir,
                            args.retries, args.progress_timeout, args.max_timeout): filename
            for filename in files
        }
        # Report each download as it finishes, in completion order.
        for future in as_completed(future_to_file):
            filename, success, message = future.result()
            completed += 1
            progress = f"({completed}/{len(files)})"
            if success:
                print(f"\n{message} {progress}")
            else:
                print(f"\n❌ {progress} {filename}: {message}")
                failures.append(filename)

    print()
    print("-" * 80)
    print("📊 Download Summary:")
    print(f"✅ Successful: {len(files) - len(failures)}")
    print(f"❌ Failed: {len(failures)}")

    def format_size(bytes_size):
        """Convert a byte count to a human-readable string."""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if bytes_size < 1024.0:
                return f"{bytes_size:.1f} {unit}"
            bytes_size /= 1024.0
        return f"{bytes_size:.1f} TB"

    # Sum the sizes of successfully downloaded files. `ia download` usually
    # nests files under destdir/<identifier>/; also try the flat layout.
    total_size = 0
    for filename in files:
        if filename in failures:
            continue
        for path in (os.path.join(args.destdir, args.identifier, filename),
                     os.path.join(args.destdir, filename)):
            try:
                total_size += os.path.getsize(path)
                break
            except OSError:
                # Path absent or unreadable — try the next candidate layout.
                continue
    if total_size > 0:
        print(f"📦 Total downloaded: {format_size(total_size)}")

    if failures:
        print("\n❌ Failed files:")
        for filename in failures:
            print(f"  - {filename}")
        # Persist the failed names so they can be retried later.
        with open("ia_failures.txt", "w") as f:
            for filename in failures:
                f.write(f"{filename}\n")
        print(f"\n🔄 To retry failures:")
        # Include --glob so a retry with a non-default pattern lists the same files.
        print(f"python {sys.argv[0]} {args.identifier} --destdir {args.destdir} --glob {args.glob} --retries 10")
        sys.exit(1)
    else:
        print("\n🎉 All downloads completed successfully!")
        sys.exit(0)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment