#!/usr/bin/env python3
"""
Enhanced GitHub Repository to Text File Converter
Downloads a GitHub repository and combines all text files into a single output file.
Uses a robust "blacklist and inspect" method with additional features.
"""
import os
import sys
import shutil
import tempfile
import argparse
from pathlib import Path
import subprocess
from datetime import datetime
import hashlib

try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: Install 'tqdm' for progress bars: pip install tqdm")
# --- Default Configuration ---
# Blacklist of extensions for files that are almost certainly not text.
BINARY_EXTENSIONS = {
    # Images
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp', '.tiff', '.tif', '.svg',
    '.psd', '.ai', '.sketch', '.fig', '.xd',
    # Audio/Video
    '.mp3', '.wav', '.ogg', '.flac', '.mp4', '.avi', '.mov', '.mkv', '.webm', '.aac',
    '.m4a', '.wmv', '.flv', '.mpg', '.mpeg', '.3gp',
    # Compressed Archives
    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.iso', '.dmg', '.xz', '.tgz',
    # Fonts
    '.woff', '.woff2', '.ttf', '.eot', '.otf',
    # Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
    # Binaries/Executables
    '.exe', '.dll', '.so', '.o', '.a', '.jar', '.pyc', '.class', '.com', '.app', '.deb', '.rpm',
    # Database files
    '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb',
    # Other common binary formats
    '.lock', '.bin', '.dat', '.pkl', '.model', '.h5', '.joblib', '.npy', '.npz',
    # IDE/Editor specific
    '.suo', '.user', '.userosscache', '.sln.docstates',
    # Package files
    '.whl', '.egg', '.gem',
}

# Default files/directories to exclude
DEFAULT_EXCLUDE_PATTERNS = {
    '.git', '.svn', '.hg', 'node_modules', '__pycache__', '.pytest_cache',
    'venv', 'env', '.env', 'dist', 'build', 'target', '.idea', '.vscode',
    '*.pyc', '*.pyo', '*.pyd', '.DS_Store', 'Thumbs.db', '*.swp', '*.swo',
    'coverage', '.coverage', 'htmlcov', '.tox', '.nox', '.hypothesis',
    'vendor', 'bower_components', '.sass-cache', '.gradle', '.m2',
}
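
# How the CLI filters extend these sets (illustrative; see the wiring in main() below):
#   --exclude-dir docs   -> adds 'docs'  (matched against path components)
#   --exclude-ext md     -> adds '*.md'  (glob-matched against filenames)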

def format_size(bytes_size):
    """Format bytes into a human-readable size."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if bytes_size < 1024.0:
            return f"{bytes_size:.2f} {unit}"
        bytes_size /= 1024.0
    return f"{bytes_size:.2f} TB"
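
# Illustrative behavior, derived from the function above:
#   format_size(512)         -> "512.00 B"
#   format_size(1536)        -> "1.50 KB"
#   format_size(5 * 1024**2) -> "5.00 MB"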

def is_likely_text_file(filepath, sample_size=8192):
    """
    Determine if a file is likely text-based by checking for a binary extension
    and inspecting content for null bytes and text encoding.
    """
    path = Path(filepath)
    # Fast check: binary extension blacklist
    if path.suffix.lower() in BINARY_EXTENSIONS:
        return False
    # Content check with a larger sample
    try:
        with open(filepath, 'rb') as f:
            chunk = f.read(sample_size)
        # Check for null bytes (strong binary indicator)
        if b'\0' in chunk:
            return False
        # Try to decode as UTF-8
        try:
            chunk.decode('utf-8')
            return True
        except UnicodeDecodeError:
            # Fall back to other common encodings. Note: latin-1 maps every
            # byte value, so any null-free file will pass this check as text.
            for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                try:
                    chunk.decode(encoding)
                    return True
                except UnicodeDecodeError:
                    continue
            return False
    except (IOError, PermissionError):
        return False
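
# Illustrative outcomes, based on the checks above:
#   'logo.png'     -> False  (extension is in BINARY_EXTENSIONS)
#   'firmware.img' -> False  (likely contains null bytes in the first 8 KB)
#   'README.md'    -> True   (decodes cleanly as UTF-8)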

def should_exclude(filepath, exclude_set):
    """Check if a file or directory should be excluded."""
    path = Path(filepath)
    parts = path.parts
    for pattern in exclude_set:
        if pattern.startswith('*'):  # Glob pattern
            if path.match(pattern):
                return True
        elif pattern in parts:  # Directory or filename
            return True
    return False
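
# Matching semantics, illustrated (paths are relative to the repo root):
#   should_exclude('node_modules/lodash/index.js', DEFAULT_EXCLUDE_PATTERNS) -> True  ('node_modules' is a path part)
#   should_exclude('src/cache.pyc', DEFAULT_EXCLUDE_PATTERNS)                -> True  (matches the '*.pyc' glob)
#   should_exclude('src/main.py', DEFAULT_EXCLUDE_PATTERNS)                  -> False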

def get_file_encoding(filepath):
    """Try to detect the file encoding from a short list of candidates."""
    # Note: latin-1 accepts any byte sequence, so it usually wins for non-UTF-8 files.
    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']
    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                f.read(1024)  # Test read
            return encoding
        except (UnicodeDecodeError, UnicodeError):
            continue
    return 'utf-8'  # Fallback; callers read with errors='ignore'

def clone_repository(repo_url, temp_dir, branch=None):
    """Clone the GitHub repository using git."""
    print(f"Cloning repository: {repo_url}")
    cmd = ['git', 'clone', '--depth', '1']
    if branch:
        cmd.extend(['-b', branch])
    cmd.extend([repo_url, temp_dir])
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(f"Successfully cloned to {temp_dir}")
        return True
    except FileNotFoundError:
        print("\nERROR: git is not installed or not in your PATH.")
        print("Please install git: https://git-scm.com/downloads")
        return False
    except subprocess.CalledProcessError as e:
        print("\nERROR: Failed to clone repository.")
        if "Repository not found" in e.stderr:
            print("The repository may be private or may not exist.")
        else:
            print(f"Git error: {e.stderr}")
        return False
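
# The constructed command is equivalent to, e.g. (illustrative values):
#   git clone --depth 1 -b main https://github.com/owner/repo.git /tmp/github_repo_xxxx
# A shallow clone (--depth 1) fetches only the latest commit, which keeps
# download size small for large repositories.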

def process_repository(repo_path, files_to_process, output_file, options):
    """Process files and write them to the output file with enhanced formatting."""
    processed_files = 0
    error_files = 0
    # Track content hashes for duplicate detection
    content_hashes = {}
    duplicate_files = []
    with open(output_file, 'w', encoding='utf-8') as out:
        # Write an enhanced header with metadata
        out.write("=" * 80 + "\n")
        out.write("GitHub Repository Contents\n")
        out.write(f"Repository: {repo_path.name}\n")
        out.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        out.write(f"Total files to process: {len(files_to_process)}\n")
        out.write("=" * 80 + "\n\n")
        # Write a table of contents if requested
        if options.get('toc', False):
            out.write("TABLE OF CONTENTS\n")
            out.write("-" * 40 + "\n")
            for i, filepath in enumerate(files_to_process, 1):
                rel_path = filepath.relative_to(repo_path)
                out.write(f"{i:4}. {rel_path.as_posix()}\n")
            out.write("\n" + "=" * 80 + "\n\n")
        # Process files with a progress indicator
        iterator = tqdm(files_to_process, desc="Processing", unit="file", ncols=100) if HAS_TQDM else files_to_process
        for filepath in iterator:
            # Compute rel_path outside the try so the error handler can use it
            rel_path = filepath.relative_to(repo_path)
            try:
                file_size = filepath.stat().st_size
                # Detect encoding
                encoding = get_file_encoding(filepath)
                # Read content
                with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
                    content = f.read()
                # Check for duplicates; keep the first occurrence as the canonical original
                content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
                if content_hash in content_hashes:
                    duplicate_files.append((rel_path, content_hashes[content_hash]))
                    if options.get('skip_duplicates', False):
                        continue
                else:
                    content_hashes[content_hash] = rel_path
                # Determine the file type for the header and syntax hint
                extension = filepath.suffix.lower()
                use_fence = options.get('markdown', False) and extension in {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs'}
                # Write an enhanced file header
                out.write("#" * 80 + "\n")
                out.write(f"# File: {rel_path.as_posix()}\n")
                out.write(f"# Size: {format_size(file_size)}\n")
                out.write(f"# Encoding: {encoding}\n")
                if extension:
                    out.write(f"# Type: {extension[1:]}\n")
                out.write("#" * 80 + "\n\n")
                # Open a language-hinted code block for syntax highlighting
                if use_fence:
                    out.write(f"```{extension[1:]}\n")
                # Write content with optional line numbers
                if options.get('line_numbers', False):
                    lines = content.splitlines()
                    width = len(str(len(lines)))
                    for i, line in enumerate(lines, 1):
                        out.write(f"{i:>{width}} | {line}\n")
                else:
                    out.write(content)
                    if not content.endswith('\n'):
                        out.write('\n')
                if use_fence:
                    out.write("```\n")
                out.write("\n\n")
                processed_files += 1
            except Exception as e:
                error_files += 1
                out.write(f"ERROR: Could not read {rel_path.as_posix()}: {e}\n\n")
        # Write summary
        out.write("=" * 80 + "\n")
        out.write("PROCESSING SUMMARY\n")
        out.write("=" * 80 + "\n")
        out.write(f"Files processed successfully: {processed_files}\n")
        out.write(f"Files with errors: {error_files}\n")
        if duplicate_files:
            out.write(f"\nDuplicate files detected: {len(duplicate_files)}\n")
            for dup, original in duplicate_files[:5]:  # Show the first 5
                out.write(f"  - {dup} (duplicate of {original})\n")
            if len(duplicate_files) > 5:
                out.write(f"  ... and {len(duplicate_files) - 5} more\n")
        out.write("=" * 80 + "\n")
    return processed_files, error_files
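
# Each file section in the output looks roughly like this (illustrative):
#   ################################################################################
#   # File: src/main.py
#   # Size: 1.24 KB
#   # Encoding: utf-8
#   # Type: py
#   ################################################################################
#
#   <file contents, optionally line-numbered or wrapped in ```py fences>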

def parse_github_url(url):
    """Normalize the various GitHub URL formats into a cloneable HTTPS URL."""
    url = url.strip()
    # Rewrite SSH URLs to HTTPS
    if url.startswith('git@github.com:'):
        url = url.replace('git@github.com:', 'https://github.com/')
    # Remove a trailing .git suffix (re-added uniformly below)
    if url.endswith('.git'):
        url = url[:-4]
    # Handle the short format (owner/repo)
    if '/' in url and not url.startswith(('http', 'git@')):
        return f"https://github.com/{url}.git"
    # Handle full URLs (.git was already stripped above)
    if 'github.com' in url:
        return f"{url}.git"
    return url
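
# Accepted input formats, illustrated:
#   parse_github_url('owner/repo')                     -> 'https://github.com/owner/repo.git'
#   parse_github_url('https://github.com/owner/repo')  -> 'https://github.com/owner/repo.git'
#   parse_github_url('git@github.com:owner/repo.git')  -> 'https://github.com/owner/repo.git'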

def main():
    parser = argparse.ArgumentParser(
        description='Download a GitHub repository and combine all text files into one output file.',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('repo', help='GitHub repository (owner/repo or URL)')
    parser.add_argument('-o', '--output', default='repository_contents.txt',
                        help='Output file name (default: repository_contents.txt)')
    parser.add_argument('-b', '--branch', help='Specific branch to clone')
    parser.add_argument('--max-file-size', type=int, default=10,
                        help='Max file size in MB (default: 10)')
    parser.add_argument('--total-size-limit', type=int, default=100,
                        help='Warning threshold for total size in MB (default: 100)')
    parser.add_argument('--line-numbers', action='store_true',
                        help='Add line numbers to output')
    parser.add_argument('--toc', action='store_true',
                        help='Add a table of contents at the beginning')
    parser.add_argument('--markdown', action='store_true',
                        help='Wrap code files in markdown code blocks')
    parser.add_argument('--skip-duplicates', action='store_true',
                        help='Skip duplicate files (same content)')
    parser.add_argument('--exclude-dir', action='append', default=[],
                        help='Directory to exclude (repeatable)')
    parser.add_argument('--exclude-ext', action='append', default=[],
                        help='File extension to exclude (repeatable)')
    parser.add_argument('--include-only-ext', action='append', default=[],
                        help='Process ONLY these extensions (repeatable)')
    parser.add_argument('--keep-temp', action='store_true',
                        help='Keep the temporary clone after processing')
    parser.add_argument('--stats', action='store_true',
                        help='Show detailed statistics after processing')
    args = parser.parse_args()
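    # Example invocations (illustrative; assumes this script is saved as github_to_text.py):
    #   python github_to_text.py owner/repo
    #   python github_to_text.py https://github.com/owner/repo -o dump.txt --toc --markdown
    #   python github_to_text.py owner/repo --include-only-ext .py --skip-duplicates --stats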
    # Setup
    repo_url = parse_github_url(args.repo)
    max_file_size_bytes = args.max_file_size * 1024 * 1024
    total_size_limit_bytes = args.total_size_limit * 1024 * 1024
    # Build exclusion patterns
    exclude_patterns = DEFAULT_EXCLUDE_PATTERNS.copy()
    for d in args.exclude_dir:
        exclude_patterns.add(d)
    for ext in args.exclude_ext:
        exclude_patterns.add(f"*{ext if ext.startswith('.') else '.' + ext}")
    temp_dir = tempfile.mkdtemp(prefix='github_repo_')
    try:
        # Clone repository
        if not clone_repository(repo_url, temp_dir, args.branch):
            return 1
        repo_path = Path(temp_dir)
        # Scan repository
        print("\nScanning repository...")
        files_to_process = []
        total_size = 0
        skipped_count = 0
        stats = {'by_extension': {}, 'by_size': {'<1KB': 0, '1-10KB': 0, '10-100KB': 0, '100KB-1MB': 0, '>1MB': 0}}
        all_files = [f for f in repo_path.rglob('*') if f.is_file()]
        iterator = tqdm(all_files, desc="Scanning", unit="file", ncols=100) if HAS_TQDM else all_files
        for filepath in iterator:
            rel_path = filepath.relative_to(repo_path)
            file_size = filepath.stat().st_size
            # Track statistics
            ext = filepath.suffix.lower() or 'no_extension'
            stats['by_extension'][ext] = stats['by_extension'].get(ext, 0) + 1
            if file_size < 1024:
                stats['by_size']['<1KB'] += 1
            elif file_size < 10240:
                stats['by_size']['1-10KB'] += 1
            elif file_size < 102400:
                stats['by_size']['10-100KB'] += 1
            elif file_size < 1048576:
                stats['by_size']['100KB-1MB'] += 1
            else:
                stats['by_size']['>1MB'] += 1
            # Apply filters
            if should_exclude(rel_path, exclude_patterns):
                skipped_count += 1
                continue
            if file_size > max_file_size_bytes:
                skipped_count += 1
                continue
            # Include-only filter
            if args.include_only_ext:
                ext = filepath.suffix.lower()
                if not any(ext == (e if e.startswith('.') else f'.{e}') for e in args.include_only_ext):
                    skipped_count += 1
                    continue
            if not is_likely_text_file(filepath):
                skipped_count += 1
                continue
            files_to_process.append(filepath)
            total_size += file_size
        print(f"\nFound {len(all_files)} total files")
        print(f"Will process {len(files_to_process)} text files")
        print(f"Skipped {skipped_count} files (binary/excluded/oversized)")
        print(f"Total size to process: {format_size(total_size)}")
        # Show statistics if requested
        if args.stats:
            print("\n" + "=" * 40)
            print("FILE STATISTICS")
            print("=" * 40)
            print("\nTop 10 extensions by count:")
            sorted_exts = sorted(stats['by_extension'].items(), key=lambda x: x[1], reverse=True)
            for ext, count in sorted_exts[:10]:
                print(f"  {ext:15} {count:5} files")
            print("\nFile size distribution:")
            for size_range, count in stats['by_size'].items():
                print(f"  {size_range:15} {count:5} files")
        # Size warning
        if total_size > total_size_limit_bytes:
            print(f"\n⚠️ WARNING: Total size ({format_size(total_size)}) exceeds limit ({format_size(total_size_limit_bytes)})")
            if input("Continue anyway? (y/n): ").lower() != 'y':
                print("Cancelled.")
                return 1
        # Process files
        if files_to_process:
            print(f"\nWriting to {args.output}...")
            options = {
                'line_numbers': args.line_numbers,
                'toc': args.toc,
                'markdown': args.markdown,
                'skip_duplicates': args.skip_duplicates,
            }
            processed, errors = process_repository(repo_path, files_to_process, args.output, options)
            # Summary
            print(f"\n{'=' * 50}")
            print("✅ COMPLETE!")
            print(f"  Processed: {processed} files")
            print(f"  Errors: {errors} files")
            print(f"  Output: {args.output}")
            print(f"  Size: {format_size(Path(args.output).stat().st_size)}")
            print(f"{'=' * 50}")
        else:
            print("\n⚠️ No text files found to process!")
        return 0
    except KeyboardInterrupt:
        print("\n\n❌ Cancelled by user")
        return 1
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        if not args.keep_temp and os.path.exists(temp_dir):
            print("\nCleaning up...")
            shutil.rmtree(temp_dir)


if __name__ == "__main__":
    sys.exit(main())