#!/usr/bin/env python3
"""
Enhanced GitHub Repository to Text File Converter
Downloads a GitHub repository and combines all text files into a single output file.
Uses a robust "blacklist and inspect" method with additional features.
"""
import os
import sys
import shutil
import tempfile
import argparse
from pathlib import Path
import subprocess
from datetime import datetime
import hashlib

try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: Install 'tqdm' for progress bars: pip install tqdm")
# --- Default Configuration ---
# Blacklist of extensions for files that are almost certainly not text.
BINARY_EXTENSIONS = {
    # Images
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp', '.tiff', '.tif', '.svg',
    '.psd', '.ai', '.sketch', '.fig', '.xd',
    # Audio/Video
    '.mp3', '.wav', '.ogg', '.flac', '.mp4', '.avi', '.mov', '.mkv', '.webm', '.aac',
    '.m4a', '.wmv', '.flv', '.mpg', '.mpeg', '.3gp',
    # Compressed Archives
    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.iso', '.dmg', '.xz', '.tgz',
    # Fonts
    '.woff', '.woff2', '.ttf', '.eot', '.otf',
    # Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
    # Binaries/Executables
    '.exe', '.dll', '.so', '.o', '.a', '.jar', '.pyc', '.class', '.com', '.app', '.deb', '.rpm',
    # Database files
    '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb',
    # Other common binary formats
    '.lock', '.bin', '.dat', '.pkl', '.model', '.h5', '.joblib', '.npy', '.npz',
    # IDE/Editor specific
    '.suo', '.user', '.userosscache', '.sln.docstates',
    # Package files
    '.whl', '.egg', '.gem',
}

# Default files/directories to exclude
DEFAULT_EXCLUDE_PATTERNS = {
    '.git', '.svn', '.hg', 'node_modules', '__pycache__', '.pytest_cache',
    'venv', 'env', '.env', 'dist', 'build', 'target', '.idea', '.vscode',
    '*.pyc', '*.pyo', '*.pyd', '.DS_Store', 'Thumbs.db', '*.swp', '*.swo',
    'coverage', '.coverage', 'htmlcov', '.tox', '.nox', '.hypothesis',
    'vendor', 'bower_components', '.sass-cache', '.gradle', '.m2',
}
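
# How the CLI filters extend these sets (illustrative; see the wiring in main() below):
#   --exclude-dir docs   -> adds 'docs'  (matched against path components)
#   --exclude-ext md     -> adds '*.md'  (glob-matched against filenames)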

def format_size(bytes_size):
    """Format bytes into a human-readable size."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if bytes_size < 1024.0:
            return f"{bytes_size:.2f} {unit}"
        bytes_size /= 1024.0
    return f"{bytes_size:.2f} TB"
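
# Illustrative behavior, derived from the function above:
#   format_size(512)         -> "512.00 B"
#   format_size(1536)        -> "1.50 KB"
#   format_size(5 * 1024**2) -> "5.00 MB"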

def is_likely_text_file(filepath, sample_size=8192):
    """
    Determine if a file is likely text-based by checking for a binary extension
    and inspecting content for null bytes and text encoding.
    """
    path = Path(filepath)
    # Fast check: binary extension blacklist
    if path.suffix.lower() in BINARY_EXTENSIONS:
        return False
    # Content check with a larger sample
    try:
        with open(filepath, 'rb') as f:
            chunk = f.read(sample_size)
        # Check for null bytes (strong binary indicator)
        if b'\0' in chunk:
            return False
        # Try to decode as UTF-8
        try:
            chunk.decode('utf-8')
            return True
        except UnicodeDecodeError:
            # Fall back to other common encodings. Note: latin-1 maps every
            # byte value, so any null-free file will pass this check as text.
            for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                try:
                    chunk.decode(encoding)
                    return True
                except UnicodeDecodeError:
                    continue
            return False
    except (IOError, PermissionError):
        return False
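
# Illustrative outcomes, based on the checks above:
#   'logo.png'     -> False  (extension is in BINARY_EXTENSIONS)
#   'firmware.img' -> False  (likely contains null bytes in the first 8 KB)
#   'README.md'    -> True   (decodes cleanly as UTF-8)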

def should_exclude(filepath, exclude_set):
    """Check if a file or directory should be excluded."""
    path = Path(filepath)
    parts = path.parts
    for pattern in exclude_set:
        if pattern.startswith('*'):  # Glob pattern
            if path.match(pattern):
                return True
        elif pattern in parts:  # Directory or filename
            return True
    return False
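
# Matching semantics, illustrated (paths are relative to the repo root):
#   should_exclude('node_modules/lodash/index.js', DEFAULT_EXCLUDE_PATTERNS) -> True  ('node_modules' is a path part)
#   should_exclude('src/cache.pyc', DEFAULT_EXCLUDE_PATTERNS)                -> True  (matches the '*.pyc' glob)
#   should_exclude('src/main.py', DEFAULT_EXCLUDE_PATTERNS)                  -> False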

def get_file_encoding(filepath):
    """Try to detect the file encoding from a short list of candidates."""
    # Note: latin-1 accepts any byte sequence, so it usually wins for non-UTF-8 files.
    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']
    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                f.read(1024)  # Test read
            return encoding
        except (UnicodeDecodeError, UnicodeError):
            continue
    return 'utf-8'  # Fallback; callers read with errors='ignore'

def clone_repository(repo_url, temp_dir, branch=None):
    """Clone the GitHub repository using git."""
    print(f"Cloning repository: {repo_url}")
    cmd = ['git', 'clone', '--depth', '1']
    if branch:
        cmd.extend(['-b', branch])
    cmd.extend([repo_url, temp_dir])
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(f"Successfully cloned to {temp_dir}")
        return True
    except FileNotFoundError:
        print("\nERROR: git is not installed or not in your PATH.")
        print("Please install git: https://git-scm.com/downloads")
        return False
    except subprocess.CalledProcessError as e:
        print("\nERROR: Failed to clone repository.")
        if "Repository not found" in e.stderr:
            print("The repository may be private or may not exist.")
        else:
            print(f"Git error: {e.stderr}")
        return False
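
# The constructed command is equivalent to, e.g. (illustrative values):
#   git clone --depth 1 -b main https://github.com/owner/repo.git /tmp/github_repo_xxxx
# A shallow clone (--depth 1) fetches only the latest commit, which keeps
# download size small for large repositories.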

def process_repository(repo_path, files_to_process, output_file, options):
    """Process files and write them to the output file with enhanced formatting."""
    processed_files = 0
    error_files = 0
    # Track content hashes for duplicate detection
    content_hashes = {}
    duplicate_files = []
    with open(output_file, 'w', encoding='utf-8') as out:
        # Write an enhanced header with metadata
        out.write("=" * 80 + "\n")
        out.write("GitHub Repository Contents\n")
        out.write(f"Repository: {repo_path.name}\n")
        out.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        out.write(f"Total files to process: {len(files_to_process)}\n")
        out.write("=" * 80 + "\n\n")
        # Write a table of contents if requested
        if options.get('toc', False):
            out.write("TABLE OF CONTENTS\n")
            out.write("-" * 40 + "\n")
            for i, filepath in enumerate(files_to_process, 1):
                rel_path = filepath.relative_to(repo_path)
                out.write(f"{i:4}. {rel_path.as_posix()}\n")
            out.write("\n" + "=" * 80 + "\n\n")
        # Process files with a progress indicator
        iterator = tqdm(files_to_process, desc="Processing", unit="file", ncols=100) if HAS_TQDM else files_to_process
        for filepath in iterator:
            # Compute rel_path outside the try so the error handler can use it
            rel_path = filepath.relative_to(repo_path)
            try:
                file_size = filepath.stat().st_size
                # Detect encoding
                encoding = get_file_encoding(filepath)
                # Read content
                with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
                    content = f.read()
                # Check for duplicates; keep the first occurrence as the canonical original
                content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
                if content_hash in content_hashes:
                    duplicate_files.append((rel_path, content_hashes[content_hash]))
                    if options.get('skip_duplicates', False):
                        continue
                else:
                    content_hashes[content_hash] = rel_path
                # Determine the file type for the header and syntax hint
                extension = filepath.suffix.lower()
                use_fence = options.get('markdown', False) and extension in {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs'}
                # Write an enhanced file header
                out.write("#" * 80 + "\n")
                out.write(f"# File: {rel_path.as_posix()}\n")
                out.write(f"# Size: {format_size(file_size)}\n")
                out.write(f"# Encoding: {encoding}\n")
                if extension:
                    out.write(f"# Type: {extension[1:]}\n")
                out.write("#" * 80 + "\n\n")
                # Open a language-hinted code block for syntax highlighting
                if use_fence:
                    out.write(f"```{extension[1:]}\n")
                # Write content with optional line numbers
                if options.get('line_numbers', False):
                    lines = content.splitlines()
                    width = len(str(len(lines)))
                    for i, line in enumerate(lines, 1):
                        out.write(f"{i:>{width}} | {line}\n")
                else:
                    out.write(content)
                    if not content.endswith('\n'):
                        out.write('\n')
                if use_fence:
                    out.write("```\n")
                out.write("\n\n")
                processed_files += 1
            except Exception as e:
                error_files += 1
                out.write(f"ERROR: Could not read {rel_path.as_posix()}: {e}\n\n")
        # Write summary
        out.write("=" * 80 + "\n")
        out.write("PROCESSING SUMMARY\n")
        out.write("=" * 80 + "\n")
        out.write(f"Files processed successfully: {processed_files}\n")
        out.write(f"Files with errors: {error_files}\n")
        if duplicate_files:
            out.write(f"\nDuplicate files detected: {len(duplicate_files)}\n")
            for dup, original in duplicate_files[:5]:  # Show the first 5
                out.write(f"  - {dup} (duplicate of {original})\n")
            if len(duplicate_files) > 5:
                out.write(f"  ... and {len(duplicate_files) - 5} more\n")
        out.write("=" * 80 + "\n")
    return processed_files, error_files
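
# Each file section in the output looks roughly like this (illustrative):
#   ################################################################################
#   # File: src/main.py
#   # Size: 1.24 KB
#   # Encoding: utf-8
#   # Type: py
#   ################################################################################
#
#   <file contents, optionally line-numbered or wrapped in ```py fences>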

def parse_github_url(url):
    """Normalize the various GitHub URL formats into a cloneable HTTPS URL."""
    url = url.strip()
    # Rewrite SSH URLs to HTTPS
    if url.startswith('git@github.com:'):
        url = url.replace('git@github.com:', 'https://github.com/')
    # Remove a trailing .git suffix (re-added uniformly below)
    if url.endswith('.git'):
        url = url[:-4]
    # Handle the short format (owner/repo)
    if '/' in url and not url.startswith(('http', 'git@')):
        return f"https://github.com/{url}.git"
    # Handle full URLs (.git was already stripped above)
    if 'github.com' in url:
        return f"{url}.git"
    return url
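
# Accepted input formats, illustrated:
#   parse_github_url('owner/repo')                     -> 'https://github.com/owner/repo.git'
#   parse_github_url('https://github.com/owner/repo')  -> 'https://github.com/owner/repo.git'
#   parse_github_url('git@github.com:owner/repo.git')  -> 'https://github.com/owner/repo.git'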

def main():
    parser = argparse.ArgumentParser(
        description='Download a GitHub repository and combine all text files into one output file.',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('repo', help='GitHub repository (owner/repo or URL)')
    parser.add_argument('-o', '--output', default='repository_contents.txt',
                        help='Output file name (default: repository_contents.txt)')
    parser.add_argument('-b', '--branch', help='Specific branch to clone')
    parser.add_argument('--max-file-size', type=int, default=10,
                        help='Max file size in MB (default: 10)')
    parser.add_argument('--total-size-limit', type=int, default=100,
                        help='Warning threshold for total size in MB (default: 100)')
    parser.add_argument('--line-numbers', action='store_true',
                        help='Add line numbers to output')
    parser.add_argument('--toc', action='store_true',
                        help='Add a table of contents at the beginning')
    parser.add_argument('--markdown', action='store_true',
                        help='Wrap code files in markdown code blocks')
    parser.add_argument('--skip-duplicates', action='store_true',
                        help='Skip duplicate files (same content)')
    parser.add_argument('--exclude-dir', action='append', default=[],
                        help='Directory to exclude (repeatable)')
    parser.add_argument('--exclude-ext', action='append', default=[],
                        help='File extension to exclude (repeatable)')
    parser.add_argument('--include-only-ext', action='append', default=[],
                        help='Process ONLY these extensions (repeatable)')
    parser.add_argument('--keep-temp', action='store_true',
                        help='Keep the temporary clone after processing')
    parser.add_argument('--stats', action='store_true',
                        help='Show detailed statistics after processing')
    args = parser.parse_args()
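    # Example invocations (illustrative; assumes this script is saved as github_to_text.py):
    #   python github_to_text.py owner/repo
    #   python github_to_text.py https://github.com/owner/repo -o dump.txt --toc --markdown
    #   python github_to_text.py owner/repo --include-only-ext .py --skip-duplicates --stats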
    # Setup
    repo_url = parse_github_url(args.repo)
    max_file_size_bytes = args.max_file_size * 1024 * 1024
    total_size_limit_bytes = args.total_size_limit * 1024 * 1024
    # Build exclusion patterns
    exclude_patterns = DEFAULT_EXCLUDE_PATTERNS.copy()
    for d in args.exclude_dir:
        exclude_patterns.add(d)
    for ext in args.exclude_ext:
        exclude_patterns.add(f"*{ext if ext.startswith('.') else '.' + ext}")
    temp_dir = tempfile.mkdtemp(prefix='github_repo_')
    try:
        # Clone repository
        if not clone_repository(repo_url, temp_dir, args.branch):
            return 1
        repo_path = Path(temp_dir)
        # Scan repository
        print("\nScanning repository...")
        files_to_process = []
        total_size = 0
        skipped_count = 0
        stats = {'by_extension': {}, 'by_size': {'<1KB': 0, '1-10KB': 0, '10-100KB': 0, '100KB-1MB': 0, '>1MB': 0}}
        all_files = [f for f in repo_path.rglob('*') if f.is_file()]
        iterator = tqdm(all_files, desc="Scanning", unit="file", ncols=100) if HAS_TQDM else all_files
        for filepath in iterator:
            rel_path = filepath.relative_to(repo_path)
            file_size = filepath.stat().st_size
            # Track statistics
            ext = filepath.suffix.lower() or 'no_extension'
            stats['by_extension'][ext] = stats['by_extension'].get(ext, 0) + 1
            if file_size < 1024:
                stats['by_size']['<1KB'] += 1
            elif file_size < 10240:
                stats['by_size']['1-10KB'] += 1
            elif file_size < 102400:
                stats['by_size']['10-100KB'] += 1
            elif file_size < 1048576:
                stats['by_size']['100KB-1MB'] += 1
            else:
                stats['by_size']['>1MB'] += 1
            # Apply filters
            if should_exclude(rel_path, exclude_patterns):
                skipped_count += 1
                continue
            if file_size > max_file_size_bytes:
                skipped_count += 1
                continue
            # Include-only filter
            if args.include_only_ext:
                ext = filepath.suffix.lower()
                if not any(ext == (e if e.startswith('.') else f'.{e}') for e in args.include_only_ext):
                    skipped_count += 1
                    continue
            if not is_likely_text_file(filepath):
                skipped_count += 1
                continue
            files_to_process.append(filepath)
            total_size += file_size
        print(f"\nFound {len(all_files)} total files")
        print(f"Will process {len(files_to_process)} text files")
        print(f"Skipped {skipped_count} files (binary/excluded/oversized)")
        print(f"Total size to process: {format_size(total_size)}")
        # Show statistics if requested
        if args.stats:
            print("\n" + "=" * 40)
            print("FILE STATISTICS")
            print("=" * 40)
            print("\nTop 10 extensions by count:")
            sorted_exts = sorted(stats['by_extension'].items(), key=lambda x: x[1], reverse=True)
            for ext, count in sorted_exts[:10]:
                print(f"  {ext:15} {count:5} files")
            print("\nFile size distribution:")
            for size_range, count in stats['by_size'].items():
                print(f"  {size_range:15} {count:5} files")
        # Size warning
        if total_size > total_size_limit_bytes:
            print(f"\n⚠️ WARNING: Total size ({format_size(total_size)}) exceeds limit ({format_size(total_size_limit_bytes)})")
            if input("Continue anyway? (y/n): ").lower() != 'y':
                print("Cancelled.")
                return 1
        # Process files
        if files_to_process:
            print(f"\nWriting to {args.output}...")
            options = {
                'line_numbers': args.line_numbers,
                'toc': args.toc,
                'markdown': args.markdown,
                'skip_duplicates': args.skip_duplicates,
            }
            processed, errors = process_repository(repo_path, files_to_process, args.output, options)
            # Summary
            print(f"\n{'=' * 50}")
            print("✅ COMPLETE!")
            print(f"  Processed: {processed} files")
            print(f"  Errors: {errors} files")
            print(f"  Output: {args.output}")
            print(f"  Size: {format_size(Path(args.output).stat().st_size)}")
            print(f"{'=' * 50}")
        else:
            print("\n⚠️ No text files found to process!")
        return 0
    except KeyboardInterrupt:
        print("\n\n❌ Cancelled by user")
        return 1
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        if not args.keep_temp and os.path.exists(temp_dir):
            print("\nCleaning up...")
            shutil.rmtree(temp_dir)


if __name__ == "__main__":
    sys.exit(main())