#!/usr/bin/env python3
"""
Enhanced GitHub Repository to Text File Converter
Downloads a GitHub repository and combines all text files into a single output file.
Uses a robust "blacklist and inspect" method with additional features.
"""
import os
import sys
import shutil
import tempfile
import argparse
from pathlib import Path
from urllib.parse import urlparse
import subprocess
from datetime import datetime
import hashlib
try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: Install 'tqdm' for progress bars: pip install tqdm")
# --- Default Configuration ---
# Blacklist of extensions for files that are almost certainly not text.
BINARY_EXTENSIONS = {
    # Images
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp', '.tiff', '.tif', '.svg',
    '.psd', '.ai', '.sketch', '.fig', '.xd',
    # Audio/Video
    '.mp3', '.wav', '.ogg', '.flac', '.mp4', '.avi', '.mov', '.mkv', '.webm', '.aac',
    '.m4a', '.wmv', '.flv', '.mpg', '.mpeg', '.3gp',
    # Compressed Archives
    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.iso', '.dmg', '.xz', '.tgz',
    # Fonts
    '.woff', '.woff2', '.ttf', '.eot', '.otf',
    # Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
    # Binaries/Executables
    '.exe', '.dll', '.so', '.o', '.a', '.jar', '.pyc', '.class', '.com', '.app', '.deb', '.rpm',
    # Database files
    '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb',
    # Other common binary formats
    '.lock', '.bin', '.dat', '.pkl', '.model', '.h5', '.joblib', '.npy', '.npz',
    # IDE/Editor specific
    '.suo', '.user', '.userosscache', '.sln.docstates',
    # Package files
    '.whl', '.egg', '.gem',
}
# Default files/directories to exclude
DEFAULT_EXCLUDE_PATTERNS = {
    '.git', '.svn', '.hg', 'node_modules', '__pycache__', '.pytest_cache',
    'venv', 'env', '.env', 'dist', 'build', 'target', '.idea', '.vscode',
    '*.pyc', '*.pyo', '*.pyd', '.DS_Store', 'Thumbs.db', '*.swp', '*.swo',
    'coverage', '.coverage', 'htmlcov', '.tox', '.nox', '.hypothesis',
    'vendor', 'bower_components', '.sass-cache', '.gradle', '.m2',
}


def format_size(bytes_size):
    """Format bytes into human-readable size."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if bytes_size < 1024.0:
            return f"{bytes_size:.2f} {unit}"
        bytes_size /= 1024.0
    return f"{bytes_size:.2f} TB"
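# Illustrative values: format_size(512) -> '512.00 B', format_size(1536) -> '1.50 KB'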


def is_likely_text_file(filepath, sample_size=8192):
    """
    Determine if a file is likely text-based by checking for a binary extension
    and inspecting content for null bytes and text encoding.
    """
    path = Path(filepath)
    # Fast check: binary extension blacklist
    if path.suffix.lower() in BINARY_EXTENSIONS:
        return False
    # Content check with larger sample
    try:
        with open(filepath, 'rb') as f:
            chunk = f.read(sample_size)
        # Check for null bytes (strong binary indicator)
        if b'\0' in chunk:
            return False
        # Try to decode as UTF-8
        try:
            chunk.decode('utf-8')
            return True
        except UnicodeDecodeError:
            # Try other common encodings
            for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                try:
                    chunk.decode(encoding)
                    return True
                except UnicodeDecodeError:
                    continue
            return False
    except (IOError, PermissionError):
        return False
    return True
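# Illustrative behaviour of is_likely_text_file (not exhaustive):
#   is_likely_text_file('logo.png')   -> False  (extension is blacklisted)
#   is_likely_text_file('README.md')  -> True   (no null bytes, decodes as UTF-8)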


def should_exclude(filepath, exclude_set):
    """Check if file/directory should be excluded."""
    path = Path(filepath)
    parts = path.parts
    for pattern in exclude_set:
        if pattern.startswith('*'):  # Glob pattern
            if path.match(pattern):
                return True
        elif pattern in parts:  # Directory or filename
            return True
    return False


def get_file_encoding(filepath):
    """Try to detect file encoding."""
    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']
    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                f.read(1024)  # Test read
            return encoding
        except (UnicodeDecodeError, UnicodeError):
            continue
    return 'utf-8'  # Fallback; callers read with errors='ignore'


def clone_repository(repo_url, temp_dir, branch=None):
    """Clone the GitHub repository using git."""
    print(f"Cloning repository: {repo_url}")
    cmd = ['git', 'clone', '--depth', '1']
    if branch:
        cmd.extend(['-b', branch])
    cmd.extend([repo_url, temp_dir])
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(f"Successfully cloned to {temp_dir}")
        return True
    except FileNotFoundError:
        print("\nERROR: git is not installed or not in your PATH.")
        print("Please install git: https://git-scm.com/downloads")
        return False
    except subprocess.CalledProcessError as e:
        print("\nERROR: Failed to clone repository.")
        if "Repository not found" in e.stderr:
            print("The repository may be private or doesn't exist.")
        else:
            print(f"Git error: {e.stderr}")
        return False


def process_repository(repo_path, files_to_process, output_file, options):
    """Process files and write to output file with enhanced formatting."""
    processed_files = 0
    error_files = 0
    # Track content hashes for duplicate detection
    content_hashes = {}
    duplicate_files = []
    with open(output_file, 'w', encoding='utf-8') as out:
        # Write enhanced header with metadata
        out.write("=" * 80 + "\n")
        out.write("GitHub Repository Contents\n")
        out.write(f"Repository: {repo_path.name}\n")
        out.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        out.write(f"Total files to process: {len(files_to_process)}\n")
        out.write("=" * 80 + "\n\n")
        # Write table of contents if requested
        if options.get('toc', False):
            out.write("TABLE OF CONTENTS\n")
            out.write("-" * 40 + "\n")
            for i, filepath in enumerate(files_to_process, 1):
                rel_path = filepath.relative_to(repo_path)
                out.write(f"{i:4}. {rel_path.as_posix()}\n")
            out.write("\n" + "=" * 80 + "\n\n")
        # Process files with progress indicator
        iterator = tqdm(files_to_process, desc="Processing", unit="file", ncols=100) if HAS_TQDM else files_to_process
        for filepath in iterator:
            try:
                rel_path = filepath.relative_to(repo_path)
                file_size = filepath.stat().st_size
                # Detect encoding
                encoding = get_file_encoding(filepath)
                # Read content
                with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
                    content = f.read()
                # Check for duplicates
                content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
                if content_hash in content_hashes:
                    duplicate_files.append((rel_path, content_hashes[content_hash]))
                    if options.get('skip_duplicates', False):
                        continue
                content_hashes[content_hash] = rel_path
                # Determine file type/language for syntax hint
                extension = filepath.suffix.lower()
                # Write enhanced file header
                out.write("#" * 80 + "\n")
                out.write(f"# File: {rel_path.as_posix()}\n")
                out.write(f"# Size: {format_size(file_size)}\n")
                out.write(f"# Encoding: {encoding}\n")
                if extension:
                    out.write(f"# Type: {extension[1:]}\n")
                out.write("#" * 80 + "\n\n")
                # Add language hint for potential syntax highlighting
                if options.get('markdown', False) and extension in {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs'}:
                    lang = extension[1:]
                    out.write(f"```{lang}\n")
                # Write content with optional line numbers
                if options.get('line_numbers', False):
                    lines = content.splitlines()
                    width = len(str(len(lines)))
                    for i, line in enumerate(lines, 1):
                        out.write(f"{i:>{width}} | {line}\n")
                else:
                    out.write(content)
                    if not content.endswith('\n'):
                        out.write('\n')
                if options.get('markdown', False) and extension in {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs'}:
                    out.write("```\n")
                out.write("\n\n")
                processed_files += 1
            except Exception as e:
                error_files += 1
                out.write(f"ERROR: Could not read {rel_path.as_posix()}: {e}\n\n")
        # Write summary
        out.write("=" * 80 + "\n")
        out.write("PROCESSING SUMMARY\n")
        out.write("=" * 80 + "\n")
        out.write(f"Files processed successfully: {processed_files}\n")
        out.write(f"Files with errors: {error_files}\n")
        if duplicate_files:
            out.write(f"\nDuplicate files detected: {len(duplicate_files)}\n")
            for dup, original in duplicate_files[:5]:  # Show first 5
                out.write(f" - {dup} (duplicate of {original})\n")
            if len(duplicate_files) > 5:
                out.write(f" ... and {len(duplicate_files) - 5} more\n")
        out.write("=" * 80 + "\n")
    return processed_files, error_files


def parse_github_url(url):
    """Parse various GitHub URL formats."""
    url = url.strip()
    # Handle git@ SSH URLs
    if url.startswith('git@github.com:'):
        url = url.replace('git@github.com:', 'https://github.com/')
    # Remove .git suffix
    if url.endswith('.git'):
        url = url[:-4]
    # Handle short format (owner/repo)
    if '/' in url and not url.startswith(('http', 'git@')):
        return f"https://github.com/{url}.git"
    # Handle full URLs
    if 'github.com' in url:
        return f"{url}.git" if not url.endswith('.git') else url
    return url
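# Illustrative normalizations produced by parse_github_url (derived from the branches above):
#   'owner/repo'                     -> 'https://github.com/owner/repo.git'
#   'git@github.com:owner/repo.git'  -> 'https://github.com/owner/repo.git'
#   'https://github.com/owner/repo'  -> 'https://github.com/owner/repo.git'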


def main():
    parser = argparse.ArgumentParser(
        description='Download a GitHub repository and combine all text files into one output file.',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('repo', help='GitHub repository (owner/repo or URL)')
    parser.add_argument('-o', '--output', default='repository_contents.txt',
                        help='Output file name (default: repository_contents.txt)')
    parser.add_argument('-b', '--branch', help='Specific branch to clone')
    parser.add_argument('--max-file-size', type=int, default=10,
                        help='Max file size in MB (default: 10)')
    parser.add_argument('--total-size-limit', type=int, default=100,
                        help='Warning threshold for total size in MB (default: 100)')
    parser.add_argument('--line-numbers', action='store_true',
                        help='Add line numbers to output')
    parser.add_argument('--toc', action='store_true',
                        help='Add table of contents at the beginning')
    parser.add_argument('--markdown', action='store_true',
                        help='Add markdown code blocks for better formatting')
    parser.add_argument('--skip-duplicates', action='store_true',
                        help='Skip duplicate files (same content)')
    parser.add_argument('--exclude-dir', action='append', default=[],
                        help='Directory to exclude (repeatable)')
    parser.add_argument('--exclude-ext', action='append', default=[],
                        help='File extension to exclude (repeatable)')
    parser.add_argument('--include-only-ext', action='append', default=[],
                        help='Process ONLY these extensions (repeatable)')
    parser.add_argument('--keep-temp', action='store_true',
                        help='Keep temporary clone after processing')
    parser.add_argument('--stats', action='store_true',
                        help='Show detailed statistics after processing')
    args = parser.parse_args()
    # Setup
    repo_url = parse_github_url(args.repo)
    max_file_size_bytes = args.max_file_size * 1024 * 1024
    total_size_limit_bytes = args.total_size_limit * 1024 * 1024
    # Build exclusion patterns
    exclude_patterns = DEFAULT_EXCLUDE_PATTERNS.copy()
    for d in args.exclude_dir:
        exclude_patterns.add(d)
    for ext in args.exclude_ext:
        exclude_patterns.add(f"*{ext if ext.startswith('.') else '.' + ext}")
    temp_dir = tempfile.mkdtemp(prefix='github_repo_')
    try:
        # Clone repository
        if not clone_repository(repo_url, temp_dir, args.branch):
            return 1
        repo_path = Path(temp_dir)
        # Scan repository
        print("\nScanning repository...")
        files_to_process = []
        total_size = 0
        skipped_count = 0
        stats = {'by_extension': {}, 'by_size': {'<1KB': 0, '1-10KB': 0, '10-100KB': 0, '100KB-1MB': 0, '>1MB': 0}}
        all_files = [f for f in repo_path.rglob('*') if f.is_file()]
        iterator = tqdm(all_files, desc="Scanning", unit="file", ncols=100) if HAS_TQDM else all_files
        for filepath in iterator:
            rel_path = filepath.relative_to(repo_path)
            file_size = filepath.stat().st_size
            # Track statistics
            ext = filepath.suffix.lower() or 'no_extension'
            stats['by_extension'][ext] = stats['by_extension'].get(ext, 0) + 1
            if file_size < 1024:
                stats['by_size']['<1KB'] += 1
            elif file_size < 10240:
                stats['by_size']['1-10KB'] += 1
            elif file_size < 102400:
                stats['by_size']['10-100KB'] += 1
            elif file_size < 1048576:
                stats['by_size']['100KB-1MB'] += 1
            else:
                stats['by_size']['>1MB'] += 1
            # Apply filters
            if should_exclude(rel_path, exclude_patterns):
                skipped_count += 1
                continue
            if file_size > max_file_size_bytes:
                skipped_count += 1
                continue
            # Include-only filter
            if args.include_only_ext:
                ext = filepath.suffix.lower()
                if not any(ext == (e if e.startswith('.') else f'.{e}') for e in args.include_only_ext):
                    skipped_count += 1
                    continue
            if not is_likely_text_file(filepath):
                skipped_count += 1
                continue
            files_to_process.append(filepath)
            total_size += file_size
        print(f"\nFound {len(all_files)} total files")
        print(f"Will process {len(files_to_process)} text files")
        print(f"Skipped {skipped_count} files (binary/excluded/oversized)")
        print(f"Total size to process: {format_size(total_size)}")
        # Show statistics if requested
        if args.stats:
            print("\n" + "=" * 40)
            print("FILE STATISTICS")
            print("=" * 40)
            print("\nTop 10 extensions by count:")
            sorted_exts = sorted(stats['by_extension'].items(), key=lambda x: x[1], reverse=True)
            for ext, count in sorted_exts[:10]:
                print(f" {ext:15} {count:5} files")
            print("\nFile size distribution:")
            for size_range, count in stats['by_size'].items():
                print(f" {size_range:15} {count:5} files")
        # Size warning
        if total_size > total_size_limit_bytes:
            print(f"\n⚠️ WARNING: Total size ({format_size(total_size)}) exceeds limit ({format_size(total_size_limit_bytes)})")
            if input("Continue anyway? (y/n): ").lower() != 'y':
                print("Cancelled.")
                return 1
        # Process files
        if files_to_process:
            print(f"\nWriting to {args.output}...")
            options = {
                'line_numbers': args.line_numbers,
                'toc': args.toc,
                'markdown': args.markdown,
                'skip_duplicates': args.skip_duplicates
            }
            processed, errors = process_repository(repo_path, files_to_process, args.output, options)
            # Summary
            print(f"\n{'=' * 50}")
            print("✅ COMPLETE!")
            print(f" Processed: {processed} files")
            print(f" Errors: {errors} files")
            print(f" Output: {args.output}")
            print(f" Size: {format_size(Path(args.output).stat().st_size)}")
            print(f"{'=' * 50}")
        else:
            print("\n⚠️ No text files found to process!")
        return 0
    except KeyboardInterrupt:
        print("\n\n❌ Cancelled by user")
        return 1
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        if not args.keep_temp and os.path.exists(temp_dir):
            print("\nCleaning up...")
            shutil.rmtree(temp_dir)


if __name__ == "__main__":
    sys.exit(main())