Last active
May 20, 2024 18:39
-
-
Save albertbuchard/991d36041e1b2c93c00ec1b8d09f2716 to your computer and use it in GitHub Desktop.
textrepo: A Python Script to Concatenate All Files in a Repository into a Single Text File, Ignoring Specified Patterns
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import fnmatch | |
import os | |
""" | |
This Python script, textrepo, concatenates all files within a specified repository into a single text file | |
while respecting .gitignore patterns and additional specified ignore patterns. It prints the formatted content | |
to both a specified output file and standard output. This is useful for reviewing all content within a repository | |
in a structured format, excluding unwanted files and directories such as node_modules, dist, build, and others. | |
Usage: | |
textrepo <repository_root_directory> <output_file_path> | |
Example: | |
textrepo ~/path/to/my-repo my-repo-content.txt | |
Ignore Patterns: | |
- Reads patterns from .gitignore if present | |
- Additional default patterns: .git, *.pyc, __pycache__, package-lock.json, node_modules, dist, build, venv | |
""" | |
def parse_gitignore(gitignore_path):
    """Read ignore patterns from a .gitignore-style file.

    Surrounding whitespace is stripped from every line; blank lines and
    comment lines (first non-blank character ``#``) are dropped.  A trailing
    ``/`` is removed from directory entries such as ``build/`` because the
    patterns are later matched with ``fnmatch`` against relative paths that
    never end in a separator, so the slash would prevent any match.

    Args:
        gitignore_path: Path of the ignore file to read.

    Returns:
        A list of pattern strings in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    patterns = []
    # Explicit encoding keeps the result independent of the platform default;
    # undecodable bytes are replaced instead of aborting the whole run.
    with open(gitignore_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            entry = line.strip()
            # Test for '#' *after* stripping so indented comments are skipped too.
            if not entry or entry.startswith('#'):
                continue
            trimmed = entry.rstrip('/')
            # Keep the raw entry when trimming would leave nothing or a bare
            # wildcard ("*/" -> "*"), which would otherwise match every path.
            if trimmed.strip('*'):
                entry = trimmed
            patterns.append(entry)
    return patterns
def is_ignored(file_path, ignore_patterns):
    """Return True if a relative path is excluded by any ignore pattern.

    A path is ignored when any pattern matches (``fnmatch`` semantics, where
    ``*`` also spans path separators):

    * one of its leading prefixes (``a``, ``a/b``, ``a/b/c`` ...), so that
      everything below an ignored directory is ignored as well, or
    * one of its individual components, so that a bare name such as
      ``__pycache__`` or ``.env`` is honoured at any depth and not only at
      the top level.

    Args:
        file_path: Path relative to the repository root, using ``os.sep``.
        ignore_patterns: Iterable of fnmatch-style pattern strings.

    Returns:
        True if the path should be skipped, False otherwise.
    """
    parts = file_path.split(os.sep)
    # Every leading prefix of the path; the last one is the full path itself,
    # so no separate whole-path check is needed.
    candidates = {os.sep.join(parts[:end]) for end in range(1, len(parts) + 1)}
    # Single components: a pattern containing a separator can never match
    # these, so adding them only affects bare-name patterns.
    candidates.update(parts)
    return any(
        fnmatch.fnmatch(candidate, pattern)
        for pattern in ignore_patterns
        for candidate in candidates
    )
def get_file_paths(root_dir, ignore_patterns):
    """Collect the non-ignored files below ``root_dir``.

    Walks the tree top-down, pruning every directory for which ``is_ignored``
    returns True and skipping every file for which it returns True.
    Directories and files are visited in sorted order so that repeated runs
    over the same tree produce the same listing.

    Args:
        root_dir: Directory to walk.
        ignore_patterns: Patterns passed through to ``is_ignored``.

    Returns:
        A list of file paths relative to ``root_dir``.
    """
    file_paths = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        rel_dir = os.path.relpath(dirpath, root_dir)
        # The root itself has the relative path "."; testing it would let
        # patterns such as ".*" prune the entire walk, so it is never checked.
        if rel_dir != os.curdir and is_ignored(rel_dir, ignore_patterns):
            dirnames[:] = []  # Stop os.walk from traversing this directory
            continue
        # In-place sort: os.walk descends into dirnames in the order left here.
        dirnames.sort()
        for filename in sorted(filenames):
            file_path = os.path.relpath(os.path.join(dirpath, filename), root_dir)
            if not is_ignored(file_path, ignore_patterns):
                file_paths.append(file_path)
    return file_paths
def read_file(file_path):
    """Return the text content of a file, or None if it cannot be used.

    The file is first probed for NUL bytes: latin-1 can decode any byte
    sequence, so without this check binary files (images, archives, ...)
    would be returned as unreadable text instead of being skipped.  Text
    files are decoded as UTF-8, falling back to latin-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded content, or None when the file looks binary or cannot
        be opened/read.
    """
    try:
        with open(file_path, 'rb') as f:
            # A NUL byte in the leading chunk is treated as a binary marker.
            if b'\0' in f.read(8192):
                return None
    except OSError:
        return None

    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
        except OSError:
            # An I/O failure will not be cured by another encoding.
            return None
    return None
def format_file(file_path):
    """Render one file as a titled section of the combined output.

    The section consists of a header (file name, path as given, and a
    40-character ``=`` rule) followed by the file content and a blank line.

    Args:
        file_path: Path of the file to render; passed to ``read_file``.

    Returns:
        The formatted section, or an empty string when ``read_file``
        returns None for the file.
    """
    body = read_file(file_path)
    if body is not None:
        rule = '=' * 40
        header = f"Title: {os.path.basename(file_path)}\nPath: {file_path}\n{rule}\n"
        return "".join((header, body, '\n\n'))
    return ""
def format_repository(root_dir, output_file_path):
    """Concatenate a repository's files into one text document.

    Patterns from ``<root_dir>/.gitignore`` (when that file exists) are
    combined with a built-in default list, the non-ignored files are
    collected, and a document made of a file listing followed by one
    section per file is written to ``output_file_path`` and printed to
    standard output.

    The output file itself is excluded from the listing, so that a result
    left inside the repository by a previous run is not folded back into
    the next one.

    Args:
        root_dir: Root directory of the repository.
        output_file_path: Destination file; overwritten if it exists.
    """
    gitignore_path = os.path.join(root_dir, '.gitignore')
    ignore_patterns = parse_gitignore(gitignore_path) if os.path.isfile(gitignore_path) else []
    ignore_patterns += ['.git', '*.pyc', '__pycache__', 'package-lock.json', 'node_modules', 'dist', 'build',
                        'venv', "*/venv/*", "*/__pycache__/*", "*/.git/*", "*/.idea/*", "*/node_modules/*",
                        "*/dist/*", "*/build/*", "*/package-lock.json"]
    # De-duplicate while keeping a stable order (a set would scramble it).
    ignore_patterns = list(dict.fromkeys(ignore_patterns))

    # Resolve both sides so the comparison is independent of how the paths
    # were spelled (relative, symlinked, ...).
    output_real_path = os.path.realpath(output_file_path)
    file_paths = [
        rel_path
        for rel_path in get_file_paths(root_dir, ignore_patterns)
        if os.path.realpath(os.path.join(root_dir, rel_path)) != output_real_path
    ]

    hierarchy = "Repository File Hierarchy:\n" + '\n'.join(file_paths) + '\n\n' + '=' * 40 + '\n\n'
    # Collect the sections and join once instead of growing a string in a loop.
    sections = [hierarchy]
    sections.extend(format_file(os.path.join(root_dir, rel_path)) for rel_path in file_paths)
    formatted_files = ''.join(sections)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(formatted_files)
    print(formatted_files)
if __name__ == "__main__":
    # Command-line entry point: textrepo <repository_root_directory> <output_file_path>
    import sys

    # sys.exit(<str>) prints the message to stderr and exits with status 1,
    # so failures are visible to calling scripts and do not pollute stdout,
    # which carries the concatenated repository content.
    if len(sys.argv) != 3:
        sys.exit("Usage: textrepo <repository_root_directory> <output_file_path>")

    root_directory = sys.argv[1]
    output_file = sys.argv[2]
    # Expand a leading "~" in case the shell passed it through unexpanded.
    expanded_path = os.path.expanduser(root_directory)
    if not os.path.exists(expanded_path):
        sys.exit(f"Error: {expanded_path} does not exist.")
    if not os.path.isdir(expanded_path):
        sys.exit(f"Error: {expanded_path} is not a directory.")

    format_repository(expanded_path, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you plan on using it often:
Use it:
textrepo ~/my/repo my-repo-content.txt