Created
March 13, 2024 13:29
-
-
Save ondrejsojka/a63369a00c94039e543d0b27e2ea1f9b to your computer and use it in GitHub Desktop.
This script merges the files in a given directory into a single file, prefixing each file's contents with its name and separating the files with `---`. The purpose is to make it easy to feed nearly an entire codebase to an LLM.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import argparse | |
import re | |
import pathspec | |
import subprocess | |
def load_gitignore(root_dir):
    """Build a pathspec matcher from the ``.gitignore`` file in *root_dir*.

    If the file cannot be opened or read, a warning is printed and an empty
    matcher (built from no patterns) is returned instead.
    """
    ignore_path = os.path.join(root_dir, '.gitignore')
    try:
        with open(ignore_path, 'r') as handle:
            return pathspec.PathSpec.from_lines('gitwildmatch', handle)
    except IOError:
        print(f"Warning: Unable to read .gitignore in {root_dir}, ignoring .gitignore rules.")
    # Fallback: a spec with no patterns.
    return pathspec.PathSpec.from_lines('gitwildmatch', [])
def concatenate_files(root_dir, output_file_path, file_suffixes, startswith, poe_flag):
    """Merge the matching files under *root_dir* into a single text file.

    Every included file is written as a ``---`` separator line, its path
    relative to *root_dir*, its contents, and one trailing newline. Once the
    output is complete, the external ``claude_token_counter`` command is run
    on it and its standard output is printed.

    Args:
        root_dir: Directory to walk. The process exits with status 1 if it
            is not an existing directory.
        output_file_path: File to (over)write with the merged content. The
            process exits with status 1 if it cannot be written.
        file_suffixes: File-name endings to include. When empty, a warning
            is printed and no suffix filtering is applied.
        startswith: File-name prefixes to include. When empty, no prefix
            filtering is applied.
        poe_flag: When true, ``https://`` is rewritten to ``https-//`` in
            the copied text.
    """
    if not os.path.isdir(root_dir):
        print(f"Error: The specified directory '{root_dir}' does not exist.")
        sys.exit(1)
    if not file_suffixes:
        print("Warning: No file suffixes specified.")

    ignore_spec = load_gitignore(root_dir)
    # str.endswith / str.startswith accept a tuple of alternatives.
    suffixes = tuple(file_suffixes or ())
    prefixes = tuple(startswith or ())
    # Resolved once so the output file can be recognised inside the walk.
    output_real_path = os.path.realpath(output_file_path)

    try:
        # Explicit UTF-8 keeps the result independent of the platform locale.
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for root, dirs, files in os.walk(root_dir):
                # Prune in place so os.walk does not descend into any
                # directory whose name starts with ".git" (this also covers
                # names such as ".github").
                dirs[:] = [d for d in dirs if not d.startswith('.git')]
                for file in files:
                    if suffixes and not file.endswith(suffixes):
                        continue
                    if prefixes and not file.startswith(prefixes):
                        continue
                    file_path = os.path.join(root, file)
                    # Never feed the output file back into itself when it
                    # lives inside the directory being walked.
                    if os.path.realpath(file_path) == output_real_path:
                        continue
                    relative_path = os.path.relpath(file_path, root_dir)
                    # .gitignore patterns are relative to the directory that
                    # holds the .gitignore, so match on the relative path
                    # rather than on the root-prefixed one.
                    if ignore_spec.match_file(relative_path):
                        continue
                    output_file.write(f'---\n{relative_path}\n')
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as input_file:
                            for line in input_file:
                                if poe_flag:
                                    line = line.replace('https://', 'https-//')
                                output_file.write(line)
                        output_file.write('\n')
                    except IOError:
                        print(f"Warning: Failed to read file {file_path}")
    except IOError:
        print(f"Error: Unable to write to output file '{output_file_path}'")
        sys.exit(1)

    try:
        result = subprocess.run(
            ['claude_token_counter', output_file_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers the counter executable being missing or not
        # runnable; CalledProcessError covers a non-zero exit status.
        print(f"Error: Failed to run token counter: {e}")
    else:
        print(result.stdout.strip())
def main():
    """Parse the command-line arguments and run the concatenation."""
    cli = argparse.ArgumentParser(
        description="Concatenate files in a directory, excluding those specified in .gitignore"
    )
    cli.add_argument('root_dir', type=str, help='Root directory to search for files')
    cli.add_argument('output_file', type=str, help='Path to the output file')
    # The two multi-valued filters share the same shape; only flag and help differ.
    for flag, help_text in (
        ('--filetype', 'File extensions to filter for concatenation (accepts multiple)'),
        ('--startswith', 'File name prefixes to filter for concatenation (accepts multiple)'),
    ):
        cli.add_argument(flag, type=str, nargs='+', default=[], help=help_text)
    cli.add_argument('--poe', action='store_true', help='Enable Poe URL filtering')

    options = cli.parse_args()
    concatenate_files(
        root_dir=options.root_dir,
        output_file_path=options.output_file,
        file_suffixes=options.filetype,
        startswith=options.startswith,
        poe_flag=options.poe,
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment