Created
April 1, 2025 14:17
-
-
Save ondrejsojka/8b053c17f659d51ec74ef10e1db5faa9 to your computer and use it in GitHub Desktop.
Script to concatenate all files in a directory into one .md file. Supports an optional inclusion filter based on gpt-4o-mini.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import argparse | |
import re | |
import pathspec | |
import subprocess | |
from typing import Optional | |
from openai import OpenAI | |
# Shared OpenAI client. It is created lazily by _get_client() because
# OpenAI() raises when no API key is configured, and plain concatenation
# (no --inclusion_prompt) must keep working in that situation.
client = None


def _get_client():
    """Return the module-level OpenAI client, creating it on first use."""
    global client
    if client is None:
        client = OpenAI()
    return client


def evaluate_inclusion(contents: str, systemprompt: str) -> bool:
    """Ask gpt-4o-mini whether a file should be included in the output.

    Args:
        contents: Text of the candidate file, sent as the user message.
        systemprompt: Caller-supplied inclusion/exclusion instructions, sent
            as a second system message after the fixed classifier prompt.

    Returns:
        True when the model answers ``y`` (include) and False when it
        answers ``n`` (exclude). Any other reply is printed and treated as
        "include", so an unexpected answer never silently drops a file.
    """
    completion = _get_client().chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a classifier and decide whether a given file should be included or excluded based on the following instructions. Respond ONLY with y or n. y means include, n means exclude."},
            {"role": "system", "content": systemprompt},
            {"role": "user", "content": contents}
        ],
        # NOTE(review): token ids are taken from the original author's
        # comment; confirm them against the tokenizer gpt-4o-mini uses.
        logit_bias={88: 100, 77: 100, 297: 100, 342: 100},  # only 'y' or 'n' or ' y' or ' n'
        max_tokens=1,
        temperature=0.1
    )
    res = completion.choices[0].message.content
    if res in ('y', ' y'):
        return True
    if res in ('n', ' n'):
        return False
    # Fail open: report the unexpected reply and keep the file.
    print("Received from OpenAI as classifier response: ", res)
    return True
def load_gitignore(root_dir):
    """Build a pathspec matcher from the ``.gitignore`` file in *root_dir*.

    If the file cannot be opened or read, a warning is printed and an empty
    matcher (one built from no patterns) is returned instead.
    """
    gitignore_path = os.path.join(root_dir, '.gitignore')
    try:
        with open(gitignore_path, 'r') as handle:
            return pathspec.PathSpec.from_lines('gitwildmatch', handle)
    except IOError:
        print(f"Warning: Unable to read .gitignore in {root_dir}, ignoring .gitignore rules.")
    return pathspec.PathSpec.from_lines('gitwildmatch', [])
def _report_token_counts(output_file_path):
    """Print Claude and OpenAI token counts for the written file (best effort)."""
    try:
        result = subprocess.run(['claude_token_counter', output_file_path], capture_output=True, text=True, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the counter executable being missing or not runnable,
        # which subprocess reports as FileNotFoundError/PermissionError.
        print(f"Error: Failed to run token counter: {e}")
    else:
        print("Claude token count:", result.stdout.strip())

    try:
        import tiktoken
    except ImportError as e:
        # The output file is already written; a missing optional counter
        # should not turn a successful run into a traceback.
        print(f"Warning: tiktoken is not available, skipping OpenAI token count: {e}")
        return
    encoding = tiktoken.encoding_for_model('gpt-4o-mini')
    with open(output_file_path) as written_file:
        text = written_file.read()
    # disallowed_special=() makes strings such as "<|endoftext|>" inside the
    # concatenated files count as ordinary text instead of raising ValueError.
    print("OpenAI 4o token count ", str(len(encoding.encode(text, disallowed_special=()))))


def concatenate_files(root_dir, output_file_path, file_suffixes, startswith, poe_flag, inclusion_prompt: Optional[str]):
    """Concatenate the selected files under *root_dir* into one output file.

    Each included file is written as a ``---`` line, its path relative to
    *root_dir*, its content, and a trailing newline. Token counts for the
    finished file are printed afterwards.

    Args:
        root_dir: Directory to walk. The process exits with status 1 if it
            is not an existing directory.
        output_file_path: File to (over)write with the concatenated text.
            It is never included in its own output.
        file_suffixes: File-name suffixes to accept; empty accepts all.
        startswith: File-name prefixes to accept; empty accepts all.
        poe_flag: When true, ``https://`` is rewritten to ``https-//`` in
            the emitted content.
        inclusion_prompt: Optional instructions; when given, each candidate
            file is passed to ``evaluate_inclusion`` and skipped on False.

    Files matching the ``.gitignore`` in *root_dir* and directories whose
    name starts with ``.git`` are skipped. Unreadable input files produce a
    warning; failure to write the output exits with status 1.
    """
    if not os.path.isdir(root_dir):
        print(f"Error: The specified directory '{root_dir}' does not exist.")
        sys.exit(1)

    if not file_suffixes:
        print("Warning: No file suffixes specified.")

    ignore_spec = load_gitignore(root_dir)
    output_realpath = os.path.realpath(output_file_path)
    # str.endswith/startswith accept a tuple of alternatives.
    suffixes = tuple(file_suffixes or ())
    prefixes = tuple(startswith or ())

    def file_filter(file_path, relative_path):
        name = os.path.basename(file_path)
        if suffixes and not name.endswith(suffixes):
            return False
        if prefixes and not name.startswith(prefixes):
            return False
        # Never feed the output file back into itself when it lives in root_dir.
        if os.path.realpath(file_path) == output_realpath:
            return False
        # .gitignore patterns are relative to the directory holding the
        # .gitignore, so match on the root-relative path, not the joined one.
        return not ignore_spec.match_file(relative_path)

    def process_file_content(content):
        return content.replace('https://', 'https-//') if poe_flag else content

    try:
        with open(output_file_path, 'w') as output_file:
            for root, dirs, files in os.walk(root_dir):
                # Prune in place so os.walk does not descend into directories
                # whose name starts with '.git'; sort for a stable order.
                dirs[:] = sorted(d for d in dirs if not d.startswith('.git'))
                for file_name in sorted(files):
                    file_path = os.path.join(root, file_name)
                    relative_path = os.path.relpath(file_path, root_dir)
                    if not file_filter(file_path, relative_path):
                        continue
                    # Only the read is guarded here, so a write failure is
                    # reported by the outer handler rather than as a read error.
                    try:
                        with open(file_path, 'r', errors='ignore') as input_file:
                            content = input_file.read()
                    except IOError:
                        print(f"Warning: Failed to read file {file_path}")
                        continue
                    if inclusion_prompt and not evaluate_inclusion(content, inclusion_prompt):
                        continue
                    output_file.write(f'---\n{relative_path}\n')
                    output_file.write(process_file_content(content))
                    output_file.write('\n')
    except IOError:
        print(f"Error: Unable to write to output file '{output_file_path}'")
        sys.exit(1)

    _report_token_counts(output_file_path)
def _build_arg_parser():
    """Create the command-line parser for the concatenation script."""
    cli = argparse.ArgumentParser(
        description="Concatenate files in a directory, excluding those specified in .gitignore"
    )
    # Positional arguments.
    cli.add_argument('root_dir', type=str, help='Root directory to search for files')
    cli.add_argument('output_file', type=str, help='Path to the output file')
    # Optional file-name filters; each accepts one or more values.
    cli.add_argument(
        '--filetype', type=str, nargs='+', default=[],
        help='File extensions to filter for concatenation (accepts multiple)',
    )
    cli.add_argument(
        '--startswith', type=str, nargs='+', default=[],
        help='File name prefixes to filter for concatenation (accepts multiple)',
    )
    cli.add_argument('--poe', action='store_true', help='Enable Poe URL filtering')
    cli.add_argument('--inclusion_prompt', type=str, help='Prompt for 4o mini to decide file inclusion')
    return cli


def main():
    """Parse the command line and run the concatenation with those options."""
    options = _build_arg_parser().parse_args()
    concatenate_files(
        root_dir=options.root_dir,
        output_file_path=options.output_file,
        file_suffixes=options.filetype,
        startswith=options.startswith,
        poe_flag=options.poe,
        inclusion_prompt=options.inclusion_prompt,
    )


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment