Skip to content

Instantly share code, notes, and snippets.

@ondrejsojka
Created April 1, 2025 14:17
Show Gist options
  • Save ondrejsojka/8b053c17f659d51ec74ef10e1db5faa9 to your computer and use it in GitHub Desktop.
Save ondrejsojka/8b053c17f659d51ec74ef10e1db5faa9 to your computer and use it in GitHub Desktop.
Script to concatenate all files in a directory into one .md file. Supports an optional inclusion filter based on gpt-4o-mini.
import os
import sys
import argparse
import re
import pathspec
import subprocess
from typing import Optional
from openai import OpenAI
# Module-level OpenAI client, created at import time and used by evaluate_inclusion.
# NOTE(review): no credentials are passed here, so they presumably come from the
# environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()
def evaluate_inclusion(contents: str, systemprompt: str) -> bool:
    """Ask gpt-4o-mini whether a file should be part of the output.

    The request carries a fixed classifier instruction, the caller's
    ``systemprompt`` as a second system message, and the file ``contents`` as
    the user message. The reply is limited to a single token.

    Args:
        contents: Text of the file being classified.
        systemprompt: Caller-supplied inclusion/exclusion criteria.

    Returns:
        True for a reply of 'y' or ' y', False for 'n' or ' n'. Any other
        reply is printed and treated as "include".
    """
    classifier_instructions = "You are a classifier and decide whether a given file should be included or excluded based on the following instructions. Respond ONLY with y or n. y means include, n means exclude."
    conversation = [
        {"role": "system", "content": classifier_instructions},
        {"role": "system", "content": systemprompt},
        {"role": "user", "content": contents},
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        # Token ids intended to be 'y', 'n', ' y' and ' n' so that nothing
        # else can be sampled -- verify against the model's tokenizer.
        logit_bias={88: 100, 77: 100, 297: 100, 342: 100},
        max_tokens=1,
        temperature=0.1,
    )
    answer = response.choices[0].message.content

    verdicts = {'y': True, ' y': True, 'n': False, ' n': False}
    verdict = verdicts.get(answer)
    if verdict is None:
        # Unexpected reply: report it and err on the side of including the file.
        print("Received from OpenAI as classifier response: ", answer)
        return True
    return verdict
def load_gitignore(root_dir):
    """Compile the .gitignore found directly in ``root_dir`` into a matcher.

    Args:
        root_dir: Directory expected to contain a ``.gitignore`` file.

    Returns:
        A ``pathspec.PathSpec`` built with 'gitwildmatch' rules. If the file
        cannot be read, a warning is printed and a spec with no patterns is
        returned instead.
    """
    ignore_path = os.path.join(root_dir, '.gitignore')
    try:
        with open(ignore_path, 'r') as handle:
            return pathspec.PathSpec.from_lines('gitwildmatch', handle)
    except IOError:
        print(f"Warning: Unable to read .gitignore in {root_dir}, ignoring .gitignore rules.")
        return pathspec.PathSpec.from_lines('gitwildmatch', [])
def _report_token_counts(output_file_path):
    """Print Claude and OpenAI token counts for the finished output file."""
    try:
        result = subprocess.run(
            ['claude_token_counter', output_file_path],
            capture_output=True,
            text=True,
            check=True,
        )
        token_count = result.stdout.strip()
        print("Claude token count:", token_count)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the counter not being installed or not executable
        # (subprocess raises FileNotFoundError / PermissionError then), which
        # must not prevent the tiktoken count below from running.
        print(f"Error: Failed to run token counter: {e}")

    import tiktoken
    encoding = tiktoken.encoding_for_model('gpt-4o-mini')
    with open(output_file_path) as written:
        s = written.read()
    print("OpenAI 4o token count ", str(len(encoding.encode(s))))


def concatenate_files(root_dir, output_file_path, file_suffixes, startswith, poe_flag, inclusion_prompt: Optional[str]):
    """Concatenate the matching files under ``root_dir`` into one output file.

    Each included file is written as a ``---`` line, its path relative to
    ``root_dir``, its content, and a trailing newline. Afterwards token counts
    for the output file are printed.

    Args:
        root_dir: Directory to walk. Directories whose name starts with
            ``.git`` are not descended into.
        output_file_path: File to (over)write with the concatenated result.
            If it lies inside ``root_dir`` it is never included in itself.
        file_suffixes: File-name suffixes to keep; empty means keep all.
        startswith: File-name prefixes to keep; empty means keep all.
        poe_flag: When true, every ``https://`` in the content is rewritten
            to ``https-//``.
        inclusion_prompt: Optional criteria passed to ``evaluate_inclusion``;
            files it rejects are skipped.

    Exits the process with status 1 if ``root_dir`` is not a directory or the
    output file cannot be written. Unreadable input files are skipped with a
    warning.
    """
    if not os.path.isdir(root_dir):
        print(f"Error: The specified directory '{root_dir}' does not exist.")
        sys.exit(1)
    if not file_suffixes:
        print("Warning: No file suffixes specified.")

    ignore_spec = load_gitignore(root_dir)
    url_pattern = re.compile(r'https://')
    suffixes = tuple(file_suffixes or ())
    prefixes = tuple(startswith or ())
    # Used to keep the output file from being concatenated into itself.
    output_real_path = os.path.normcase(os.path.realpath(output_file_path))

    def file_filter(file_path):
        name = os.path.basename(file_path)
        if suffixes and not name.endswith(suffixes):
            return False
        if prefixes and not name.startswith(prefixes):
            return False
        if os.path.normcase(os.path.realpath(file_path)) == output_real_path:
            return False
        # .gitignore patterns are relative to the directory holding the
        # .gitignore, so match on the path relative to root_dir; otherwise
        # anchored patterns never match unless root_dir is '.'.
        return not ignore_spec.match_file(os.path.relpath(file_path, root_dir))

    def process_file_content(content):
        return url_pattern.sub('https-//', content) if poe_flag else content

    try:
        with open(output_file_path, 'w') as output_file:
            for root, dirs, files in os.walk(root_dir):
                # Prune in place so os.walk skips these directories; the prefix
                # test also drops names such as .github. Sorting keeps the
                # traversal order deterministic, like the file order below.
                dirs[:] = sorted(d for d in dirs if not d.startswith('.git'))
                for file_name in sorted(files):
                    file_path = os.path.join(root, file_name)
                    if not file_filter(file_path):
                        continue
                    # Only the read is guarded here, so a failure to write the
                    # output is not misreported as a read failure.
                    try:
                        with open(file_path, 'r', errors='ignore') as input_file:
                            content = input_file.read()
                    except IOError:
                        print(f"Warning: Failed to read file {file_path}")
                        continue
                    if inclusion_prompt and not evaluate_inclusion(content, inclusion_prompt):
                        continue
                    relative_path = os.path.relpath(file_path, root_dir)
                    output_file.write(f'---\n{relative_path}\n')
                    output_file.write(process_file_content(content))
                    output_file.write('\n')
    except IOError:
        print(f"Error: Unable to write to output file '{output_file_path}'")
        sys.exit(1)

    _report_token_counts(output_file_path)
def main():
    """Parse the command-line arguments and run ``concatenate_files``."""
    cli = argparse.ArgumentParser(
        description="Concatenate files in a directory, excluding those specified in .gitignore"
    )
    cli.add_argument('root_dir', type=str, help='Root directory to search for files')
    cli.add_argument('output_file', type=str, help='Path to the output file')
    # Both name filters accept one or more values and default to "no filter".
    multi_value_filters = (
        ('--filetype', 'File extensions to filter for concatenation (accepts multiple)'),
        ('--startswith', 'File name prefixes to filter for concatenation (accepts multiple)'),
    )
    for flag, description in multi_value_filters:
        cli.add_argument(flag, type=str, nargs='+', default=[], help=description)
    cli.add_argument('--poe', action='store_true', help='Enable Poe URL filtering')
    cli.add_argument('--inclusion_prompt', type=str, help='Prompt for 4o mini to decide file inclusion')

    options = cli.parse_args()
    concatenate_files(
        options.root_dir,
        options.output_file,
        options.filetype,
        options.startswith,
        options.poe,
        options.inclusion_prompt,
    )


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment