Skip to content

Instantly share code, notes, and snippets.

@stuaxo
Last active September 10, 2024 16:31
Show Gist options
  • Save stuaxo/f1beb981dc4845921c31fcb4e16f4821 to your computer and use it in GitHub Desktop.
Save stuaxo/f1beb981dc4845921c31fcb4e16f4821 to your computer and use it in GitHub Desktop.
Output files in subdirectories for ingestion by an LLM such as Claude, ChatGPT, etc.
#!/usr/bin/env python3
# Usage: python dirtollm.py [files or glob patterns...] [options]
# Example: python dirtollm.py "*.py" "*.txt" /path/to/specific/file.py --exclude "*.pyc" --copy --verbose -x --binaries
import argparse
import pathlib
import fnmatch
import sys
import os
from typing import List, Tuple, Optional
# pyperclip is an optional dependency: it is only needed for --copy.
try:
    import pyperclip
except ImportError:
    pyperclip = None  # Keep linter happy
    PYPERCLIP_AVAILABLE = False
else:
    PYPERCLIP_AVAILABLE = True
class FileProcessingError(Exception):
    """Raised to abort processing when a file fails and exit-on-error is set."""
def get_file_content(
    path: pathlib.Path,
    errors: str,
    verbose: bool,
    include_binaries: bool,
    include_empty: bool,
) -> Tuple[Optional[str], Optional[Exception]]:
    """Read *path* as UTF-8 text and decide whether it should be emitted.

    Args:
        path: File to read.
        errors: Decoding error handler passed to ``Path.read_text``
            (e.g. ``"strict"``, ``"replace"``).
        verbose: When true, append the exception details to error messages.
        include_binaries: When false, files containing NUL characters or
            failing to decode are skipped.
        include_empty: When false, files whose content is empty or only
            whitespace are skipped.

    Returns:
        A ``(content, error)`` pair:

        * ``(text, None)`` -- the file was read and should be emitted.
        * ``(None, None)`` -- the file is skipped (empty or binary).
        * ``(message, exception)`` -- reading failed; *message* is a
          placeholder describing the failure and *exception* is its cause.
    """
    try:
        # Decode explicitly as UTF-8 so results do not depend on the
        # platform's locale encoding.
        content = path.read_text(encoding="utf-8", errors=errors)
    except UnicodeDecodeError as ude:
        # Only reachable with errors="strict"; other handlers never raise.
        if not include_binaries:
            return None, None
        error_msg = f"#:{path}: Binary file\n"
        if verbose:
            error_msg += f"UnicodeDecodeError details: {ude}\n"
        return f"{error_msg}\n", ude
    except Exception as ex:
        # Deliberately broad: any read failure is reported, not fatal here.
        error_msg = f"#:{path}: Read error\n"
        if verbose:
            error_msg += f"Error details: {ex}\n"
        return f"{error_msg}\n", ex

    if not include_empty and not content.strip():
        return None, None
    # A NUL character is used as the marker for binary content.
    if not include_binaries and "\0" in content:
        return None, None
    return content, None
def fn_matches_multiple(file: str, patterns: List[str]) -> bool:
    """Return True if *file* matches at least one fnmatch-style pattern."""
    for candidate in patterns:
        if fnmatch.fnmatch(file, candidate):
            return True
    return False
def process_path(
    path: pathlib.Path,
    globs: List[str],
    excludes: List[str],
    listing: bool,
    errors: str,
    verbose: bool,
    exit_on_error: bool,
    include_binaries: bool,
    include_empty: bool,
) -> Tuple[str, int]:
    """Collect output for a single file, or recursively for a directory.

    Args:
        path: File or directory to process. Anything else yields no output.
        globs: File-name patterns a file must match; an empty list matches
            every file.
        excludes: File-name patterns that cause a file to be skipped.
        listing: When true, emit only the path of each matching file.
        errors: Decoding error handler forwarded to ``get_file_content``.
        verbose: Forwarded to ``get_file_content``.
        exit_on_error: When true, raise on the first file that fails to read.
        include_binaries: Forwarded to ``get_file_content``.
        include_empty: Forwarded to ``get_file_content``.

    Returns:
        ``(output, file_count)`` -- the text produced and the number of files
        that contributed to it.

    Raises:
        FileProcessingError: A file could not be read and *exit_on_error*
            is set.
    """
    if path.is_dir():
        chunks = []
        file_count = 0
        # iterdir() yields entries in arbitrary order; sort so that the
        # output is stable across runs and filesystems.
        for child in sorted(path.iterdir()):
            child_output, child_count = process_path(
                child, globs, excludes, listing, errors, verbose,
                exit_on_error, include_binaries, include_empty
            )
            chunks.append(child_output)
            file_count += child_count
        return "".join(chunks), file_count

    if not path.is_file():
        return "", 0

    # Patterns are matched against the bare file name, not the full path.
    if globs and not fn_matches_multiple(path.name, globs):
        return "", 0
    if fn_matches_multiple(path.name, excludes):
        return "", 0

    if listing:
        return f"{path}\n", 1

    file_output, error = get_file_content(
        path, errors, verbose, include_binaries, include_empty
    )
    output = ""
    file_count = 0
    if file_output is not None:
        # Normalise to exactly one blank line after each file's content.
        output = f"#:{path}:\n" + file_output.rstrip("\n") + "\n\n"
        file_count = 1
    if error is not None and exit_on_error:
        raise FileProcessingError(
            f"Exiting due to error in file: {path}"
        ) from error
    return output, file_count
def dirtollm(
    paths: List[pathlib.Path],
    globs: List[str],
    excludes: List[str],
    listing: bool = False,
    errors: str = "replace",
    verbose: bool = False,
    exit_on_error: bool = False,
    include_binaries: bool = False,
    include_empty: bool = False,
) -> Tuple[str, int]:
    """Process every entry in *paths* and concatenate the results.

    Each path is handed to ``process_path`` with the same filtering and
    reading options. Returns ``(output, total_file_count)``, where *output*
    is the per-path output joined in the order the paths were given.
    """
    pieces = []
    total_file_count = 0
    for entry in paths:
        text, found = process_path(
            entry,
            globs,
            excludes,
            listing,
            errors,
            verbose,
            exit_on_error,
            include_binaries,
            include_empty,
        )
        pieces.append(text)
        total_file_count += found
    return "".join(pieces), total_file_count
def main():
    """Command-line entry point: parse arguments, gather output, emit it.

    Positional arguments that exist on disk are used as paths to process.
    Any other positional argument is treated as a glob pattern: it is added
    to the file-name filters and the current working directory becomes a
    search root. With no positional arguments, every file under the current
    directory is processed.

    Depending on the options, the result is printed to stdout, copied to the
    clipboard (``--copy``), or summarised as counts only (``--count``).
    Exits with status 1 when ``dirtollm`` raises ``FileProcessingError``.
    """
    parser = argparse.ArgumentParser(
        description="Process files based on specified paths or glob patterns.",
        epilog='Example: python dirtollm.py "*.py" "*.txt" /path/to/specific/file.py --exclude "*.pyc" --copy --verbose -x --binaries',
    )
    parser.add_argument("paths", nargs="*", help="Files, directories, or glob patterns to process")
    parser.add_argument(
        "--exclude", nargs="+", help="Glob patterns to exclude", default=[]
    )
    parser.add_argument(
        "--prompt",
        nargs="?",
        const="File contents:",
        help="Specify prompt text to output before the files",
    )
    parser.add_argument(
        "--count",
        action="store_true",
        help="Display the count of files, bytes, and tokens processed",
    )
    parser.add_argument(
        "--copy",
        action="store_true",
        help="Copy output to the clipboard instead of printing to stdout",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List all files that match the patterns without showing their contents",
    )
    parser.add_argument(
        "--errors",
        choices=["strict", "ignore", "replace", "backslashreplace"],
        default="replace",
        help="Specify how encoding errors are handled (default: replace)",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Enable verbose output for errors"
    )
    parser.add_argument(
        "-x",
        "--exit-on-error",
        action="store_true",
        help="Exit on first error encountered",
    )
    parser.add_argument(
        "--binaries", action="store_true", help="Include non unicode files"
    )
    parser.add_argument("--empty", action="store_true", help="Include empty files")
    args = parser.parse_args()

    paths = []
    globs = []
    if not args.paths:
        paths = [pathlib.Path(".")]
        globs = ["*"]
    else:
        for path_or_glob in args.paths:
            path = pathlib.Path(path_or_glob)
            if path.exists():
                paths.append(path.resolve())
            else:
                # Not an existing path: treat it as a glob pattern to be
                # matched under the current working directory.
                paths.append(pathlib.Path.cwd())
                globs.append(path_or_glob)
        # Every glob pattern adds the working directory as a root; without
        # de-duplication it would be walked once per pattern and each
        # matching file would be emitted several times. dict.fromkeys keeps
        # the first occurrence of each path in its original position.
        paths = list(dict.fromkeys(paths))

    try:
        output, file_count = dirtollm(
            paths,
            globs,
            args.exclude,
            listing=args.list,
            errors=args.errors,
            verbose=args.verbose,
            exit_on_error=args.exit_on_error,
            include_binaries=args.binaries,
            include_empty=args.empty,
        )
    except FileProcessingError as fpe:
        print(f"Error: {fpe}", file=sys.stderr)
        sys.exit(1)

    if args.prompt:
        output = f"{args.prompt}\n\n{output}"
    output = output.rstrip("\n")

    byte_count = len(output.encode("utf-8"))
    # Rough estimate only: counts whitespace-separated words, not real tokens.
    token_count = len(output.split())

    if args.count:
        print(
            f"Processed {file_count} files, {byte_count} bytes, ~{token_count} tokens."
        )
    elif args.copy:
        if PYPERCLIP_AVAILABLE:
            pyperclip.copy(output)
            print(
                f"Copied to clipboard: {file_count} files, {byte_count} bytes, ~{token_count} tokens."
            )
        else:
            print(
                "Error: --copy requires pyperclip module. Falling back to stdout.",
                file=sys.stderr,
            )
            print(output)
    else:
        print(output)
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
@amigax
Copy link

amigax commented May 21, 2024

very cool dear.

@stuaxo
Copy link
Author

stuaxo commented May 21, 2024

very cool dear.

🤣

@stuaxo
Copy link
Author

stuaxo commented Sep 10, 2024

@amigax now show me your batch files, hehe 😋

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment