#!/usr/bin/env python3
'''
llmctx.py
---------

Utility for extracting relevant source snippets from a project tree and
building plain-text *context* files that can be pasted into an LLM prompt.

It runs in two mutually-exclusive modes:

Manual mode (default when `.llmctx.toml` exists)
    The config file is a TOML document with two sections:

        [general]
        output_dir = '.llmctx'      # directory where artefacts go
        copy = ['README.md']        # files copied verbatim

        [[concat]]
        input_dir = 'src'           # each regular file under this dir …
        output_file = 'code.txt'    # … is appended to this file

    Every file added to *output_file* is preceded by a header line
    `=== relative/path/to/file ===` to keep provenance clear.

Automatic mode (enabled with `--auto` or when no config is present)
    * The workspace is scanned for files whose suffix matches a whitelist
      (defaults to a set of common source extensions; can be overridden
      by `general.suffix_whitelist` in the config).
    * If executed inside a Git repo we honour `git ls-files`; otherwise a
      recursive walk is performed while skipping *.git* directories.
    * Files are grouped into buckets so that
      - no bucket exceeds `--max-lines` total lines
      - the number of buckets never exceeds `--max-files`
    * Code buckets are written to `ctx_XX.txt` and Markdown buckets to
      `docs_XX.txt` under *output_dir*.
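    For example (sizes assumed for illustration): with the default
    `--max-lines 1500`, three source files of 900, 800 and 400 lines end
    up in two buckets, one holding the 900-line file and one holding the
    800- and 400-line files.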

Command-line usage
------------------

    python3 llmctx.py [-c CONFIG] [--auto]
                      [--max-files N] [--max-lines N]
                      [ROOT]

Arguments
~~~~~~~~~

    -c/--config   Path to the TOML config (default: .llmctx.toml)
    --auto        Force automatic mode even when CONFIG exists
    --max-files   Max buckets in auto mode (default: 20)
    --max-lines   Max lines per bucket in auto mode (default: 1500)
    ROOT          Project root directory (default: current directory)

Outputs
~~~~~~~

All generated artefacts are placed under `output_dir` (default
`.llmctx`) inside *ROOT*.  Copy operations preserve filenames; concat
entries write to their configured *output_file*; auto buckets use
`ctx_XX.txt` for code and `docs_XX.txt` for Markdown.
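An automatic run therefore leaves something like the following
(illustrative only; file names and counts depend on your tree):

    .llmctx/
        ctx_01.txt
        ctx_02.txt
        docs_01.txt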

The script is self-contained and relies only on the Python 3.11+
standard library (the bundled `tomllib` is used for parsing TOML).

Example
~~~~~~~

    # Manual mode using .llmctx.toml in the cwd
    python3 llmctx.py

    # Automatic mode on a different repo
    python3 llmctx.py --auto --max-lines 1000 ~/projects/myrepo
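    # Automatic mode restricted to Python and Markdown, by adding this to
    # .llmctx.toml (leading dots on the suffixes are optional):
    #
    #   [general]
    #   suffix_whitelist = ['py', 'md']
    python3 llmctx.py --auto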
'''
from __future__ import annotations

import argparse
import shutil
import subprocess
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Sequence

import tomllib

# Data models

@dataclass(frozen=True)
class ConcatEntry:
    '''
    A directory whose files are concatenated into *output_file*.
    '''
    input_dir: Path
    output_file: Path


@dataclass(frozen=True)
class ManualConfig:
    '''
    Configuration loaded from .llmctx.toml (manual mode).
    '''
    output_dir: Path
    concat: List[ConcatEntry]
    copy: List[Path]


@dataclass(frozen=True)
class AutoConfig:
    '''
    In-memory settings for automatic mode.
    '''
    output_dir: Path
    max_files: int
    max_lines: int
    suffix_whitelist: set[str]

# Manual-mode helpers

def load_manual_config(path: Path) -> ManualConfig:
    '''
    Parse .llmctx.toml and return a strongly-typed structure.
    '''
    with path.open('rb') as fp:
        raw = tomllib.load(fp)

    gen = raw.get('general', {})
    out_dir = Path(gen.get('output_dir', '.llmctx'))
    concat = [
        ConcatEntry(Path(it['input_dir']), Path(it['output_file']))
        for it in raw.get('concat', [])
    ]
    copy = [Path(p) for p in gen.get('copy', [])]
    return ManualConfig(out_dir, concat, copy)


def ensure_dir(path: Path) -> None:
    '''
    Create *path* (and parents) if it does not already exist.
    '''
    path.mkdir(parents=True, exist_ok=True)


def concat_dir(entry: ConcatEntry, out_dir: Path) -> None:
    '''
    Concatenate every file in *entry.input_dir* into *entry.output_file*.
    '''
    dest = out_dir / entry.output_file
    dest.write_text('', encoding='utf-8')
    for fp in sorted(entry.input_dir.iterdir()):
        if not fp.is_file():
            continue
        header = f'\n=== {fp.relative_to(entry.input_dir)} ===\n'
        with dest.open('a', encoding='utf-8') as out, \
                fp.open('r', encoding='utf-8') as src:
            out.write(header)
            out.write(src.read())


def copy_file(src: Path, out_dir: Path) -> None:
    '''
    Copy *src* into *out_dir* preserving the basename.
    '''
    shutil.copy(src, out_dir / src.name)

# Automatic-mode helpers

DEFAULT_EXTS: set[str] = {
    '.toml', '.md',
    '.py', '.js', '.ts', '.java', '.c', '.h',
    '.cpp', '.hpp', '.cs', '.go', '.rs', '.php',
    '.sql',
}


def load_suffix_whitelist(cfg_path: Path) -> set[str]:
    '''
    Return custom suffix whitelist if defined, otherwise *DEFAULT_EXTS*.
    '''
    if cfg_path.exists():
        with cfg_path.open('rb') as fp:
            raw = tomllib.load(fp)
        custom = raw.get('general', {}).get('suffix_whitelist')
        if custom:
            return {e if e.startswith('.') else f'.{e}' for e in custom}
    return DEFAULT_EXTS

def git_ls_files(root: Path) -> set[Path]:
    '''
    Return tracked files via *git ls-files* or an empty set on failure.
    '''
    try:
        proc = subprocess.run(
            ['git', 'ls-files', '--cached', '--others',
             '--exclude-standard', '-z'],
            cwd=root,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        return set()
    return {root / Path(p.decode()) for p in proc.stdout.split(b'\0') if p}


def all_files(root: Path, exts: set[str]) -> set[Path]:
    '''
    Gather candidate files, preferring Git when available.
    '''
    candidates = git_ls_files(root) or {
        p for p in root.rglob('*')
        if p.is_file() and '.git' not in p.parts
    }
    return {p for p in candidates if p.suffix.lower() in exts}


def line_count(path: Path) -> int:
    '''
    Return the number of lines in *path* (binary read is faster).
    '''
    with path.open('rb') as fp:
        return sum(1 for _ in fp)

@dataclass
class Item:
    '''
    Node in the bucket tree (either a file or collapsed directory).
    '''
    path: Path
    lines: int
    children: list['Item'] = field(default_factory=list)

    def write(self, out_fp: Path, root: Path) -> None:
        '''
        Write this item (recursively) to *out_fp*.
        '''
        if self.children:
            for child in self.children:
                child.write(out_fp, root)
            return
        header = f'\n=== {self.path.relative_to(root)} ===\n'
        with out_fp.open('a', encoding='utf-8') as out:
            out.write(header)
            out.write(self.path.read_text(encoding='utf-8'))

def collapse_dirs(files: dict[Path, int], max_lines: int) -> list[Item]:
    '''
    Collapse directories whose combined size ≤ *max_lines* into a single
    Item to reduce bucket fragmentation.
    '''
    # Group files by their immediate parent directory.
    by_dir: dict[Path, list[Path]] = defaultdict(list)
    for fp in files:
        by_dir[fp.parent].append(fp)

    collapsed: set[Path] = set()
    items: list[Item] = []
    # Visit deeper directories first so nested groups are considered
    # before their ancestors.
    for d in sorted(by_dir, key=lambda p: len(p.parts), reverse=True):
        if any(parent in collapsed for parent in d.parents):
            continue
        child_files = by_dir[d]
        total = sum(files[f] for f in child_files)
        if total <= max_lines and len(child_files) > 1:
            collapsed.add(d)
            children = [Item(f, files[f]) for f in child_files]
            items.append(Item(d, total, children))

    # Files whose direct parent was not collapsed stay standalone Items;
    # files already absorbed into a collapsed directory are skipped.
    for fp, cnt in files.items():
        if fp.parent not in collapsed:
            items.append(Item(fp, cnt))
    return items
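
# Illustrative example of collapse_dirs (hypothetical paths and counts):
# with max_lines=100 and
#     {Path('pkg/a.py'): 40, Path('pkg/b.py'): 30, Path('big.py'): 500}
# the two files directly under pkg/ (70 lines combined) fold into a single
# Item for pkg/, while big.py remains its own Item.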

def pack_items(items: Sequence[Item],
               max_files: int,
               max_lines: int) -> list[list[Item]]:
    '''
    Greedy bucket-packing subject to *max_files* and *max_lines*.
    '''
    buckets: list[list[Item]] = []
    cur: list[Item] = []
    cur_lines = 0
    for itm in sorted(items, key=lambda i: i.lines, reverse=True):
        if cur_lines + itm.lines > max_lines and cur:
            buckets.append(cur)
            cur, cur_lines = [], 0
        cur.append(itm)
        cur_lines += itm.lines
    if cur:
        buckets.append(cur)

    # Merge buckets if we generated too many
    while len(buckets) > max_files:
        a = buckets.pop()
        buckets[-1].extend(a)
    return buckets[:max_files]
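
# Illustrative merge behaviour (assumed counts): if the greedy pass produced
# 22 buckets with max_files=20, the while-loop above folds buckets 22 and 21
# into bucket 20, leaving exactly 20 buckets.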

def run_auto(root: Path, cfg: AutoConfig) -> None:
    '''
    Entry-point for automatic mode.
    '''
    ensure_dir(cfg.output_dir)
    paths = all_files(root, cfg.suffix_whitelist)
    if not paths:
        print('no matching files', file=sys.stderr)
        return

    # helper to collapse, pack and write one group
    def write_group(group: dict[Path, int], prefix: str) -> None:
        if not group:
            return
        collapsed = collapse_dirs(group, cfg.max_lines)
        buckets = pack_items(collapsed, cfg.max_files, cfg.max_lines)
        for idx, bucket in enumerate(buckets, 1):
            fp = cfg.output_dir / f'{prefix}_{idx:02}.txt'
            fp.write_text('', encoding='utf-8')
            for item in bucket:
                item.write(fp, root)

    # split by suffix
    md_counts = {p: line_count(p) for p in paths if p.suffix.lower() == '.md'}
    code_counts = {
        p: line_count(p) for p in paths if p.suffix.lower() != '.md'
    }
    write_group(code_counts, 'ctx')
    write_group(md_counts, 'docs')

# CLI plumbing

def parse_args() -> argparse.Namespace:
    '''
    Return parsed CLI arguments.
    '''
    ap = argparse.ArgumentParser(
        description='Build LLM context files for a project',
    )
    ap.add_argument('-c', '--config',
                    default='.llmctx.toml',
                    type=Path,
                    help='path to config (default .llmctx.toml)')
    ap.add_argument('--auto', action='store_true',
                    help='force automatic mode even if config exists')
    ap.add_argument('--max-files', type=int, default=20,
                    help='max buckets in auto mode (default 20)')
    ap.add_argument('--max-lines', type=int, default=1500,
                    help='max lines per bucket in auto mode (default 1500)')
    ap.add_argument('root', nargs='?', default='.', type=Path,
                    help='project root (default cwd)')
    return ap.parse_args()

def main() -> None:
    '''
    Top-level dispatcher for manual vs automatic mode.
    '''
    ns = parse_args()
    cfg_path = ns.config.expanduser()
    root = Path(ns.root).resolve()

    if ns.auto or not cfg_path.exists():
        wlist = load_suffix_whitelist(cfg_path)
        cfg = AutoConfig(
            # place artefacts inside ROOT, as documented in the module docstring
            output_dir=root / '.llmctx',
            max_files=ns.max_files,
            max_lines=ns.max_lines,
            suffix_whitelist=wlist,
        )
        run_auto(root, cfg)
        return

    cfg = load_manual_config(cfg_path)
    ensure_dir(cfg.output_dir)
    for e in cfg.concat:
        concat_dir(e, cfg.output_dir)
    for src in cfg.copy:
        copy_file(src, cfg.output_dir)


if __name__ == '__main__':
    main()