#!/usr/bin/env python3
'''
llmctx.py
---------

Utility for extracting relevant source snippets from a project tree and
building plain-text *context* files that can be pasted into an LLM prompt.

It runs in two mutually exclusive modes:

Manual mode (default when `.llmctx.toml` exists)
    The config file is a TOML document with two sections:

        [general]
        output_dir = '.llmctx'   # directory where artefacts go
        copy = ['README.md']     # files copied verbatim

        [[concat]]
        input_dir = 'src'        # each regular file under this dir …
        output_file = 'code.txt' # … is appended to this file

    Every file added to *output_file* is preceded by a header line
    `=== relative/path/to/file ===` to keep provenance clear.
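    For example, a generated `code.txt` could start like this (file
    names are hypothetical):

        === main.py ===
        <contents of src/main.py>

        === cli.py ===
        <contents of src/cli.py>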
Automatic mode (enabled with `--auto` or when no config is present)
    * The workspace is scanned for files whose suffix matches a whitelist
      (defaults to a set of common source extensions; can be overridden
      by `general.suffix_whitelist` in the config).
    * If executed inside a Git repo we honour `git ls-files`; otherwise a
      recursive walk is performed while skipping *.git* directories.
    * Files are grouped into buckets so that
        - no bucket exceeds `--max-lines` total lines
        - the number of buckets never exceeds `--max-files`
    * Buckets are written to `ctx_XX.txt` (code) and `docs_XX.txt`
      (Markdown) under *output_dir*.

Command-line usage
------------------

    python3 llmctx.py [-c CONFIG] [--auto]
                      [--max-files N] [--max-lines N]
                      [ROOT]

Arguments
~~~~~~~~~
-c/--config   Path to the TOML config (default: .llmctx.toml)
--auto        Force automatic mode even when CONFIG exists
--max-files   Max buckets in auto mode (default: 20)
--max-lines   Max lines per bucket in auto mode (default: 1500)
ROOT          Project root directory (default: current directory)

Outputs
~~~~~~~
All generated artefacts are placed under `output_dir` (default
`.llmctx`) inside *ROOT*.  Copy operations preserve filenames; concat
targets use their configured `output_file`; auto buckets use the
`ctx_XX.txt` / `docs_XX.txt` naming above.

The script is self-contained and relies only on the Python 3.11+
standard library (the bundled `tomllib` is used for parsing TOML).

Example
~~~~~~~
    # Manual mode using .llmctx.toml in the cwd
    python3 llmctx.py

    # Automatic mode on a different repo
    python3 llmctx.py --auto --max-lines 1000 ~/projects/myrepo
'''
from __future__ import annotations
import argparse
import shutil
import subprocess
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Sequence
import tomllib
# Data models
@dataclass(frozen=True)
class ConcatEntry:
    '''
    A directory whose files are concatenated into *output_file*.
    '''
    input_dir: Path
    output_file: Path


@dataclass(frozen=True)
class ManualConfig:
    '''
    Configuration loaded from .llmctx.toml (manual mode).
    '''
    output_dir: Path
    concat: List[ConcatEntry]
    copy: List[Path]


@dataclass(frozen=True)
class AutoConfig:
    '''
    In-memory settings for automatic mode.
    '''
    output_dir: Path
    max_files: int
    max_lines: int
    suffix_whitelist: set[str]
# Manual-mode helpers
def load_manual_config(path: Path) -> ManualConfig:
    '''
    Parse .llmctx.toml and return a strongly-typed structure.
    '''
    with path.open('rb') as fp:
        raw = tomllib.load(fp)

    gen = raw.get('general', {})
    out_dir = Path(gen.get('output_dir', '.llmctx'))
    concat = [
        ConcatEntry(Path(it['input_dir']), Path(it['output_file']))
        for it in raw.get('concat', [])
    ]
    copy = [Path(p) for p in gen.get('copy', [])]
    return ManualConfig(out_dir, concat, copy)
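# For the sample config shown in the module docstring, this returns:
#   ManualConfig(
#       output_dir=Path('.llmctx'),
#       concat=[ConcatEntry(Path('src'), Path('code.txt'))],
#       copy=[Path('README.md')],
#   )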
def ensure_dir(path: Path) -> None:
    '''
    Create *path* (and parents) if it does not already exist.
    '''
    path.mkdir(parents=True, exist_ok=True)
def concat_dir(entry: ConcatEntry, out_dir: Path) -> None:
    '''
    Concatenate every file in *entry.input_dir* into *entry.output_file*.
    '''
    dest = out_dir / entry.output_file
    dest.write_text('', encoding='utf-8')  # truncate output from a previous run
    # NOTE: iterdir() is non-recursive; only the top level of input_dir is read
    for fp in sorted(entry.input_dir.iterdir()):
        if not fp.is_file():
            continue
        header = f'\n=== {fp.relative_to(entry.input_dir)} ===\n'
        with dest.open('a', encoding='utf-8') as out, \
                fp.open('r', encoding='utf-8', errors='replace') as src:
            # errors='replace' keeps one odd file from aborting the whole run
            out.write(header)
            out.write(src.read())
def copy_file(src: Path, out_dir: Path) -> None:
    '''
    Copy *src* into *out_dir* preserving the basename.
    '''
    shutil.copy(src, out_dir / src.name)
# Automatic-mode helpers
DEFAULT_EXTS: set[str] = {
    '.toml', '.md',
    '.py', '.js', '.ts', '.java', '.c', '.h',
    '.cpp', '.hpp', '.cs', '.go', '.rs', '.php',
    '.sql',
}
def load_suffix_whitelist(cfg_path: Path) -> set[str]:
    '''
    Return custom suffix whitelist if defined, otherwise *DEFAULT_EXTS*.
    '''
    if cfg_path.exists():
        with cfg_path.open('rb') as fp:
            raw = tomllib.load(fp)
        custom = raw.get('general', {}).get('suffix_whitelist')
        if custom:
            return {e if e.startswith('.') else f'.{e}' for e in custom}
    return DEFAULT_EXTS
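# e.g. suffix_whitelist = ['py', '.rs'] in the config normalises to
# {'.py', '.rs'}: a leading dot is added whenever it is missing.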
def git_ls_files(root: Path) -> set[Path]:
    '''
    Return files known to Git (tracked plus untracked-but-not-ignored)
    via *git ls-files*, or an empty set on failure.
    '''
    try:
        proc = subprocess.run(
            ['git', 'ls-files', '--cached', '--others',
             '--exclude-standard', '-z'],
            cwd=root,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        return set()
    # '-z' yields NUL-separated paths, safe for names containing spaces
    return {root / Path(p.decode()) for p in proc.stdout.split(b'\0') if p}
def all_files(root: Path, exts: set[str]) -> set[Path]:
    '''
    Gather candidate files, preferring Git when available.
    '''
    candidates = git_ls_files(root) or {
        p for p in root.rglob('*')
        if p.is_file() and '.git' not in p.parts
    }
    return {p for p in candidates if p.suffix.lower() in exts}
def line_count(path: Path) -> int:
    '''
    Return the number of lines in *path* (binary read is faster).
    '''
    with path.open('rb') as fp:
        return sum(1 for _ in fp)
@dataclass
class Item:
    '''
    Node in the bucket tree (either a file or collapsed directory).
    '''
    path: Path
    lines: int
    children: list['Item'] = field(default_factory=list)

    def write(self, out_fp: Path, root: Path) -> None:
        '''
        Write this item (recursively) to *out_fp*.
        '''
        if self.children:
            for child in self.children:
                child.write(out_fp, root)
            return
        header = f'\n=== {self.path.relative_to(root)} ===\n'
        with out_fp.open('a', encoding='utf-8') as out:
            out.write(header)
            # errors='replace' tolerates stray non-UTF-8 bytes
            out.write(self.path.read_text(encoding='utf-8', errors='replace'))
def collapse_dirs(files: dict[Path, int], max_lines: int) -> list[Item]:
    '''
    Collapse directories whose combined size ≤ *max_lines* into a single
    Item to reduce bucket fragmentation.
    '''
    by_dir: dict[Path, list[Path]] = defaultdict(list)
    for fp in files:
        by_dir[fp.parent].append(fp)

    collapsed: set[Path] = set()
    items: list[Item] = []
    for d in sorted(by_dir, key=lambda p: len(p.parts), reverse=True):
        if any(parent in collapsed for parent in d.parents):
            continue
        child_files = by_dir[d]
        total = sum(files[f] for f in child_files)
        if total <= max_lines and len(child_files) > 1:
            collapsed.add(d)
            children = [Item(f, files[f]) for f in child_files]
            items.append(Item(d, total, children))

    for fp, cnt in files.items():
        if not any(c == fp or c in fp.parents for c in collapsed):
            items.append(Item(fp, cnt))
    return items
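# Illustration (hypothetical paths): with max_lines=100 and
#   {src/a.py: 30, src/b.py: 40, big.py: 90}
# src/a.py and src/b.py collapse into a single 70-line Item for 'src',
# while big.py stays a standalone Item (its parent holds only one file).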
def pack_items(items: Sequence[Item],
               max_files: int,
               max_lines: int) -> list[list[Item]]:
    '''
    Greedy bucket-packing subject to *max_files* and *max_lines*.
    '''
    buckets: list[list[Item]] = []
    cur: list[Item] = []
    cur_lines = 0
    for itm in sorted(items, key=lambda i: i.lines, reverse=True):
        if cur_lines + itm.lines > max_lines and cur:
            buckets.append(cur)
            cur, cur_lines = [], 0
        cur.append(itm)
        cur_lines += itm.lines
    if cur:
        buckets.append(cur)
    # Merge buckets if we generated too many
    while len(buckets) > max_files:
        a = buckets.pop()
        buckets[-1].extend(a)
    return buckets[:max_files]
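# Illustration (hypothetical sizes): items of 90, 60 and 40 lines with
# max_lines=100 pack greedily, largest first, into [90] and [60, 40];
# the merge loop then folds trailing buckets together whenever more
# than max_files buckets were produced.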
def run_auto(root: Path, cfg: AutoConfig) -> None:
    '''
    Entry-point for automatic mode.
    '''
    ensure_dir(cfg.output_dir)
    paths = all_files(root, cfg.suffix_whitelist)
    if not paths:
        print('no matching files', file=sys.stderr)
        return

    # helper to collapse, pack and write one group
    def write_group(group: dict[Path, int], prefix: str) -> None:
        if not group:
            return
        collapsed = collapse_dirs(group, cfg.max_lines)
        buckets = pack_items(collapsed, cfg.max_files, cfg.max_lines)
        for idx, bucket in enumerate(buckets, 1):
            fp = cfg.output_dir / f'{prefix}_{idx:02}.txt'
            fp.write_text('', encoding='utf-8')  # truncate previous run
            for item in bucket:
                item.write(fp, root)

    # split by suffix: Markdown goes to docs_XX.txt, everything else to ctx_XX.txt
    md_counts = {p: line_count(p) for p in paths if p.suffix.lower() == '.md'}
    code_counts = {
        p: line_count(p) for p in paths if p.suffix.lower() != '.md'
    }
    write_group(code_counts, 'ctx')
    write_group(md_counts, 'docs')
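# e.g. on a mixed repo with default limits, auto mode might emit
# .llmctx/ctx_01.txt … ctx_04.txt plus docs_01.txt (counts hypothetical,
# depending on file sizes and the --max-files / --max-lines settings).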
# CLI plumbing
def parse_args() -> argparse.Namespace:
    '''
    Return parsed CLI arguments.
    '''
    ap = argparse.ArgumentParser(
        description='Build LLM context files for a project',
    )
    ap.add_argument('-c', '--config',
                    default='.llmctx.toml',
                    type=Path,
                    help='path to config (default .llmctx.toml)')
    ap.add_argument('--auto', action='store_true',
                    help='force automatic mode even if config exists')
    ap.add_argument('--max-files', type=int, default=20,
                    help='max buckets in auto mode (default 20)')
    ap.add_argument('--max-lines', type=int, default=1500,
                    help='max lines per bucket in auto mode (default 1500)')
    ap.add_argument('root', nargs='?', default='.', type=Path,
                    help='project root (default cwd)')
    return ap.parse_args()
def main() -> None:
    '''
    Top-level dispatcher for manual vs automatic mode.
    '''
    ns = parse_args()
    cfg_path = ns.config.expanduser()
    root = Path(ns.root).resolve()

    if ns.auto or not cfg_path.exists():
        wlist = load_suffix_whitelist(cfg_path)
        cfg = AutoConfig(
            # anchor output under ROOT, as the docstring promises, rather
            # than under whatever directory the script was launched from
            output_dir=root / '.llmctx',
            max_files=ns.max_files,
            max_lines=ns.max_lines,
            suffix_whitelist=wlist,
        )
        run_auto(root, cfg)
        return

    cfg = load_manual_config(cfg_path)
    ensure_dir(cfg.output_dir)
    for e in cfg.concat:
        concat_dir(e, cfg.output_dir)
    for src in cfg.copy:
        copy_file(src, cfg.output_dir)
if __name__ == '__main__':
    main()