#!/usr/bin/env python3
'''
llmctx.py
---------

Utility for extracting relevant source snippets from a project tree and
building plain-text *context* files that can be pasted into an LLM prompt.

It runs in two mutually-exclusive modes:

Manual mode (default when `.llmctx.toml` exists)
    The config file is a TOML document with two sections:

        [general]
        output_dir = '.llmctx'      # directory where artefacts go
        copy = ['README.md']        # files copied verbatim

        [[concat]]
        input_dir = 'src'           # each regular file under this dir …
        output_file = 'code.txt'    # … is appended to this file

    Every file added to *output_file* is preceded by a header line
    `=== relative/path/to/file ===` to keep provenance clear.

Automatic mode (enabled with `--auto` or when no config is present)
    * The workspace is scanned for files whose suffix matches a whitelist
      (defaults to a set of common source extensions; can be overridden
      by `general.suffix_whitelist` in the config).
    * If executed inside a Git repo we honour `git ls-files`; otherwise a
      recursive walk is performed while skipping *.git* directories.
    * Files are grouped into buckets so that
      - no bucket exceeds `--max-lines` total lines
      - the number of buckets never exceeds `--max-files`
    * Code buckets are written to `ctx_XX.txt` and Markdown buckets to
      `docs_XX.txt` under *output_dir*.
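    For example (sizes assumed for illustration): with the default
    `--max-lines 1500`, three source files of 900, 800 and 400 lines end
    up in two buckets, one holding the 900-line file and one holding the
    800- and 400-line files.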

Command-line usage
------------------

    python3 llmctx.py [-c CONFIG] [--auto]
                      [--max-files N] [--max-lines N]
                      [ROOT]

Arguments
~~~~~~~~~

    -c/--config   Path to the TOML config (default: .llmctx.toml)
    --auto        Force automatic mode even when CONFIG exists
    --max-files   Max buckets in auto mode (default: 20)
    --max-lines   Max lines per bucket in auto mode (default: 1500)
    ROOT          Project root directory (default: current directory)

Outputs
~~~~~~~

All generated artefacts are placed under `output_dir` (default
`.llmctx`) inside *ROOT*.  Copy operations preserve filenames; concat
entries write to their configured *output_file*; auto buckets use
`ctx_XX.txt` for code and `docs_XX.txt` for Markdown.
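An automatic run therefore leaves something like the following
(illustrative only; file names and counts depend on your tree):

    .llmctx/
        ctx_01.txt
        ctx_02.txt
        docs_01.txt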

The script is self-contained and relies only on the Python 3.11+
standard library (the bundled `tomllib` is used for parsing TOML).

Example
~~~~~~~

    # Manual mode using .llmctx.toml in the cwd
    python3 llmctx.py

    # Automatic mode on a different repo
    python3 llmctx.py --auto --max-lines 1000 ~/projects/myrepo
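    # Automatic mode restricted to Python and Markdown, by adding this to
    # .llmctx.toml (leading dots on the suffixes are optional):
    #
    #   [general]
    #   suffix_whitelist = ['py', 'md']
    python3 llmctx.py --auto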
'''
from __future__ import annotations

import argparse
import shutil
import subprocess
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Sequence

import tomllib

# Data models

@dataclass(frozen=True)
class ConcatEntry:
    '''
    A directory whose files are concatenated into *output_file*.
    '''
    input_dir: Path
    output_file: Path


@dataclass(frozen=True)
class ManualConfig:
    '''
    Configuration loaded from .llmctx.toml (manual mode).
    '''
    output_dir: Path
    concat: List[ConcatEntry]
    copy: List[Path]


@dataclass(frozen=True)
class AutoConfig:
    '''
    In-memory settings for automatic mode.
    '''
    output_dir: Path
    max_files: int
    max_lines: int
    suffix_whitelist: set[str]

# Manual-mode helpers

def load_manual_config(path: Path) -> ManualConfig:
    '''
    Parse .llmctx.toml and return a strongly-typed structure.
    '''
    with path.open('rb') as fp:
        raw = tomllib.load(fp)

    gen = raw.get('general', {})
    out_dir = Path(gen.get('output_dir', '.llmctx'))
    concat = [
        ConcatEntry(Path(it['input_dir']), Path(it['output_file']))
        for it in raw.get('concat', [])
    ]
    copy = [Path(p) for p in gen.get('copy', [])]
    return ManualConfig(out_dir, concat, copy)


def ensure_dir(path: Path) -> None:
    '''
    Create *path* (and parents) if it does not already exist.
    '''
    path.mkdir(parents=True, exist_ok=True)


def concat_dir(entry: ConcatEntry, out_dir: Path) -> None:
    '''
    Concatenate every file in *entry.input_dir* into *entry.output_file*.
    '''
    dest = out_dir / entry.output_file
    dest.write_text('', encoding='utf-8')
    for fp in sorted(entry.input_dir.iterdir()):
        if not fp.is_file():
            continue
        header = f'\n=== {fp.relative_to(entry.input_dir)} ===\n'
        with dest.open('a', encoding='utf-8') as out, \
                fp.open('r', encoding='utf-8') as src:
            out.write(header)
            out.write(src.read())


def copy_file(src: Path, out_dir: Path) -> None:
    '''
    Copy *src* into *out_dir* preserving the basename.
    '''
    shutil.copy(src, out_dir / src.name)

# Automatic-mode helpers

DEFAULT_EXTS: set[str] = {
    '.toml', '.md',
    '.py', '.js', '.ts', '.java', '.c', '.h',
    '.cpp', '.hpp', '.cs', '.go', '.rs', '.php',
    '.sql',
}


def load_suffix_whitelist(cfg_path: Path) -> set[str]:
    '''
    Return custom suffix whitelist if defined, otherwise *DEFAULT_EXTS*.
    '''
    if cfg_path.exists():
        with cfg_path.open('rb') as fp:
            raw = tomllib.load(fp)
        custom = raw.get('general', {}).get('suffix_whitelist')
        if custom:
            return {e if e.startswith('.') else f'.{e}' for e in custom}
    return DEFAULT_EXTS

def git_ls_files(root: Path) -> set[Path]:
    '''
    Return tracked files via *git ls-files* or an empty set on failure.
    '''
    try:
        proc = subprocess.run(
            ['git', 'ls-files', '--cached', '--others',
             '--exclude-standard', '-z'],
            cwd=root,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        return set()
    return {root / Path(p.decode()) for p in proc.stdout.split(b'\0') if p}


def all_files(root: Path, exts: set[str]) -> set[Path]:
    '''
    Gather candidate files, preferring Git when available.
    '''
    candidates = git_ls_files(root) or {
        p for p in root.rglob('*')
        if p.is_file() and '.git' not in p.parts
    }
    return {p for p in candidates if p.suffix.lower() in exts}


def line_count(path: Path) -> int:
    '''
    Return the number of lines in *path* (binary read is faster).
    '''
    with path.open('rb') as fp:
        return sum(1 for _ in fp)

@dataclass
class Item:
    '''
    Node in the bucket tree (either a file or collapsed directory).
    '''
    path: Path
    lines: int
    children: list['Item'] = field(default_factory=list)

    def write(self, out_fp: Path, root: Path) -> None:
        '''
        Write this item (recursively) to *out_fp*.
        '''
        if self.children:
            for child in self.children:
                child.write(out_fp, root)
            return
        header = f'\n=== {self.path.relative_to(root)} ===\n'
        with out_fp.open('a', encoding='utf-8') as out:
            out.write(header)
            out.write(self.path.read_text(encoding='utf-8'))

def collapse_dirs(files: dict[Path, int], max_lines: int) -> list[Item]:
    '''
    Collapse directories whose combined size ≤ *max_lines* into a single
    Item to reduce bucket fragmentation.
    '''
    # Group files by their immediate parent directory.
    by_dir: dict[Path, list[Path]] = defaultdict(list)
    for fp in files:
        by_dir[fp.parent].append(fp)

    collapsed: set[Path] = set()
    items: list[Item] = []
    # Visit deeper directories first so nested groups are considered
    # before their ancestors.
    for d in sorted(by_dir, key=lambda p: len(p.parts), reverse=True):
        if any(parent in collapsed for parent in d.parents):
            continue
        child_files = by_dir[d]
        total = sum(files[f] for f in child_files)
        if total <= max_lines and len(child_files) > 1:
            collapsed.add(d)
            children = [Item(f, files[f]) for f in child_files]
            items.append(Item(d, total, children))

    # Files whose direct parent was not collapsed stay standalone Items;
    # files already absorbed into a collapsed directory are skipped.
    for fp, cnt in files.items():
        if fp.parent not in collapsed:
            items.append(Item(fp, cnt))
    return items
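
# Illustrative example of collapse_dirs (hypothetical paths and counts):
# with max_lines=100 and
#     {Path('pkg/a.py'): 40, Path('pkg/b.py'): 30, Path('big.py'): 500}
# the two files directly under pkg/ (70 lines combined) fold into a single
# Item for pkg/, while big.py remains its own Item.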

def pack_items(items: Sequence[Item],
               max_files: int,
               max_lines: int) -> list[list[Item]]:
    '''
    Greedy bucket-packing subject to *max_files* and *max_lines*.
    '''
    buckets: list[list[Item]] = []
    cur: list[Item] = []
    cur_lines = 0
    for itm in sorted(items, key=lambda i: i.lines, reverse=True):
        if cur_lines + itm.lines > max_lines and cur:
            buckets.append(cur)
            cur, cur_lines = [], 0
        cur.append(itm)
        cur_lines += itm.lines
    if cur:
        buckets.append(cur)

    # Merge buckets if we generated too many
    while len(buckets) > max_files:
        a = buckets.pop()
        buckets[-1].extend(a)
    return buckets[:max_files]
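
# Illustrative merge behaviour (assumed counts): if the greedy pass produced
# 22 buckets with max_files=20, the while-loop above folds buckets 22 and 21
# into bucket 20, leaving exactly 20 buckets.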

def run_auto(root: Path, cfg: AutoConfig) -> None:
    '''
    Entry-point for automatic mode.
    '''
    ensure_dir(cfg.output_dir)
    paths = all_files(root, cfg.suffix_whitelist)
    if not paths:
        print('no matching files', file=sys.stderr)
        return

    # helper to collapse, pack and write one group
    def write_group(group: dict[Path, int], prefix: str) -> None:
        if not group:
            return
        collapsed = collapse_dirs(group, cfg.max_lines)
        buckets = pack_items(collapsed, cfg.max_files, cfg.max_lines)
        for idx, bucket in enumerate(buckets, 1):
            fp = cfg.output_dir / f'{prefix}_{idx:02}.txt'
            fp.write_text('', encoding='utf-8')
            for item in bucket:
                item.write(fp, root)

    # split by suffix
    md_counts = {p: line_count(p) for p in paths if p.suffix.lower() == '.md'}
    code_counts = {
        p: line_count(p) for p in paths if p.suffix.lower() != '.md'
    }
    write_group(code_counts, 'ctx')
    write_group(md_counts, 'docs')

# CLI plumbing

def parse_args() -> argparse.Namespace:
    '''
    Return parsed CLI arguments.
    '''
    ap = argparse.ArgumentParser(
        description='Build LLM context files for a project',
    )
    ap.add_argument('-c', '--config',
                    default='.llmctx.toml',
                    type=Path,
                    help='path to config (default .llmctx.toml)')
    ap.add_argument('--auto', action='store_true',
                    help='force automatic mode even if config exists')
    ap.add_argument('--max-files', type=int, default=20,
                    help='max buckets in auto mode (default 20)')
    ap.add_argument('--max-lines', type=int, default=1500,
                    help='max lines per bucket in auto mode (default 1500)')
    ap.add_argument('root', nargs='?', default='.', type=Path,
                    help='project root (default cwd)')
    return ap.parse_args()

def main() -> None:
    '''
    Top-level dispatcher for manual vs automatic mode.
    '''
    ns = parse_args()
    cfg_path = ns.config.expanduser()
    root = Path(ns.root).resolve()

    if ns.auto or not cfg_path.exists():
        wlist = load_suffix_whitelist(cfg_path)
        cfg = AutoConfig(
            # place artefacts inside ROOT, as documented in the module docstring
            output_dir=root / '.llmctx',
            max_files=ns.max_files,
            max_lines=ns.max_lines,
            suffix_whitelist=wlist,
        )
        run_auto(root, cfg)
        return

    cfg = load_manual_config(cfg_path)
    ensure_dir(cfg.output_dir)
    for e in cfg.concat:
        concat_dir(e, cfg.output_dir)
    for src in cfg.copy:
        copy_file(src, cfg.output_dir)


if __name__ == '__main__':
    main()