Skip to content

Instantly share code, notes, and snippets.

@marclove
Created August 9, 2025 17:24
Show Gist options
  • Save marclove/a1d477f9ae50d12a99d0b49e2ccc7961 to your computer and use it in GitHub Desktop.
Save marclove/a1d477f9ae50d12a99d0b49e2ccc7961 to your computer and use it in GitHub Desktop.
LLM Context Bundler for TypeScript
//! LLM Context Bundler
//! --------------------
//! A tiny, **parser-driven** concatenation bundler designed specifically for
//! *prompting a coding LLM* with a complete, readable view of your codebase.
//!
//! Goals (tailored to doc-generation / comprehension):
//! - Resolve realistic JS/TS import paths with **oxc_resolver** (tsconfig-aware)
//! - Parse code with **oxc_parser**; find ESM imports and top-level CJS `require()`
//! - Inline dependencies **before** each file (DFS), preserving strong file
//! boundaries and breadcrumbs for the LLM
//! - Keep imports in the text (as comments) so relationships remain visible
//! - Skip heavy third-party code by default (exclude `node_modules`) to save tokens
//! - Optional JSON inlining (commented), optional nested `require()` discovery
//! - Optional output size cap with a clear truncation notice at the top
//!
//! Non-goals:
//! - This is *not* an executable bundle. No module wrapper, no transpile.
//! It's for reading only, so we favor clarity over runtime correctness.
//!
//! Public API:
//! - `bundle(path) -> Result<String>`: default, opinionated settings for LLM prompts
//! - `bundle_with_options(path, &BundleOptions) -> Result<String>`: full control
//!
//! Suggested `Cargo.toml` (relevant deps):
//! ```toml
//! [dependencies]
//! oxc_resolver = "11"
//! oxc_parser = "0.29"
//! oxc_ast = "0.29"
//! oxc_span = "0.29"
//! oxc_allocator = "0.29"
//! ```
use std::{
    collections::{HashMap, HashSet},
    fs,
    path::{Path, PathBuf},
};

use oxc_allocator::Allocator;
use oxc_ast::ast;
use oxc_parser::Parser;
use oxc_resolver::{ResolveOptions, Resolver, TsconfigOptions, TsconfigReferences, NODEJS_BUILTINS};
use oxc_span::{GetSpan, SourceType, Span};
// ------------------------------ Error alias ------------------------------
/// Lightweight error alias for library ergonomics: any boxed error that is
/// `Send + Sync`, so I/O, parse, and resolver failures all compose with `?`.
pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
// ------------------------------ Options ----------------------------------
/// Options tuned for feeding code to an LLM, not for execution.
///
/// All flags default to the most token-frugal, comprehension-friendly
/// behavior; see [`BundleOptions::default`].
#[derive(Debug, Clone)]
pub struct BundleOptions {
    /// Include dependencies from `node_modules`.
    /// Default: **false** (third-party code usually wastes tokens for doc tasks)
    pub include_node_modules: bool,
    /// Try to find **any** static `require("...")` across the file, not just top-level.
    /// Default: **false** (top-level is usually enough; full scan adds work)
    pub include_all_static_requires: bool,
    /// Replace import/require statements that we inline with a comment instead of removing.
    /// Default: **true** (better breadcrumbs for an LLM)
    pub comment_instead_of_strip: bool,
    /// Maximum number of bytes to emit (not including the preface header).
    /// If reached, we stop inlining further files and mark the output as truncated.
    /// Default: **None** (no cap)
    pub max_bytes: Option<usize>,
    /// Inline `.json` files (as *commented* blocks) when imported.
    /// Default: **true** (JSON config is often helpful context)
    pub include_json: bool,
}
impl Default for BundleOptions {
fn default() -> Self {
Self {
include_node_modules: false,
include_all_static_requires: false,
comment_instead_of_strip: true,
max_bytes: None,
include_json: true,
}
}
}
// ------------------------------ Public API ------------------------------
/// Bundle `ts_file_path` and its local dependency graph into one readable
/// string using opinionated defaults ideal for LLM doc/comprehension prompts.
///
/// *Signature preserved as requested.*
pub fn bundle(ts_file_path: &Path) -> Result<String> {
    let default_opts = BundleOptions::default();
    bundle_with_options(ts_file_path, &default_opts)
}
/// Full-control entry point. Prefer this over [`bundle`] when you need to
/// tweak inlining behavior.
///
/// # Errors
/// Fails when the entry path cannot be canonicalized, has no parent
/// directory, or when a file in the dependency graph cannot be read.
pub fn bundle_with_options(ts_file_path: &Path, opts: &BundleOptions) -> Result<String> {
    let entry = ts_file_path.canonicalize()?;
    let project_dir = entry
        .parent()
        .ok_or("Entry file must have a parent directory")?
        .canonicalize()?;

    // Resolver setup: TS/ESM-first extension order (best for comprehension),
    // common JS variants after, plus JSON for context. Condition names cover
    // both ESM and CJS package entry points.
    let mut resolve_options = ResolveOptions {
        extensions: [
            ".ts", ".tsx", ".mts", ".cts",
            ".js", ".jsx", ".mjs", ".cjs",
            ".json",
        ]
        .iter()
        .map(|ext| ext.to_string())
        .collect(),
        condition_names: ["node", "import", "require", "default"]
            .iter()
            .map(|name| name.to_string())
            .collect(),
        ..ResolveOptions::default()
    };
    // Wire tsconfig.json into the resolver if present - this enables TS path mapping.
    if let Some(config_file) = find_upwards(&project_dir, "tsconfig.json") {
        resolve_options.tsconfig = Some(TsconfigOptions {
            config_file,
            references: TsconfigReferences::Auto,
        });
    }

    // Build the bundle body first; the preface is prepended at the very end
    // so its counts and truncation note are accurate.
    let mut state = State::new(Resolver::new(resolve_options), opts.clone());
    state.bundle_file(&entry)?;

    // Compose the preface summary (human- and LLM-friendly).
    let mut output = String::from("/*\n");
    output.push_str("================== BUNDLE PREFACE ==================\n");
    output.push_str(&format!("Entry: {}\n", entry.display()));
    output.push_str(&format!("Files inlined: {}\n", state.included_files.len()));
    output.push_str(&format!("Total bytes (body): {}\n", state.total_bytes));
    if let Some(max) = opts.max_bytes {
        output.push_str(&format!("Max bytes: {}\n", max));
    }
    if state.truncated {
        output.push_str("NOTE: Output truncated due to max_bytes. Some dependencies were not inlined.\n");
    }
    if !state.unresolved.is_empty() {
        output.push_str("Unresolved or skipped (kept as-is):\n");
        for (spec, from) in &state.unresolved {
            output.push_str(&format!(" - '{}' (from {})\n", spec, from.display()));
        }
    }
    output.push_str("====================================================\n");
    output.push_str("*/\n\n");

    output.push_str(&state.out);
    Ok(output)
}
// ------------------------------ Core state --------------------------------
/// Internal, mutable state carried through the DFS over the dependency graph.
struct State {
    // Resolves specifiers to absolute paths (tsconfig-aware).
    resolver: Resolver,
    // Caller-supplied behavior switches (cloned from the public entry point).
    opts: BundleOptions,
    // Avoid re-reading or re-emitting the same file; also cuts import cycles,
    // since `bundle_file` returns early on a repeat visit.
    visited: HashSet<PathBuf>,
    // Resolve cache: (context_dir, spec) -> full_path
    res_cache: HashMap<(PathBuf, String), PathBuf>,
    // Diagnostics & summary (reported in the bundle preface).
    included_files: Vec<PathBuf>,
    unresolved: Vec<(String, PathBuf)>,
    // Output accumulation & limits: `total_bytes` tracks `out` against
    // `opts.max_bytes`; `truncated` latches once the cap is hit.
    out: String,
    total_bytes: usize,
    truncated: bool,
}
impl State {
fn new(resolver: Resolver, opts: BundleOptions) -> Self {
Self {
resolver,
opts,
visited: HashSet::new(),
res_cache: HashMap::new(),
included_files: Vec::new(),
unresolved: Vec::new(),
out: String::new(),
total_bytes: 0,
truncated: false,
}
}
/// Guard to enforce `max_bytes`: false once the cap is reached (no cap
/// configured means we can always emit more).
fn can_emit_more(&self) -> bool {
    match self.opts.max_bytes {
        Some(max) => self.total_bytes < max,
        None => true,
    }
}
/// Append `s` to the output while tracking the `max_bytes` cap.
///
/// When the cap would be exceeded, only the bytes that still fit are
/// emitted and `truncated` is latched; all later calls become no-ops.
/// The cut point is backed off to a UTF-8 char boundary, because slicing
/// a `&str` mid-character (`&s[..remaining]`) panics.
fn push_out(&mut self, s: &str) {
    if self.truncated {
        return;
    }
    if let Some(max) = self.opts.max_bytes {
        // How many bytes we *can* still add before hitting the cap.
        let remaining = max.saturating_sub(self.total_bytes);
        if s.len() > remaining {
            // Retreat to the nearest char boundary so we never split a
            // multi-byte UTF-8 sequence.
            let mut cut = remaining;
            while cut > 0 && !s.is_char_boundary(cut) {
                cut -= 1;
            }
            self.out.push_str(&s[..cut]);
            self.total_bytes += cut;
            self.truncated = true;
            return;
        }
    }
    self.total_bytes += s.len();
    self.out.push_str(s);
}
/// Resolve `spec` from within `ctx_dir`, memoizing successful lookups.
/// Failures are not cached and simply yield `None`.
fn resolve(&mut self, ctx_dir: &Path, spec: &str) -> Option<PathBuf> {
    let key = (ctx_dir.to_path_buf(), spec.to_string());
    if let Some(hit) = self.res_cache.get(&key) {
        return Some(hit.clone());
    }
    let resolution = self.resolver.resolve(ctx_dir, spec).ok()?;
    let full = resolution.full_path().to_path_buf();
    self.res_cache.insert(key, full.clone());
    Some(full)
}
/// Depth-first inline of a single file: each dependency is emitted *before*
/// the file that imports it, so a reader always sees a definition before its
/// first use-site. Cycles and diamonds are cut by the `visited` set.
fn bundle_file(&mut self, file_path: &Path) -> Result<()> {
    let path = file_path.canonicalize()?;
    if !self.visited.insert(path.clone()) {
        // already emitted this file (or currently emitting it - cycle guard)
        return Ok(());
    }
    if is_declaration_file(&path) {
        // Declaration files carry types only; no value for doc prompts.
        return Ok(());
    }
    if !self.can_emit_more() { self.truncated = true; return Ok(()); }
    let source = fs::read_to_string(&path)?;
    // Parse the file into an AST for reliable import/require collection.
    // Unknown extensions fall back to TypeScript, the most permissive grammar.
    let allocator = Allocator::default();
    let source_type = SourceType::from_path(&path).unwrap_or(SourceType::ts());
    let parser = Parser::new(&allocator, &source, source_type);
    let parsed = parser.parse();
    let program = parsed.program;
    // 1) Collect ESM imports / re-exports (with source) and top-level CJS requires.
    let mut to_inline: Vec<SpecUse> = Vec::new();
    // ESM: keep exact statement spans so we can comment/strip only those lines.
    for stmt in &program.body {
        match stmt {
            ast::Statement::ImportDeclaration(n) => {
                if let Some(src) = &n.source { to_inline.push(SpecUse::esm(n.span, &src.value)); }
            }
            ast::Statement::ExportNamedDeclaration(n) => {
                // `export { x } from "mod"` re-exports also pull in a module.
                if let Some(src) = &n.source { to_inline.push(SpecUse::esm(n.span, &src.value)); }
            }
            ast::Statement::ExportAllDeclaration(n) => {
                if let Some(src) = &n.source { to_inline.push(SpecUse::esm(n.span, &src.value)); }
            }
            _ => {}
        }
    }
    // CJS: top-level require forms -> strip or comment the *entire* statement.
    for stmt in &program.body {
        // Bare expression form: `require("mod");`
        if let ast::Statement::ExpressionStatement(expr_stmt) = stmt {
            if let Some(spec) = extract_static_require_spec(&expr_stmt.expression) {
                to_inline.push(SpecUse::cjs_top(expr_stmt.span, spec));
            }
        }
        // Binding form: `const x = require("mod")` - the span covers the whole
        // declaration, so only one SpecUse is recorded per statement.
        if let ast::Statement::VariableDeclaration(var_decl) = stmt {
            for decl in &var_decl.declarations {
                if let Some(init) = &decl.init {
                    if let Some(spec) = extract_static_require_spec(init) {
                        to_inline.push(SpecUse::cjs_top(var_decl.span, spec));
                        break; // Remove once per statement
                    }
                }
            }
        }
    }
    // Optionally: discover ANY static `require("...")` (not only top-level) via a
    // lightweight heuristic. We do NOT strip these (could be inside expressions),
    // but we inline their targets before this file so the LLM sees the code too.
    if self.opts.include_all_static_requires {
        for spec in find_all_static_requires_heuristic(&source) {
            to_inline.push(SpecUse::cjs_nested(spec));
        }
    }
    // Deduplicate by spec while keeping earliest statement span for stripping.
    to_inline = dedup_spec_uses(to_inline);
    // 2) Resolve & inline dependencies first (DFS). Track which statements we actually inlined
    // so we can replace those specific import/require statements with a small comment.
    let mut replacements: HashMap<Span, String> = HashMap::new();
    for item in &to_inline {
        let spec = item.spec();
        // Skip node builtins and URLs/data URIs - they are not project files.
        if is_node_builtin(spec) || looks_like_url(spec) { continue; }
        let ctx_dir = path.parent().ok_or("File without parent directory")?.to_path_buf();
        let Some(resolved) = self.resolve(&ctx_dir, spec) else {
            self.unresolved.push((spec.to_string(), path.clone()));
            continue;
        };
        // Skip declaration files; optionally skip node_modules to save tokens.
        if is_declaration_file(&resolved) { continue; }
        if !self.opts.include_node_modules && in_node_modules(&resolved) { continue; }
        if is_json_file(&resolved) {
            if self.opts.include_json {
                // Inline JSON as a commented block; ideal for documentation context.
                self.emit_json_block(&resolved)?;
            }
            // If we emitted JSON, treat as "inlined" for ESM/CJS statement replacement.
            if let Some(sp) = item.strip_span() { replacements.insert(sp, replaced_comment(spec, &resolved)); }
            continue;
        }
        if is_code_file(&resolved) {
            self.bundle_file(&resolved)?; // DFS inline
            if let Some(sp) = item.strip_span() {
                replacements.insert(sp, replaced_comment(spec, &resolved));
            }
        } else {
            // Other asset types - keep import as-is, but note unresolved/unsupported.
            self.unresolved.push((spec.to_string(), path.clone()));
        }
        if !self.can_emit_more() { self.truncated = true; break; }
    }
    // 3) Write *this* file with optional statement replacement (comment or strip).
    // We reconstruct by streaming the original source and splicing where needed.
    self.push_out("/* ===== FILE START ===== */\n");
    self.push_out(&format!("/* PATH: {} */\n", path.display()));
    let mut cursor = 0usize;
    for stmt in &program.body {
        // NOTE(review): `.span()` comes from the `oxc_span::GetSpan` trait -
        // confirm that trait is imported at the top of the file.
        let sp = stmt.span();
        if let Some(repl) = replacements.get(&sp) {
            // Emit text *before* the statement, then our breadcrumb comment.
            let start = sp.start as usize;
            let end = sp.end as usize;
            if start > cursor { self.push_out(&source[cursor..start]); }
            if self.opts.comment_instead_of_strip { self.push_out(repl); }
            // else: drop the statement entirely
            cursor = end;
        }
    }
    // Tail of the source after the last replaced statement.
    if cursor < source.len() { self.push_out(&source[cursor..]); }
    self.push_out("\n/* ===== FILE END ===== */\n\n");
    // Record for preface summary.
    self.included_files.push(path);
    Ok(())
}
/// Emit a JSON file wrapped in comment fences. The raw JSON stays visible to
/// the LLM without affecting any parsing logic in downstream tools.
fn emit_json_block(&mut self, json_path: &Path) -> Result<()> {
    if !self.can_emit_more() {
        self.truncated = true;
        return Ok(());
    }
    let text = fs::read_to_string(json_path)?;
    let path_line = format!("/* PATH: {} (JSON) */\n", json_path.display());
    self.push_out("/* ===== FILE START ===== */\n");
    self.push_out(&path_line);
    self.push_out("/* BEGIN JSON */\n");
    self.push_out(&text);
    self.push_out("\n/* END JSON */\n");
    self.push_out("/* ===== FILE END ===== */\n\n");
    Ok(())
}
}
// ------------------------------ Helpers -----------------------------------
/// Represents a dependency use-site discovered while scanning one file.
/// The span (when present) covers the *whole* statement, so it can be
/// commented out or stripped wholesale during re-emission.
#[derive(Clone, Debug)]
enum SpecUse {
    /// ESM `import` or `export ... from` with the statement span.
    Esm { span: Span, spec: String },
    /// CJS top-level `require()` with the whole statement span.
    CjsTop { span: Span, spec: String },
    /// CJS nested static `require()` (no safe statement span to strip).
    CjsNested { spec: String },
}
impl SpecUse {
    /// ESM use-site with the full statement span.
    fn esm(span: Span, spec: &str) -> Self {
        Self::Esm { span, spec: spec.to_string() }
    }

    /// Top-level CJS use-site with the full statement span.
    fn cjs_top(span: Span, spec: &str) -> Self {
        Self::CjsTop { span, spec: spec.to_string() }
    }

    /// Nested CJS use-site (no statement span is safe to strip).
    fn cjs_nested(spec: String) -> Self {
        Self::CjsNested { spec }
    }

    /// The module specifier text, regardless of variant.
    fn spec(&self) -> &str {
        match self {
            Self::Esm { spec, .. } => spec,
            Self::CjsTop { spec, .. } => spec,
            Self::CjsNested { spec } => spec,
        }
    }

    /// The statement span we may comment out / strip, when one exists.
    fn strip_span(&self) -> Option<Span> {
        match self {
            Self::Esm { span, .. } | Self::CjsTop { span, .. } => Some(*span),
            Self::CjsNested { .. } => None,
        }
    }
}
/// Deduplicate uses by spec, keeping the earliest strip-able span if any.
///
/// Unlike collecting from `HashMap::into_values()` (whose iteration order is
/// arbitrary), this preserves *first-seen order* of specifiers, so dependency
/// inlining - and therefore the final bundle text - is deterministic across
/// runs of the same input.
fn dedup_spec_uses(items: Vec<SpecUse>) -> Vec<SpecUse> {
    // spec -> index into `out`, so later duplicates can upgrade the kept
    // variant in place without disturbing the order.
    let mut index: HashMap<String, usize> = HashMap::new();
    let mut out: Vec<SpecUse> = Vec::new();
    for it in items {
        let key = it.spec().to_string();
        if let Some(&i) = index.get(&key) {
            // Prefer a variant that has a span (so we can strip/comment),
            // and among spanned variants keep the earliest span.
            let replace = match (out[i].strip_span(), it.strip_span()) {
                (None, Some(_)) => true,
                (Some(old), Some(new)) => new.start < old.start,
                _ => false,
            };
            if replace {
                out[i] = it;
            }
        } else {
            index.insert(key, out.len());
            out.push(it);
        }
    }
    out
}
/// Extract the specifier from `require("...")` when the expression is a static
/// single-argument call, possibly wrapped in a member expression
/// (e.g. `require("x").foo` or `require("x")["foo"]`).
///
/// Returns `None` for dynamic requires (`require(someVar)`), multi-argument
/// calls, and anything whose root is not a `require` call.
fn extract_static_require_spec<'a>(expr: &ast::Expression<'a>) -> Option<&'a str> {
    use ast::{Argument, Expression, MemberExpression};
    match expr {
        Expression::CallExpression(call) => {
            // Only a bare identifier callee named `require` counts, and only
            // a single string-literal argument is statically resolvable.
            if let Expression::Identifier(ident) = &call.callee {
                if ident.name.as_str() == "require" && call.arguments.len() == 1 {
                    if let Argument::Expression(Expression::StringLiteral(lit)) = &call.arguments[0] {
                        return Some(lit.value.as_str());
                    }
                }
            }
            None
        }
        Expression::MemberExpression(member) => {
            // Peel one member access and recurse on its object, so
            // `require("x").foo` still yields "x".
            match member {
                MemberExpression::StaticMemberExpression(m) => extract_static_require_spec(&m.object),
                MemberExpression::ComputedMemberExpression(m) => extract_static_require_spec(&m.object),
                // A private field can never sit on a require() result we care about.
                MemberExpression::PrivateFieldExpression(_) => None,
            }
        }
        _ => None,
    }
}
/// Heuristic scan for *any* static `require("...")` in the file, used when
/// `include_all_static_requires` is enabled. We don't strip these; we only
/// inline their targets so the LLM has the code for context.
///
/// Implemented as a hand-rolled scanner equivalent to the regex
/// `require\s*\(\s*['"]([^'"]+)['"]\s*\)`, which removes the `regex` crate
/// dependency (it was never listed in the module's suggested Cargo.toml).
/// Intentionally simple: comments/strings edge cases are ignored, and the
/// first occurrence order of each distinct specifier is preserved.
fn find_all_static_requires_heuristic(source: &str) -> Vec<String> {
    // Advance past any whitespace starting at byte offset `i`.
    fn skip_ws(s: &str, mut i: usize) -> usize {
        while let Some(c) = s[i..].chars().next() {
            if !c.is_whitespace() { break; }
            i += c.len_utf8();
        }
        i
    }

    let mut specs = Vec::new();
    let mut seen = HashSet::new();
    let mut from = 0usize;
    while let Some(rel) = source[from..].find("require") {
        let kw = from + rel;
        from = kw + 1; // next search resumes just past this keyword hit
        let mut i = skip_ws(source, kw + "require".len());
        if !source[i..].starts_with('(') { continue; }
        i = skip_ws(source, i + 1);
        // Opening quote: either kind, mirroring the original pattern.
        match source[i..].chars().next() {
            Some('\'') | Some('"') => {}
            _ => continue,
        }
        i += 1;
        let start = i;
        // The specifier runs until the next quote of either kind.
        let Some(off) = source[i..].find(|c| c == '\'' || c == '"') else { continue; };
        let end = i + off;
        if end == start { continue; } // at least one specifier char required
        let after = skip_ws(source, end + 1);
        if !source[after..].starts_with(')') { continue; }
        let spec = &source[start..end];
        if seen.insert(spec.to_string()) {
            specs.push(spec.to_string());
        }
    }
    specs
}
/// True for file extensions we treat as inlinable source code
/// (the JS and TS families).
fn is_code_file(p: &Path) -> bool {
    const CODE_EXTENSIONS: [&str; 8] = ["js", "mjs", "cjs", "jsx", "ts", "mts", "cts", "tsx"];
    match p.extension().and_then(|e| e.to_str()) {
        Some(ext) => CODE_EXTENSIONS.contains(&ext),
        None => false,
    }
}
/// True when the path's extension is exactly `json`.
fn is_json_file(p: &Path) -> bool {
    matches!(p.extension().and_then(|e| e.to_str()), Some("json"))
}
/// True for TypeScript declaration files (`*.d.ts`, `*.d.mts`, `*.d.cts`).
///
/// `Path::extension()` only yields the *final* extension component ("ts" for
/// "foo.d.ts"), so comparing it against "d.ts" can never match; the compound
/// suffix must be checked against the whole file name instead.
fn is_declaration_file(p: &Path) -> bool {
    p.file_name()
        .and_then(|n| n.to_str())
        .map(|name| name.ends_with(".d.ts") || name.ends_with(".d.mts") || name.ends_with(".d.cts"))
        .unwrap_or(false)
}
/// True when `spec` names a Node.js builtin module, with or without the
/// `node:` scheme prefix.
fn is_node_builtin(spec: &str) -> bool {
    let bare = spec.strip_prefix("node:").unwrap_or(spec);
    NODEJS_BUILTINS.contains(&bare)
}
/// True for http(s) URLs and data URIs (case-insensitive) - these specifiers
/// never point at project files on disk.
fn looks_like_url(spec: &str) -> bool {
    let lowered = spec.to_ascii_lowercase();
    ["http://", "https://", "data:"]
        .iter()
        .any(|scheme| lowered.starts_with(scheme))
}
/// True when any whole path component is `node_modules`.
fn in_node_modules(p: &Path) -> bool {
    for component in p.components() {
        if component.as_os_str() == "node_modules" {
            return true;
        }
    }
    false
}
/// Walk from `start` up through its ancestors, returning the first directory
/// level at which a file named `needle` exists.
fn find_upwards(start: &Path, needle: &str) -> Option<PathBuf> {
    start
        .ancestors()
        .map(|dir| dir.join(needle))
        .find(|candidate| candidate.is_file())
}
/// Breadcrumb comment left in place of an inlined import/require statement,
/// recording both the original specifier and the resolved path.
fn replaced_comment(spec: &str, resolved: &Path) -> String {
    let mut note = String::from("/* inlined: '");
    note.push_str(spec);
    note.push_str("' => ");
    note.push_str(&resolved.display().to_string());
    note.push_str(" */\n");
    note
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment