Created
August 9, 2025 17:24
-
-
Save marclove/a1d477f9ae50d12a99d0b49e2ccc7961 to your computer and use it in GitHub Desktop.
LLM Context Bundler for TypeScript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| //! LLM Context Bundler | |
| //! -------------------- | |
| //! A tiny, **parser-driven** concatenation bundler designed specifically for | |
| //! *prompting a coding LLM* with a complete, readable view of your codebase. | |
| //! | |
| //! Goals (tailored to doc-generation / comprehension): | |
| //! - Resolve realistic JS/TS/Paths with **oxc_resolver** (tsconfig-aware) | |
| //! - Parse code with **oxc_parser**; find ESM imports and top-level CJS `require()` | |
| //! - Inline dependencies **before** each file (DFS), preserving strong file | |
| //! boundaries and breadcrumbs for the LLM | |
| //! - Keep imports in the text (as comments) so relationships remain visible | |
| //! - Skip heavy third-party code by default (exclude `node_modules`) to save tokens | |
| //! - Optional JSON inlining (commented), optional nested `require()` discovery | |
| //! - Optional output size cap with a clear truncation notice at the top | |
| //! | |
| //! Non-goals: | |
| //! - This is *not* an executable bundle. No module wrapper, no transpile. | |
| //! It's for reading only, so we favor clarity over runtime correctness. | |
| //! | |
| //! Public API: | |
| //! - `bundle(path) -> Result<String>`: default, opinionated settings for LLM prompts | |
| //! - `bundle_with_options(path, &BundleOptions) -> Result<String>`: full control | |
| //! | |
| //! Suggested `Cargo.toml` (relevant deps): | |
| //! ```toml | |
| //! [dependencies] | |
| //! oxc_resolver = "11" | |
| //! oxc_parser = "0.29" | |
| //! oxc_ast = "0.29" | |
| //! oxc_span = "0.29" | |
| //! oxc_allocator = "0.29" | |
| //! regex = "1" | |
| //! ``` | |
use std::{
    collections::{HashMap, HashSet},
    fs,
    path::{Path, PathBuf},
};

use oxc_allocator::Allocator;
use oxc_ast::ast;
use oxc_parser::Parser;
use oxc_resolver::{ResolveOptions, Resolver, TsconfigOptions, TsconfigReferences, NODEJS_BUILTINS};
// `GetSpan` is required so `stmt.span()` (used when splicing statements) resolves.
use oxc_span::{GetSpan, SourceType, Span};
// ------------------------------ Error alias ------------------------------
/// Lightweight error alias for library ergonomics.
///
/// Boxed so any `std::error::Error` (I/O, resolver, parse) can be propagated
/// with `?`; `Send + Sync` keeps the alias usable across thread boundaries.
pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
// ------------------------------ Options ----------------------------------
/// Options tuned for feeding code to an LLM, not for execution.
///
/// Construct with `BundleOptions::default()` and override individual fields,
/// then pass to `bundle_with_options`.
#[derive(Debug, Clone)]
pub struct BundleOptions {
    /// Include dependencies from `node_modules`.
    /// Default: **false** (third-party code usually wastes tokens for doc tasks)
    pub include_node_modules: bool,
    /// Try to find **any** static `require("...")` across the file, not just top-level.
    /// Default: **false** (top-level is usually enough; full scan adds work)
    pub include_all_static_requires: bool,
    /// Replace import/require statements that we inline with a comment instead of removing.
    /// Default: **true** (better breadcrumbs for an LLM)
    pub comment_instead_of_strip: bool,
    /// Maximum number of bytes to emit (not including the preface header).
    /// If reached, we stop inlining further files and mark the output as truncated.
    /// Default: **None** (no cap)
    pub max_bytes: Option<usize>,
    /// Inline `.json` files (as *commented* blocks) when imported.
    /// Default: **true** (JSON config is often helpful context)
    pub include_json: bool,
}
| impl Default for BundleOptions { | |
| fn default() -> Self { | |
| Self { | |
| include_node_modules: false, | |
| include_all_static_requires: false, | |
| comment_instead_of_strip: true, | |
| max_bytes: None, | |
| include_json: true, | |
| } | |
| } | |
| } | |
| // ------------------------------ Public API ------------------------------ | |
| /// Opinionated defaults ideal for LLM doc/comprehension prompts. | |
| /// | |
| /// *Signature preserved as requested.* | |
| pub fn bundle(ts_file_path: &Path) -> Result<String> { | |
| bundle_with_options(ts_file_path, &BundleOptions::default()) | |
| } | |
| /// Full-control entry. Prefer this if you need to tweak behavior. | |
| pub fn bundle_with_options(ts_file_path: &Path, opts: &BundleOptions) -> Result<String> { | |
| let entry = ts_file_path.canonicalize()?; | |
| let project_dir = entry | |
| .parent() | |
| .ok_or("Entry file must have a parent directory")? | |
| .canonicalize()?; | |
| // Wire tsconfig.json into the resolver if present – this enables TS path mapping. | |
| let tsconfig = find_upwards(&project_dir, "tsconfig.json"); | |
| let mut ropts = ResolveOptions { | |
| // Prefer TS/ESM first for comprehension, then common variants | |
| extensions: vec![ | |
| ".ts".into(), ".tsx".into(), ".mts".into(), ".cts".into(), | |
| ".js".into(), ".jsx".into(), ".mjs".into(), ".cjs".into(), | |
| ".json".into(), // allow JSON resolution for context | |
| ], | |
| // Node-style condition names. Include both ESM and CJS entry points. | |
| condition_names: vec!["node".into(), "import".into(), "require".into(), "default".into()], | |
| ..ResolveOptions::default() | |
| }; | |
| if let Some(config_file) = tsconfig { | |
| ropts.tsconfig = Some(TsconfigOptions { config_file, references: TsconfigReferences::Auto }); | |
| } | |
| let resolver = Resolver::new(ropts); | |
| // Internal bundler state. We keep this separate so the recursion stays clean. | |
| let mut state = State::new(resolver, opts.clone()); | |
| // Build the bundle body first; prepend a summary header at the very end | |
| // so we can include accurate counts and truncation notes. | |
| state.bundle_file(&entry)?; | |
| let body = state.out; | |
| // Compose preface summary (human- and LLM-friendly) | |
| let mut preface = String::new(); | |
| preface.push_str("/*\n"); | |
| preface.push_str("================== BUNDLE PREFACE ==================\n"); | |
| preface.push_str(&format!("Entry: {}\n", entry.display())); | |
| preface.push_str(&format!("Files inlined: {}\n", state.included_files.len())); | |
| preface.push_str(&format!("Total bytes (body): {}\n", state.total_bytes)); | |
| if let Some(max) = opts.max_bytes { | |
| preface.push_str(&format!("Max bytes: {}\n", max)); | |
| } | |
| if state.truncated { | |
| preface.push_str("NOTE: Output truncated due to max_bytes. Some dependencies were not inlined.\n"); | |
| } | |
| if !state.unresolved.is_empty() { | |
| preface.push_str("Unresolved or skipped (kept as-is):\n"); | |
| for (spec, from) in &state.unresolved { | |
| preface.push_str(&format!(" - '{}' (from {})\n", spec, from.display())); | |
| } | |
| } | |
| preface.push_str("====================================================\n"); | |
| preface.push_str("*/\n\n"); | |
| Ok(format!("{}{}", preface, body)) | |
| } | |
// ------------------------------ Core state --------------------------------
/// Internal, mutable state carried through the DFS.
///
/// Owned by a single `bundle_with_options` call; not shared across threads.
struct State {
    /// Module resolver (tsconfig-aware when a tsconfig.json was found).
    resolver: Resolver,
    /// Caller-supplied knobs (cloned so `State` is self-contained).
    opts: BundleOptions,
    // Avoid re-reading or re-emitting the same file (also breaks import cycles).
    visited: HashSet<PathBuf>,
    // Resolve cache: (context_dir, spec) -> full_path
    res_cache: HashMap<(PathBuf, String), PathBuf>,
    // Diagnostics & summary
    included_files: Vec<PathBuf>,
    /// Specifiers we could not (or chose not to) inline, paired with the file
    /// that referenced them; reported in the preface.
    unresolved: Vec<(String, PathBuf)>,
    // Output accumulation & limits
    out: String,
    /// Bytes of `out` emitted so far (tracked to enforce `max_bytes`).
    total_bytes: usize,
    /// Set once `max_bytes` was hit; all further emission is suppressed.
    truncated: bool,
}
| impl State { | |
| fn new(resolver: Resolver, opts: BundleOptions) -> Self { | |
| Self { | |
| resolver, | |
| opts, | |
| visited: HashSet::new(), | |
| res_cache: HashMap::new(), | |
| included_files: Vec::new(), | |
| unresolved: Vec::new(), | |
| out: String::new(), | |
| total_bytes: 0, | |
| truncated: false, | |
| } | |
| } | |
| /// Guard to enforce `max_bytes`. Once we hit the cap, we stop recursing. | |
| fn can_emit_more(&self) -> bool { | |
| if let Some(max) = self.opts.max_bytes { | |
| self.total_bytes < max | |
| } else { | |
| true | |
| } | |
| } | |
| /// Append bytes while tracking the cap. | |
| fn push_out(&mut self, s: &str) { | |
| if self.truncated { return; } | |
| if let Some(max) = self.opts.max_bytes { | |
| // compute how many bytes we *can* still add | |
| let remaining = max.saturating_sub(self.total_bytes); | |
| if s.len() > remaining { | |
| self.out.push_str(&s[..remaining]); | |
| self.total_bytes += remaining; | |
| self.truncated = true; | |
| return; | |
| } | |
| } | |
| self.total_bytes += s.len(); | |
| self.out.push_str(s); | |
| } | |
| /// Resolve `spec` from within `ctx_dir`, using a small cache. | |
| fn resolve(&mut self, ctx_dir: &Path, spec: &str) -> Option<PathBuf> { | |
| let key = (ctx_dir.to_path_buf(), spec.to_string()); | |
| if let Some(p) = self.res_cache.get(&key) { return Some(p.clone()); } | |
| match self.resolver.resolve(ctx_dir, spec) { | |
| Ok(res) => { | |
| let p = res.full_path().to_path_buf(); | |
| self.res_cache.insert(key, p.clone()); | |
| Some(p) | |
| } | |
| Err(_) => None | |
| } | |
| } | |
| /// Depth-first inline of a single file. | |
| fn bundle_file(&mut self, file_path: &Path) -> Result<()> { | |
| let path = file_path.canonicalize()?; | |
| if !self.visited.insert(path.clone()) { | |
| // already emitted this file | |
| return Ok(()); | |
| } | |
| if is_declaration_file(&path) { | |
| return Ok(()); | |
| } | |
| if !self.can_emit_more() { self.truncated = true; return Ok(()); } | |
| let source = fs::read_to_string(&path)?; | |
| // Parse the file into an AST for reliable import/require collection. | |
| let allocator = Allocator::default(); | |
| let source_type = SourceType::from_path(&path).unwrap_or(SourceType::ts()); | |
| let parser = Parser::new(&allocator, &source, source_type); | |
| let parsed = parser.parse(); | |
| let program = parsed.program; | |
| // 1) Collect ESM imports / re-exports (with source) and top-level CJS requires. | |
| let mut to_inline: Vec<SpecUse> = Vec::new(); | |
| // ESM: keep exact statement spans so we can comment/strip only those lines. | |
| for stmt in &program.body { | |
| match stmt { | |
| ast::Statement::ImportDeclaration(n) => { | |
| if let Some(src) = &n.source { to_inline.push(SpecUse::esm(n.span, &src.value)); } | |
| } | |
| ast::Statement::ExportNamedDeclaration(n) => { | |
| if let Some(src) = &n.source { to_inline.push(SpecUse::esm(n.span, &src.value)); } | |
| } | |
| ast::Statement::ExportAllDeclaration(n) => { | |
| if let Some(src) = &n.source { to_inline.push(SpecUse::esm(n.span, &src.value)); } | |
| } | |
| _ => {} | |
| } | |
| } | |
| // CJS: top-level require forms -> strip or comment the *entire* statement. | |
| for stmt in &program.body { | |
| if let ast::Statement::ExpressionStatement(expr_stmt) = stmt { | |
| if let Some(spec) = extract_static_require_spec(&expr_stmt.expression) { | |
| to_inline.push(SpecUse::cjs_top(expr_stmt.span, spec)); | |
| } | |
| } | |
| if let ast::Statement::VariableDeclaration(var_decl) = stmt { | |
| for decl in &var_decl.declarations { | |
| if let Some(init) = &decl.init { | |
| if let Some(spec) = extract_static_require_spec(init) { | |
| to_inline.push(SpecUse::cjs_top(var_decl.span, spec)); | |
| break; // Remove once per statement | |
| } | |
| } | |
| } | |
| } | |
| } | |
| // Optionally: discover ANY static `require("...")` (not only top-level) via a | |
| // lightweight heuristic. We do NOT strip these (could be inside expressions), | |
| // but we inline their targets before this file so the LLM sees the code too. | |
| if self.opts.include_all_static_requires { | |
| for spec in find_all_static_requires_heuristic(&source) { | |
| to_inline.push(SpecUse::cjs_nested(spec)); | |
| } | |
| } | |
| // Deduplicate by spec while keeping earliest statement span for stripping. | |
| to_inline = dedup_spec_uses(to_inline); | |
| // 2) Resolve & inline dependencies first (DFS). Track which statements we actually inlined | |
| // so we can replace those specific import/require statements with a small comment. | |
| let mut replacements: HashMap<Span, String> = HashMap::new(); | |
| for item in &to_inline { | |
| let spec = item.spec(); | |
| // Skip node builtins and URLs/data URIs – they are not project files. | |
| if is_node_builtin(spec) || looks_like_url(spec) { continue; } | |
| let ctx_dir = path.parent().ok_or("File without parent directory")?.to_path_buf(); | |
| let Some(resolved) = self.resolve(&ctx_dir, spec) else { | |
| self.unresolved.push((spec.to_string(), path.clone())); | |
| continue; | |
| }; | |
| // Skip declaration files; optionally skip node_modules to save tokens. | |
| if is_declaration_file(&resolved) { continue; } | |
| if !self.opts.include_node_modules && in_node_modules(&resolved) { continue; } | |
| if is_json_file(&resolved) { | |
| if self.opts.include_json { | |
| // Inline JSON as a commented block; ideal for documentation context. | |
| self.emit_json_block(&resolved)?; | |
| } | |
| // If we emitted JSON, treat as "inlined" for ESM/CJS statement replacement. | |
| if let Some(sp) = item.strip_span() { replacements.insert(sp, replaced_comment(spec, &resolved)); } | |
| continue; | |
| } | |
| if is_code_file(&resolved) { | |
| self.bundle_file(&resolved)?; // DFS inline | |
| if let Some(sp) = item.strip_span() { | |
| replacements.insert(sp, replaced_comment(spec, &resolved)); | |
| } | |
| } else { | |
| // Other asset types – keep import as-is, but note unresolved/unsupported. | |
| self.unresolved.push((spec.to_string(), path.clone())); | |
| } | |
| if !self.can_emit_more() { self.truncated = true; break; } | |
| } | |
| // 3) Write *this* file with optional statement replacement (comment or strip). | |
| // We reconstruct by streaming the original source and splicing where needed. | |
| self.push_out("/* ===== FILE START ===== */\n"); | |
| self.push_out(&format!("/* PATH: {} */\n", path.display())); | |
| let mut cursor = 0usize; | |
| for stmt in &program.body { | |
| let sp = stmt.span(); | |
| if let Some(repl) = replacements.get(&sp) { | |
| // Emit text *before* the statement, then our breadcrumb comment. | |
| let start = sp.start as usize; | |
| let end = sp.end as usize; | |
| if start > cursor { self.push_out(&source[cursor..start]); } | |
| if self.opts.comment_instead_of_strip { self.push_out(repl); } | |
| // else: drop the statement entirely | |
| cursor = end; | |
| } | |
| } | |
| if cursor < source.len() { self.push_out(&source[cursor..]); } | |
| self.push_out("\n/* ===== FILE END ===== */\n\n"); | |
| // Record for preface summary. | |
| self.included_files.push(path); | |
| Ok(()) | |
| } | |
| /// Emit a commented JSON block. This keeps the raw JSON visible to the LLM without | |
| /// affecting any parsing logic in downstream tools. | |
| fn emit_json_block(&mut self, json_path: &Path) -> Result<()> { | |
| if !self.can_emit_more() { self.truncated = true; return Ok(()); } | |
| let text = fs::read_to_string(json_path)?; | |
| self.push_out("/* ===== FILE START ===== */\n"); | |
| self.push_out(&format!("/* PATH: {} (JSON) */\n", json_path.display())); | |
| self.push_out("/* BEGIN JSON */\n"); | |
| self.push_out(&text); | |
| self.push_out("\n/* END JSON */\n"); | |
| self.push_out("/* ===== FILE END ===== */\n\n"); | |
| Ok(()) | |
| } | |
| } | |
// ------------------------------ Helpers -----------------------------------
/// Represents a dependency use-site we discovered.
///
/// Variants with a `span` cover a whole statement that can safely be replaced
/// or stripped; `CjsNested` has no span because the `require()` may sit inside
/// an arbitrary expression.
#[derive(Clone, Debug)]
enum SpecUse {
    /// ESM `import` or `export ... from` with the statement span.
    Esm { span: Span, spec: String },
    /// CJS top-level `require()` with the whole statement span.
    CjsTop { span: Span, spec: String },
    /// CJS nested static `require()` (no safe statement span to strip).
    CjsNested { spec: String },
}
| impl SpecUse { | |
| fn esm(span: Span, spec: &str) -> Self { Self::Esm { span, spec: spec.to_string() } } | |
| fn cjs_top(span: Span, spec: &str) -> Self { Self::CjsTop { span, spec: spec.to_string() } } | |
| fn cjs_nested(spec: String) -> Self { Self::CjsNested { spec } } | |
| fn spec(&self) -> &str { | |
| match self { Self::Esm { spec, .. } | Self::CjsTop { spec, .. } | Self::CjsNested { spec } => spec } | |
| } | |
| fn strip_span(&self) -> Option<Span> { | |
| match self { Self::Esm { span, .. } | Self::CjsTop { span, .. } => Some(*span), _ => None } | |
| } | |
| } | |
| /// Deduplicate uses by spec, keeping the earliest strip-able span if any. | |
| fn dedup_spec_uses(mut items: Vec<SpecUse>) -> Vec<SpecUse> { | |
| let mut seen: HashMap<String, SpecUse> = HashMap::new(); | |
| for it in items.drain(..) { | |
| let key = it.spec().to_string(); | |
| seen.entry(key).and_modify(|existing| { | |
| // Prefer a variant that has a span (so we can strip/comment), and keep the earliest span. | |
| match (existing.strip_span(), it.strip_span()) { | |
| (None, Some(_)) => { *existing = it.clone(); } | |
| (Some(old), Some(new)) if new.start < old.start => { *existing = it.clone(); } | |
| _ => {} | |
| } | |
| }).or_insert(it); | |
| } | |
| seen.into_values().collect() | |
| } | |
/// Extract `require("...")` if the expression is a static single-argument call,
/// possibly wrapped in a member expression (e.g. `require("x").foo`).
///
/// Returns the specifier string (borrowed from the AST arena, hence `'a`) or
/// `None` for anything dynamic (`require(variable)`, template strings, etc.).
fn extract_static_require_spec<'a>(expr: &ast::Expression<'a>) -> Option<&'a str> {
    use ast::{Argument, Expression, MemberExpression};
    match expr {
        Expression::CallExpression(call) => {
            // Only a bare `require(...)` identifier call with exactly one
            // string-literal argument counts as static.
            if let Expression::Identifier(ident) = &call.callee {
                if ident.name.as_str() == "require" && call.arguments.len() == 1 {
                    if let Argument::Expression(Expression::StringLiteral(lit)) = &call.arguments[0] {
                        return Some(lit.value.as_str());
                    }
                }
            }
            None
        }
        Expression::MemberExpression(member) => {
            // Unwrap `require("x").foo` / `require("x")["foo"]` by recursing
            // into the member expression's object.
            match member {
                MemberExpression::StaticMemberExpression(m) => extract_static_require_spec(&m.object),
                MemberExpression::ComputedMemberExpression(m) => extract_static_require_spec(&m.object),
                MemberExpression::PrivateFieldExpression(_) => None,
            }
        }
        _ => None,
    }
}
| /// Heuristic (regex) for discovering *any* `require("...")` in the file when | |
| /// `include_all_static_requires` is enabled. We don't strip these; we only inline | |
| /// their targets so the LLM has the code for context. | |
| fn find_all_static_requires_heuristic(source: &str) -> Vec<String> { | |
| // NOTE: This is intentionally simple; it is good enough for doc prompts and avoids | |
| // writing a full AST walker. It ignores comments/strings edge cases. | |
| let re = regex::Regex::new(r#"require\s*\(\s*['\"]([^'\"]+)['\"]\s*\)"#).unwrap(); | |
| let mut v = Vec::new(); | |
| let mut seen = HashSet::new(); | |
| for cap in re.captures_iter(source) { | |
| if let Some(m) = cap.get(1) { if seen.insert(m.as_str().to_string()) { v.push(m.as_str().to_string()); } } | |
| } | |
| v | |
| } | |
/// True for JS/TS source extensions we know how to parse and inline.
fn is_code_file(p: &Path) -> bool {
    let ext = p.extension().and_then(|e| e.to_str());
    matches!(
        ext,
        Some("js" | "mjs" | "cjs" | "jsx" | "ts" | "mts" | "cts" | "tsx")
    )
}
/// True when the resolved path has a `.json` extension.
fn is_json_file(p: &Path) -> bool {
    matches!(p.extension().and_then(|e| e.to_str()), Some("json"))
}
/// True for TypeScript declaration files (`*.d.ts`, `*.d.mts`, `*.d.cts`).
///
/// BUGFIX: `Path::extension()` on `foo.d.ts` returns `"ts"`, never `"d.ts"`,
/// so the old comparison against `"d.ts"` could never match and declaration
/// files were always bundled. Check the file *name* suffix instead.
fn is_declaration_file(p: &Path) -> bool {
    p.file_name()
        .and_then(|n| n.to_str())
        .map(|n| n.ends_with(".d.ts") || n.ends_with(".d.mts") || n.ends_with(".d.cts"))
        .unwrap_or(false)
}
| fn is_node_builtin(spec: &str) -> bool { | |
| let s = spec.strip_prefix("node:").unwrap_or(spec); | |
| NODEJS_BUILTINS.iter().any(|&b| b == s) | |
| } | |
/// True for http(s) URLs and data URIs, which are never project files.
fn looks_like_url(spec: &str) -> bool {
    let lower = spec.to_ascii_lowercase();
    ["http://", "https://", "data:"]
        .iter()
        .any(|prefix| lower.starts_with(prefix))
}
/// True when any path segment is `node_modules`.
fn in_node_modules(p: &Path) -> bool {
    p.iter().any(|segment| segment == "node_modules")
}
/// Walk upward from `start` (inclusive) looking for a file named `needle`.
/// Returns the first match, or `None` if no ancestor directory contains it.
fn find_upwards(start: &Path, needle: &str) -> Option<PathBuf> {
    start
        .ancestors()
        .map(|dir| dir.join(needle))
        .find(|candidate| candidate.is_file())
}
/// Breadcrumb comment left where an inlined import/require statement used to be.
fn replaced_comment(spec: &str, resolved: &Path) -> String {
    let mut note = String::from("/* inlined: '");
    note.push_str(spec);
    note.push_str("' => ");
    note.push_str(&resolved.display().to_string());
    note.push_str(" */\n");
    note
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment