You probably want to drop this into your repo and run it via npx and tsx. First make sure you have the dependencies installed with npm/yarn/pnpm; you need `ignore` and `yargs` (plus `tsx` itself). Then run:

```sh
npx tsx src/scripts/serialize-repo.ts
```

Cheers! ✨
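The examples in the help output below call the script through a package alias. A minimal sketch of what that could look like in package.json (the alias name and file path are assumptions; adjust to your layout):

```json
{
  "scripts": {
    "serialize-repo": "tsx src/scripts/serialize-repo.ts"
  }
}
```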
```text
Options:
      --version  Show version number                                 [boolean]
  -s, --size     Chunk size in megabytes         [number] [default: Infinity]
  -p, --path     Base path to serialize (optional)                    [string]
      --help     Show help                                            [boolean]

Examples:
  pnpm serialize-repo                          Serialize entire repository
                                               into a single file
  pnpm serialize-repo -s 10                    Split repository into 10MB
                                               chunks
  pnpm serialize-repo -p src/app               Serialize only the src/app
                                               directory
  pnpm serialize-repo -s 5 -p src/components   Split src/components into 5MB
                                               chunks
```
```ts
/**
 * This script reads all text-based files in a Git repository or a specified directory,
 * splits them into chunks based on size, and writes those chunks to disk in a structured format.
 * It also calculates a checksum to keep track of repository state.
 */
import { execSync } from 'child_process';
import crypto from 'crypto';
import fs from 'fs/promises';
import ignore, { Ignore } from 'ignore';
import path from 'path';
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';

/** Logger to output messages (replace with your preferred logger). */
const logger = console;
/**
 * Set of known file extensions that are typically binary.
 * Files with these extensions won't be read for textual content.
 */
const BINARY_FILE_EXTENSIONS = new Set([
  '.jpg', '.jpeg', '.png', '.gif', '.ico', '.webp', '.pdf', '.mp4',
  '.webm', '.mov', '.mp3', '.wav', '.ttf', '.woff', '.woff2', '.eot',
  '.exe', '.dll', '.bin', '.iso', '.img', '.dmg', '.dat', '.sys',
  '.so', '.o', '.a', '.lib', '.class', '.jar', '.apk', '.com',
  '.elf', '.drv', '.rom', '.vhd', '.vhdx', '.gho', '.efi', '.bpl',
  '.cpl', '.ocx', '.scr', '.rco', '.ovl', '.mo', '.nib', '.xap',
  '.psf', '.pak', '.img3', '.img4', '.msi', '.cab', '.otf', '.cur',
  '.ani', '.swf', '.fla', '.flv', '.mpg', '.mpeg', '.avi', '.wmv',
  '.mkv', '.ogg', '.ogv', '.wma', '.mid', '.midi', '.aac', '.flac',
  '.bmp', '.psd', '.ai', '.eps', '.raw', '.tif', '.tiff', '.3ds',
  '.max', '.obj', '.fbx', '.blend', '.crt', '.key', '.pem', '.der',
  '.png2', '.jp2', '.swc', '.mso', '.p12', '.p7b', '.gbr', '.pcb',
  '.icns', '.xdf', '.zip', '.rar', '.7z', '.gz', '.tar', '.tgz',
  '.bz2', '.xz'
]);
/**
 * Represents a single file entry with its path and content.
 */
interface FileEntry {
  /**
   * Relative file path.
   */
  path: string;
  /**
   * File content as a string.
   */
  content: string;
}

/**
 * Narrows any unknown type error to a standard Error object.
 * @param error The unknown type to be asserted.
 * @throws Throws a new Error if the provided error is not an instance of Error.
 */
function assertError(error: unknown): asserts error is Error {
  if (!(error instanceof Error)) throw new Error('Unknown error type');
}

/**
 * Reads the `.gitignore` file if present and returns an Ignore instance.
 * @returns An Ignore instance containing rules from `.gitignore`.
 */
async function readGitignore(): Promise<Ignore> {
  const ig = ignore();
  try {
    const gitignore = await fs.readFile('.gitignore', 'utf-8');
    ig.add(gitignore);
  } catch (error) {
    assertError(error);
    logger.warn('No .gitignore found, proceeding without it:', error.message);
  }
  return ig;
}
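// Note: only the .gitignore in the current working directory is read; nested
// .gitignore files and global git excludes are not applied.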
/**
 * Checks if a file is likely to be text by extension and by scanning its first 4KB for null bytes.
 * @param filePath Absolute path to the file.
 * @returns Boolean indicating if the file is text (true) or binary (false).
 */
async function isTextFile(filePath: string): Promise<boolean> {
  const ext = path.extname(filePath).toLowerCase();
  if (BINARY_FILE_EXTENSIONS.has(ext)) return false;
  try {
    const fd = await fs.open(filePath, 'r');
    try {
      const buffer = Buffer.alloc(4096);
      const { bytesRead } = await fd.read(buffer, 0, 4096, 0);
      for (let i = 0; i < bytesRead; i++) {
        if (buffer[i] === 0) return false; // Null byte detected
      }
      return true;
    } finally {
      await fd.close(); // Always release the handle, even if the read throws
    }
  } catch (error) {
    assertError(error);
    logger.error('Error checking file type:', error.message);
    return false;
  }
}
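// Caveat: UTF-16/UTF-32 encoded text legitimately contains null bytes, so the
// scan above classifies such files as binary. For UTF-8 source trees this is
// a safe trade-off.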
/**
 * Recursively walks through a directory, yielding file paths of text-based files
 * while respecting `.gitignore` rules and optional basePath constraints.
 * @param dir The directory to walk.
 * @param ig Ignore instance for filtering paths.
 * @param basePath Optional absolute root path for restricting traversal.
 * @param base Internal parameter for forming relative paths.
 */
async function* walkDirectory(
  dir: string,
  ig: Ignore,
  basePath?: string,
  base = '',
): AsyncGenerator<string> {
  const entries = await fs.readdir(dir, { withFileTypes: true });
  for (const entry of entries) {
    const relativePath = path.join(base, entry.name);
    const fullPath = path.join(dir, entry.name);
    // Never descend into the .git directory itself
    if (entry.isDirectory() && entry.name === '.git') continue;
    // Skip if path is outside basePath
    if (basePath && !fullPath.startsWith(basePath)) continue;
    // Skip paths ignored by .gitignore; directory patterns like `dist/` only
    // match when the tested path ends with a slash
    if (ig.ignores(entry.isDirectory() ? `${relativePath}/` : relativePath)) continue;
    if (entry.isDirectory()) {
      yield* walkDirectory(fullPath, ig, basePath, relativePath);
    } else if (entry.isFile() && (await isTextFile(fullPath))) {
      yield fullPath;
    }
  }
}
/**
 * Computes a short hash representing the tracked files in a Git repository.
 * Uses Git to list and hash tracked files, then adds chunk size info to the hash.
 * @param chunkSize The maximum chunk size in MB.
 * @returns A short hash string or a timestamp-based fallback if not in a Git repository.
 */
async function getRepoChecksum(chunkSize: number): Promise<string> {
  try {
    const trackedFiles = execSync('git ls-files -c --exclude-standard')
      .toString()
      .trim()
      .split('\n')
      .sort();
    const hash = crypto.createHash('sha256');
    for (const file of trackedFiles) {
      try {
        const fileHash = execSync(`git hash-object "${file}"`).toString().trim();
        hash.update(`${file}:${fileHash}\n`);
      } catch (error) {
        assertError(error);
        // Skip files that can't be hashed
        continue;
      }
    }
    // Include chunkSize to differentiate different chunk settings
    if (chunkSize !== Infinity) hash.update(chunkSize.toString());
    return hash.digest('hex').slice(0, 8);
  } catch (error) {
    assertError(error);
    // If not a Git repo, return timestamp as fallback
    logger.warn('Not a git repository, using timestamp as fallback:', error.message);
    return Date.now().toString(36);
  }
}
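// Performance note: `git ls-files -s` prints each tracked file's blob hash in
// a single invocation, so it could replace the per-file `git hash-object`
// calls above if spawning one process per file ever becomes a bottleneck.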
/**
 * Writes a collection of FileEntry objects to disk as a single chunk.
 * @param files An array of FileEntry objects to write.
 * @param index The chunk index for naming the output file.
 * @param outputDir The directory to which the chunk file will be written.
 */
async function writeChunk(files: FileEntry[], index: number, outputDir: string): Promise<void> {
  const chunk = files.map((file) => `>>>> ${file.path}\n${file.content}`).join('\n\n');
  const outputPath = path.join(outputDir, `chunk-${index}.txt`);
  await fs.writeFile(outputPath, chunk, 'utf-8');
  logger.info(`Written chunk ${index} with ${files.length} files`);
}
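/*
 * Illustrative chunk layout (paths and contents are placeholders):
 *
 *   >>>> src/index.ts
 *   ...file contents...
 *
 *   >>>> src/utils.ts
 *   ...file contents...
 */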
/**
 * Options controlling repository serialization behavior.
 */
interface SerializeOptions {
  /**
   * Maximum chunk size in megabytes. Use Infinity for a single-chunk output.
   */
  chunkSizeMB: number;
  /**
   * Base path to serialize. Defaults to the current working directory if omitted.
   */
  basePath?: string;
}

/**
 * Serializes text-based files in a repository or subdirectory into chunks.
 * Each chunk is written as a single text file containing multiple file contents.
 * @param options The serialization options including chunk size and optional base path.
 * @returns The output directory path where all chunk files are stored.
 */
async function serializeRepo(options: SerializeOptions): Promise<string> {
  const { chunkSizeMB, basePath } = options;
  const checksum = await getRepoChecksum(chunkSizeMB);
  const pathSuffix = basePath ? `_${path.basename(basePath)}` : '';
  const dirName =
    chunkSizeMB === Infinity
      ? `${checksum}${pathSuffix}`
      : `${checksum}${pathSuffix}_${chunkSizeMB}mb`;
  const outputDir = path.join(process.cwd(), 'repo-serialized', dirName);
  await fs.mkdir(outputDir, { recursive: true });
  const ig = await readGitignore();
  const files: FileEntry[] = [];
  let currentChunkSize = 0;
  let chunkIndex = 0;
  const startPath = basePath ? path.resolve(process.cwd(), basePath) : process.cwd();
  for await (const filePath of walkDirectory(startPath, ig, startPath)) {
    try {
      const content = await fs.readFile(filePath, 'utf-8');
      const fileSize = Buffer.byteLength(content, 'utf-8');
      // If the next file would overflow the chunk, flush the current chunk
      // first; the non-empty guard avoids writing an empty chunk when a
      // single file is larger than the chunk size
      if (files.length > 0 && currentChunkSize + fileSize > chunkSizeMB * 1024 * 1024) {
        await writeChunk(files, chunkIndex++, outputDir);
        files.length = 0;
        currentChunkSize = 0;
      }
      files.push({
        path: path.relative(process.cwd(), filePath),
        content,
      });
      currentChunkSize += fileSize;
    } catch (error) {
      assertError(error);
      logger.error(`Error processing file ${filePath}:`, error.message);
    }
  }
  // Write any leftover files as the final chunk
  if (files.length > 0) {
    await writeChunk(files, chunkIndex, outputDir);
  }
  return outputDir;
}
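// Example programmatic use (illustrative values; the CLI below is the real
// entry point):
//
//   const outDir = await serializeRepo({ chunkSizeMB: 10, basePath: 'src/app' });
//   logger.info(`Chunks written to ${outDir}`);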
// Configure command-line options
const argv = yargs(hideBin(process.argv))
  .option('size', {
    alias: 's',
    type: 'number',
    description: 'Chunk size in megabytes',
    default: Infinity,
  })
  .option('path', {
    alias: 'p',
    type: 'string',
    description: 'Base path to serialize (optional)',
  })
  .example('pnpm serialize-repo', 'Serialize entire repository into a single file')
  .example('pnpm serialize-repo -s 10', 'Split repository into 10MB chunks')
  .example('pnpm serialize-repo -p src/app', 'Serialize only the src/app directory')
  .example('pnpm serialize-repo -s 5 -p src/components', 'Split src/components into 5MB chunks')
  .check(async (argv) => {
    if (Number.isNaN(argv.size) || argv.size <= 0) {
      throw new Error('Please provide a valid chunk size in megabytes');
    }
    if (
      argv.path &&
      !(await fs
        .access(argv.path)
        .then(() => true)
        .catch(() => false))
    ) {
      throw new Error('Provided path does not exist');
    }
    return true;
  })
  .help().argv;
/**
 * Main entry point. Parses command-line options, then serializes the repository.
 */
async function main() {
  const { size, path: basePath } = await argv;
  logger.info(
    `Serializing repo from ${basePath || 'root'}${
      size !== Infinity ? ` with chunk size ${size}MB` : ''
    }`,
  );
  const outputDir = await serializeRepo({ chunkSizeMB: size, basePath });
  logger.info(`✨ Repository serialized successfully!`);
  if (size !== Infinity) {
    const files = await fs.readdir(outputDir);
    logger.info(`Generated chunks:`);
    for (const file of files) {
      logger.info(path.join(outputDir, file));
    }
  } else {
    logger.info(`Output file:`);
    logger.info(path.join(outputDir, 'chunk-0.txt'));
  }
}

// Execute main
void main();
```