@mohsen1
Serialize repo for LLMs

serialize-repo.ts

You probably want to put this in your repo and run it with npx and tsx.

First, make sure you have the dependencies installed with npm/yarn/pnpm. You need ignore, lodash, and yargs.
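For example, with npm (yarn and pnpm work the same way):

npm install ignore lodash yargs

Then run the script directly: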

npx tsx src/scripts/serialize-repo.ts

Cheers! ✨

Options:
      --version  Show version number                             [boolean]
  -s, --size     Chunk size in megabytes      [number] [default: Infinity]
  -p, --path     Base path to serialize (optional)                [string]
      --help     Show help                                        [boolean]

Examples:
  pnpm serialize-repo                           Serialize entire repository into a single file
  pnpm serialize-repo -s 10                     Split repository into 10MB chunks
  pnpm serialize-repo -p src/app                Serialize only the src/app directory
  pnpm serialize-repo -s 5 -p src/components   Split src/components into 5MB chunks
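The pnpm serialize-repo form above assumes a package.json script along the lines of "serialize-repo": "tsx src/scripts/serialize-repo.ts" (the script name is only an assumption); running the file with npx tsx behaves identically.

Output is written to a repo-serialized directory under the current working directory, named with a short checksum of the tracked files plus optional path and chunk-size suffixes. A hypothetical run with -s 10 -p src might produce (the a1b2c3d4 checksum is made up):

repo-serialized/
  a1b2c3d4_src_10mb/
    chunk-0.txt
    chunk-1.txt
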
/**
 * This script reads all text-based files in a Git repository or a specified directory,
 * splits them into chunks based on size, and writes those chunks to disk in a structured format.
 * It also calculates a checksum to keep track of repository state.
 */
import { execSync } from 'child_process';
import crypto from 'crypto';
import fs from 'fs/promises';
import ignore, { Ignore } from 'ignore';
import _ from 'lodash';
import path from 'path';
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';
/** Logger to output messages (replace with your preferred logger). */
const logger = console;
/**
 * Set of known file extensions that are typically binary.
 * Files with these extensions won't be read for textual content.
 */
const BINARY_FILE_EXTENSIONS = new Set([
  '.jpg', '.jpeg', '.png', '.gif', '.ico', '.webp', '.pdf', '.mp4',
  '.webm', '.mov', '.mp3', '.wav', '.ttf', '.woff', '.woff2', '.eot',
  '.exe', '.dll', '.bin', '.iso', '.img', '.dmg', '.dat', '.sys',
  '.so', '.o', '.a', '.lib', '.class', '.jar', '.apk', '.com',
  '.elf', '.drv', '.rom', '.vhd', '.vhdx', '.gho', '.efi', '.bpl',
  '.cpl', '.ocx', '.scr', '.rco', '.ovl', '.mo', '.nib', '.xap',
  '.psf', '.pak', '.img3', '.img4', '.msi', '.cab', '.otf', '.cur',
  '.ani', '.swf', '.fla', '.flv', '.mpg', '.mpeg', '.avi', '.wmv',
  '.mkv', '.ogg', '.ogv', '.wma', '.mid', '.midi', '.aac', '.flac',
  '.bmp', '.psd', '.ai', '.eps', '.raw', '.tif', '.tiff', '.3ds',
  '.max', '.obj', '.fbx', '.blend', '.crt', '.key', '.pem', '.der',
  '.png2', '.jp2', '.swc', '.mso', '.p12', '.p7b', '.gbr', '.pcb',
  '.icns', '.xdf', '.zip', '.rar', '.7z', '.gz', '.tar', '.tgz',
  '.bz2', '.xz'
]);
/**
 * Represents a single file entry with its path and content.
 */
interface FileEntry {
  /**
   * Relative file path.
   */
  path: string;
  /**
   * File content as a string.
   */
  content: string;
}
/**
 * Narrows any unknown type error to a standard Error object.
 * @param error The unknown type to be asserted.
 * @throws Throws a new Error if the provided error is not an instance of Error.
 */
function assertError(error: unknown): asserts error is Error {
  if (!(error instanceof Error)) throw new Error('Unknown error type');
}
/**
 * Reads the `.gitignore` file if present and returns an Ignore instance.
 * @returns An Ignore instance containing rules from `.gitignore`.
 */
async function readGitignore(): Promise<Ignore> {
  const ig = ignore();
  try {
    const gitignore = await fs.readFile('.gitignore', 'utf-8');
    ig.add(gitignore);
  } catch (error) {
    assertError(error);
    logger.warn('No .gitignore found, proceeding without it:', error.message);
  }
  return ig;
}
/**
 * Checks if a file is likely to be text by extension and by scanning its first 4KB for null bytes.
 * @param filePath Absolute path to the file.
 * @returns Boolean indicating if the file is text (true) or binary (false).
 */
async function isTextFile(filePath: string): Promise<boolean> {
  const ext = path.extname(filePath).toLowerCase();
  if (BINARY_FILE_EXTENSIONS.has(ext)) return false;
  try {
    const fd = await fs.open(filePath, 'r');
    const buffer = Buffer.alloc(4096);
    const { bytesRead } = await fd.read(buffer, 0, 4096, 0);
    await fd.close();
    for (let i = 0; i < bytesRead; i++) {
      if (buffer[i] === 0) return false; // Null byte detected
    }
    return true;
  } catch (error) {
    assertError(error);
    logger.error('Error checking file type:', error.message);
    return false;
  }
}
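// For example, isTextFile('logo.png') resolves to false from the extension check alone,
// while a UTF-8 source file with no null bytes in its first 4KB resolves to true.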
/**
 * Recursively walks through a directory, yielding file paths of text-based files
 * while respecting `.gitignore` rules and optional basePath constraints.
 * @param dir The directory to walk.
 * @param ig Ignore instance for filtering paths.
 * @param basePath Optional absolute root path for restricting traversal.
 * @param base Internal parameter for forming relative paths.
 */
async function* walkDirectory(
  dir: string,
  ig: Ignore,
  basePath?: string,
  base = '',
): AsyncGenerator<string> {
  const entries = await fs.readdir(dir, { withFileTypes: true });
  for (const entry of entries) {
    const relativePath = path.join(base, entry.name);
    const fullPath = path.join(dir, entry.name);
    // Skip if path is outside basePath
    if (basePath && !fullPath.startsWith(basePath)) continue;
    // Skip paths ignored by .gitignore
    if (ig.ignores(relativePath)) continue;
    if (entry.isDirectory()) {
      yield* walkDirectory(fullPath, ig, basePath, relativePath);
    } else if (entry.isFile() && (await isTextFile(fullPath))) {
      yield fullPath;
    }
  }
}
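// Note: only .gitignore rules are consulted above, so directories such as .git
// are traversed unless they happen to be listed in .gitignore.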
/**
 * Computes a short hash representing the tracked files in a Git repository.
 * Uses Git to list and hash tracked files, then adds chunk size info to the hash.
 * @param chunkSize The maximum chunk size in MB.
 * @returns A short hash string or a timestamp-based fallback if not in a Git repository.
 */
async function getRepoChecksum(chunkSize: number): Promise<string> {
  try {
    const trackedFiles = execSync('git ls-files -c --exclude-standard')
      .toString()
      .trim()
      .split('\n')
      .sort();
    const hash = crypto.createHash('sha256');
    for (const file of trackedFiles) {
      try {
        const fileHash = execSync(`git hash-object "${file}"`).toString().trim();
        hash.update(`${file}:${fileHash}\n`);
      } catch (error) {
        assertError(error);
        // Skip files that can't be hashed
        continue;
      }
    }
    // Include chunkSize to differentiate different chunk settings
    if (chunkSize !== Infinity) hash.update(chunkSize.toString());
    return hash.digest('hex').slice(0, 8);
  } catch (error) {
    assertError(error);
    // If not a Git repo, return timestamp as fallback
    logger.warn('Not a git repository, using timestamp as fallback:', error.message);
    return Date.now().toString(36);
  }
}
/**
 * Writes a collection of FileEntry objects to disk as a single chunk.
 * @param files An array of FileEntry objects to write.
 * @param index The chunk index for naming the output file.
 * @param outputDir The directory to which the chunk file will be written.
 */
async function writeChunk(files: FileEntry[], index: number, outputDir: string): Promise<void> {
  const chunk = files.map((file) => `>>>> ${file.path}\n${file.content}`).join('\n\n');
  const outputPath = path.join(outputDir, `chunk-${index}.txt`);
  await fs.writeFile(outputPath, chunk, 'utf-8');
  logger.info(`Wrote chunk ${index} with ${files.length} files`);
}
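// Each chunk file separates entries with a blank line and a ">>>> <path>" header, e.g.:
//
// >>>> src/index.ts
// <contents of src/index.ts>
//
// >>>> src/utils/helpers.ts
// <contents of src/utils/helpers.ts>
//
// (the paths above are purely illustrative)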
/**
 * Options controlling repository serialization behavior.
 */
interface SerializeOptions {
  /**
   * Maximum chunk size in megabytes. Use Infinity for a single-chunk output.
   */
  chunkSizeMB: number;
  /**
   * Base path to serialize. Defaults to the current working directory if omitted.
   */
  basePath?: string;
}
/**
 * Serializes text-based files in a repository or subdirectory into chunks.
 * Each chunk is written as a single text file containing multiple file contents.
 * @param options The serialization options including chunk size and optional base path.
 * @returns The output directory path where all chunk files are stored.
 */
async function serializeRepo(options: SerializeOptions): Promise<string> {
  const { chunkSizeMB, basePath } = options;
  const checksum = await getRepoChecksum(chunkSizeMB);
  const pathSuffix = basePath ? `_${path.basename(basePath)}` : '';
  const dirName =
    chunkSizeMB === Infinity
      ? `${checksum}${pathSuffix}`
      : `${checksum}${pathSuffix}_${chunkSizeMB}mb`;
  const outputDir = path.join(process.cwd(), 'repo-serialized', dirName);
  await fs.mkdir(outputDir, { recursive: true });
  const ig = await readGitignore();
  const files: FileEntry[] = [];
  let currentChunkSize = 0;
  let chunkIndex = 0;
  const startPath = basePath ? path.resolve(process.cwd(), basePath) : process.cwd();
  for await (const filePath of walkDirectory(startPath, ig, startPath)) {
    try {
      const content = await fs.readFile(filePath, 'utf-8');
      const fileSize = Buffer.byteLength(content, 'utf-8');
      // If next file exceeds chunk size, write the current chunk first
      if (currentChunkSize + fileSize > chunkSizeMB * 1024 * 1024) {
        await writeChunk(files, chunkIndex++, outputDir);
        files.length = 0;
        currentChunkSize = 0;
      }
      files.push({
        path: path.relative(process.cwd(), filePath),
        content,
      });
      currentChunkSize += fileSize;
    } catch (error) {
      assertError(error);
      logger.error(`Error processing file ${filePath}:`, error.message);
    }
  }
  // Write any leftover files as the final chunk
  if (files.length > 0) {
    await writeChunk(files, chunkIndex, outputDir);
  }
  return outputDir;
}
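// Programmatic use would look roughly like:
//   const outDir = await serializeRepo({ chunkSizeMB: 10, basePath: 'src' });
// The CLI defined below simply maps its flags onto these options.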
// Configure command-line options
const argv = yargs(hideBin(process.argv))
  .option('size', {
    alias: 's',
    type: 'number',
    description: 'Chunk size in megabytes',
    default: Infinity,
  })
  .option('path', {
    alias: 'p',
    type: 'string',
    description: 'Base path to serialize (optional)',
  })
  .example('pnpm serialize-repo', 'Serialize entire repository into a single file')
  .example('pnpm serialize-repo -s 10', 'Split repository into 10MB chunks')
  .example('pnpm serialize-repo -p src/app', 'Serialize only the src/app directory')
  .example('pnpm serialize-repo -s 5 -p src/components', 'Split src/components into 5MB chunks')
  .check(async (argv) => {
    if (isNaN(argv.size) || argv.size <= 0) {
      throw new Error('Please provide a valid chunk size in megabytes');
    }
    if (
      argv.path &&
      !(await fs
        .access(argv.path)
        .then(() => true)
        .catch(() => false))
    ) {
      throw new Error('Provided path does not exist');
    }
    return true;
  })
  .help().argv;
/**
 * Main entry point. Parses command-line options, then serializes the repository.
 */
async function main() {
  const { size, path: basePath } = await argv;
  logger.info(
    `Serializing repo from ${basePath || 'root'}${
      size !== Infinity ? ` with chunk size ${size}MB` : ''
    }`
  );
  const outputDir = await serializeRepo({ chunkSizeMB: size, basePath });
  logger.info(`✨ Repository serialized successfully!`);
  if (size !== Infinity) {
    const files = await fs.readdir(outputDir);
    logger.info(`Generated chunks:`);
    for (const file of files) {
      logger.info(path.join(outputDir, file));
    }
  } else {
    logger.info(`Output file:`);
    logger.info(path.join(outputDir, 'chunk-0.txt'));
  }
}
// Execute main
void main();