trvswgnr · February 6, 2025 15:40
diff --git a/fileContains.test.ts b/fileContains.test.ts
 import { describe, it, expect, beforeAll, afterAll } from "bun:test";
 import { fileContains, BoyerMooreSearcher } from "./fileContains";
 import fs from "fs";
 import path from "path";

 describe("fileContains", () => {
    const testDir = path.join(__dirname, "test-files");

    beforeAll(() => {
        if (!fs.existsSync(testDir)) {
            fs.mkdirSync(testDir);
        }
    });

    afterAll(() => {
        fs.rmSync(testDir, { recursive: true, force: true });
    });

    it("should find a phrase in a small file", async () => {
        const filePath = path.join(testDir, "small.txt");
        fs.writeFileSync(filePath, "Hello, world!");
        expect(await fileContains(filePath, "world")).toBe(true);
        expect(await fileContains(filePath, "universe")).toBe(false);
    });

    it("should find a phrase spanning multiple chunks", async () => {
        const filePath = path.join(testDir, "multi-chunk.txt");
        const content = "a".repeat(65536) + "target" + "b".repeat(65536);
        fs.writeFileSync(filePath, content);

        expect(await fileContains(filePath, "target")).toBe(true);
    });

    it("should handle multi-byte characters", async () => {
        const filePath = path.join(testDir, "multi-byte.txt");
        fs.writeFileSync(filePath, "你好，世界！");

        expect(await fileContains(filePath, "世界")).toBe(true);
        expect(await fileContains(filePath, "宇宙")).toBe(false);
    });

    it("should handle large files efficiently", async () => {
        const filePath = path.join(testDir, "large.txt");
        const content = "a".repeat(10 * 1024 * 1024) + "needle" + "b".repeat(10 * 1024 * 1024);
        fs.writeFileSync(filePath, content);

        const start = Date.now();
        const result = await fileContains(filePath, "needle");
        const end = Date.now();

        expect(result).toBe(true);
        expect(end - start).toBeLessThan(1000); // Should complete in less than 1 second
    });
 });

 describe("StreamSearcher", () => {
    it("should find a phrase in a single chunk", () => {
        const searcher = new BoyerMooreSearcher("world");
        expect(searcher.search("Hello, world!")).toBe(true);
        expect(searcher.search("Hello, universe!")).toBe(false);
    });

    it("should find a phrase spanning multiple chunks", () => {
        const searcher = new BoyerMooreSearcher("target");
        expect(searcher.search("Some text before the tar")).toBe(false);
        expect(searcher.search("get phrase")).toBe(true);
    });

    it("should handle multi-byte characters", () => {
        const searcher = new BoyerMooreSearcher("世界");
        expect(searcher.search("你好，")).toBe(false);
        expect(searcher.search("世界！")).toBe(true);
    });
 });
diff --git a/fileContains.ts b/fileContains.ts
 import { createReadStream } from "fs";

 /**
 * the default chunk size to read from the file, in bytes
 */
 const DEFAULT_CHUNK_SIZE = 16 * 1024;

 /**
 * searches for a specific phrase in a file using the Boyer-Moore algorithm
 *
 * reads the file in chunks and uses a stream-based approach to efficiently
 * search for the given phrase without loading the entire file into memory. it
 * utilizes the Boyer-Moore string searching algorithm for improved performance
 *
 * @param filePath - the path to the file to be searched
 * @param phrase - the phrase to search for within the file
 * @param chunkSize - the size of each chunk to read from the file, in bytes
 * defaults to 16KB (16 * 1024 bytes)
 *
 * @returns a Promise that resolves to `true` if the phrase is found in the
 * file, otherwise `false`
 *
 * @throws rejects with an error if there's an issue reading the file
 *
 * @example
 * const found = await searchFileForPhrase('/path/to/file.txt', 'search phrase');
 * if (found) {
 *   console.log('phrase found in file');
 * } else {
 *   console.log('not found');
 * }
 */
 export async function fileContains(
    filePath: string,
    phrase: string,
    chunkSize = DEFAULT_CHUNK_SIZE,
 ): Promise<boolean> {
    return new Promise((resolve, reject) => {
        const searcher = new BoyerMooreSearcher(phrase);
        const stream = createReadStream(filePath, { highWaterMark: chunkSize });

        stream.on("data", (chunk: Buffer) => {
            if (searcher.search(chunk.toString("utf8"))) {
                stream.destroy();
                resolve(true);
            }
        });

        stream.on("end", () => resolve(false));
        stream.on("error", reject);
    });
 }

 /**
 * implements the Boyer-Moore string searching algorithm
 *
 * designed to efficiently search for a pattern within a stream of characters
 */
 export class BoyerMooreSearcher {
    private badCharMap: BadCharMap;
    private pattern: string;
    private buffer: string;
    private patternLength: number;

    /**
     * initializes a new instance of the BooyerMooreSearcher
     *
     * @param pattern - the pattern to search for
     * @constructor
     */
    constructor(pattern: string) {
        this.pattern = pattern;
        this.badCharMap = new BadCharMap(pattern);
        this.buffer = "";
        this.patternLength = [...pattern].length; // handle multi-byte characters correctly
    }

    /**
     * searches for the pattern in the given chunk.
     *
     * designed to be called repeatedly with each chunk in the stream
     *
     * @param chunk - the next chunk in the stream to search.
     * @returns `true` if the pattern is found, otherwise `false`
     * @note modifies the internal buffer and returns a boolean
     */
    search(chunk: string): boolean {
        this.buffer += chunk;
        const r = this.findPattern();

        // reset the buffer if a match is found or keep only the last
        // `(patternLength - 1)` characters
        this.buffer = r ? "" : this.buffer.slice(-this.patternLength + 1);

        return r;
    }

    /**
     * implements the core Boyer-Moore algorithm to find the pattern in the
     * current buffer
     *
     * @returns `true` if the pattern is found in the current buffer, otherwise
     * `false`
     * @private
     */
    private findPattern(): boolean {
        const len = this.buffer.length;
        let i = this.patternLength - 1;

        while (i < len) {
            let j = this.patternLength - 1;

            // compare characters from right to left
            while (j >= 0 && this.buffer[i - (this.patternLength - j - 1)] === this.pattern[j]) {
                j--;
            }

            if (j < 0) {
                return true; // pattern found
            }

            // use the bad character heuristic to skip characters
            const skip = this.badCharMap.get(this.buffer[i]!) ?? this.patternLength;
            i += Math.max(1, skip);
        }

        return false; // pattern not found in this chunk
    }
 }

 /**
 * implements a Bad Character Map for the Boyer-Moore string searching algorithm
 *
 * used to preprocess the search pattern and create a lookup table for the bad
 * character heuristic. it allows for efficient skipping of characters during
 * the search process, improving the overall performance of the algorithm
 *
 * the map uses hashed character codes as keys and their rightmost positions in
 * the pattern as values, allowing for constant-time lookups
 */
 export class BadCharMap extends Map<number, number> {
    /**
     * constructs a new BadCharMap for the given pattern
     *
     * @param pattern - the search pattern to create the map for
     * @constructor
     */
    constructor(pattern: string) {
        super();
        const chars = [...pattern]; // handle Unicode characters correctly
        const len = chars.length;
        let hash: number;

        // iterate through the pattern from left to right
        for (let i = 0; i < len; i++) {
            // hash each character and store its rightmost position
            hash = hashString(chars.at(i)!);
            // the value is the distance from the rightmost occurrence to the
            // end of the pattern
            this.set(hash, len - i - 1);
        }
    }

    /**
     * retrieves the skip value for a given character
     *
     * overrides the default {@link Map.prototype.get} method to allow lookups
     * using either a string (which will be hashed) or a pre-computed hash value
     *
     * @param i - the character (as a string) or its hash value (as a number) to
     * look up
     * @returns the skip value for the character, or `undefined` if not found in
     * the map
     */
    get(i: string | number): number | undefined {
        if (typeof i === "string") {
            return super.get(hashString(i));
        }
        return super.get(i);
    }
 }

 /**
 * generates a hash code for a string using the djb2 algorithm
 *
 * @param s - the input string to hash
 * @returns a 32-bit integer hash code
 */
 export function hashString(s: string): number {
    let hash = 5381;
    const len = s.length;

    for (let i = 0; i < len; i++) {
        // bitwise left shift by 5 is equivalent to multiplying by 32
        hash = (hash << 5) + hash + s.charCodeAt(i);
        // convert to 32-bit integer
        hash |= 0;
    }

    return hash >>> 0; // make sure it's an unsigned 32-bit integer
 }
	import { describe, it, expect, beforeAll, afterAll } from "bun:test";
	import { fileContains, BoyerMooreSearcher } from "./fileContains";
	import fs from "fs";
	import path from "path";

	describe("fileContains", () => {
	const testDir = path.join(__dirname, "test-files");

	beforeAll(() => {
	if (!fs.existsSync(testDir)) {
	fs.mkdirSync(testDir);
	}
	});

	afterAll(() => {
	fs.rmSync(testDir, { recursive: true, force: true });
	});

	it("should find a phrase in a small file", async () => {
	const filePath = path.join(testDir, "small.txt");
	fs.writeFileSync(filePath, "Hello, world!");
	expect(await fileContains(filePath, "world")).toBe(true);
	expect(await fileContains(filePath, "universe")).toBe(false);
	});

	it("should find a phrase spanning multiple chunks", async () => {
	const filePath = path.join(testDir, "multi-chunk.txt");
	const content = "a".repeat(65536) + "target" + "b".repeat(65536);
	fs.writeFileSync(filePath, content);

	expect(await fileContains(filePath, "target")).toBe(true);
	});

	it("should handle multi-byte characters", async () => {
	const filePath = path.join(testDir, "multi-byte.txt");
	fs.writeFileSync(filePath, "你好，世界！");

	expect(await fileContains(filePath, "世界")).toBe(true);
	expect(await fileContains(filePath, "宇宙")).toBe(false);
	});

	it("should handle large files efficiently", async () => {
	const filePath = path.join(testDir, "large.txt");
	const content = "a".repeat(10 * 1024 * 1024) + "needle" + "b".repeat(10 * 1024 * 1024);
	fs.writeFileSync(filePath, content);

	const start = Date.now();
	const result = await fileContains(filePath, "needle");
	const end = Date.now();

	expect(result).toBe(true);
	expect(end - start).toBeLessThan(1000); // Should complete in less than 1 second
	});
	});

	describe("StreamSearcher", () => {
	it("should find a phrase in a single chunk", () => {
	const searcher = new BoyerMooreSearcher("world");
	expect(searcher.search("Hello, world!")).toBe(true);
	expect(searcher.search("Hello, universe!")).toBe(false);
	});

	it("should find a phrase spanning multiple chunks", () => {
	const searcher = new BoyerMooreSearcher("target");
	expect(searcher.search("Some text before the tar")).toBe(false);
	expect(searcher.search("get phrase")).toBe(true);
	});

	it("should handle multi-byte characters", () => {
	const searcher = new BoyerMooreSearcher("世界");
	expect(searcher.search("你好，")).toBe(false);
	expect(searcher.search("世界！")).toBe(true);
	});
	});
	import { createReadStream } from "fs";

	/**
	* the default chunk size to read from the file, in bytes
	*/
	const DEFAULT_CHUNK_SIZE = 16 * 1024;

	/**
	* searches for a specific phrase in a file using the Boyer-Moore algorithm
	*
	* reads the file in chunks and uses a stream-based approach to efficiently
	* search for the given phrase without loading the entire file into memory. it
	* utilizes the Boyer-Moore string searching algorithm for improved performance
	*
	* @param filePath - the path to the file to be searched
	* @param phrase - the phrase to search for within the file
	* @param chunkSize - the size of each chunk to read from the file, in bytes
	* defaults to 16KB (16 * 1024 bytes)
	*
	* @returns a Promise that resolves to `true` if the phrase is found in the
	* file, otherwise `false`
	*
	* @throws rejects with an error if there's an issue reading the file
	*
	* @example
	* const found = await searchFileForPhrase('/path/to/file.txt', 'search phrase');
	* if (found) {
	* console.log('phrase found in file');
	* } else {
	* console.log('not found');
	* }
	*/
	export async function fileContains(
	filePath: string,
	phrase: string,
	chunkSize = DEFAULT_CHUNK_SIZE,
	): Promise<boolean> {
	return new Promise((resolve, reject) => {
	const searcher = new BoyerMooreSearcher(phrase);
	const stream = createReadStream(filePath, { highWaterMark: chunkSize });

	stream.on("data", (chunk: Buffer) => {
	if (searcher.search(chunk.toString("utf8"))) {
	stream.destroy();
	resolve(true);
	}
	});

	stream.on("end", () => resolve(false));
	stream.on("error", reject);
	});
	}

	/**
	* implements the Boyer-Moore string searching algorithm
	*
	* designed to efficiently search for a pattern within a stream of characters
	*/
	export class BoyerMooreSearcher {
	private badCharMap: BadCharMap;
	private pattern: string;
	private buffer: string;
	private patternLength: number;

	/**
	* initializes a new instance of the BooyerMooreSearcher
	*
	* @param pattern - the pattern to search for
	* @constructor
	*/
	constructor(pattern: string) {
	this.pattern = pattern;
	this.badCharMap = new BadCharMap(pattern);
	this.buffer = "";
	this.patternLength = [...pattern].length; // handle multi-byte characters correctly
	}

	/**
	* searches for the pattern in the given chunk.
	*
	* designed to be called repeatedly with each chunk in the stream
	*
	* @param chunk - the next chunk in the stream to search.
	* @returns `true` if the pattern is found, otherwise `false`
	* @note modifies the internal buffer and returns a boolean
	*/
	search(chunk: string): boolean {
	this.buffer += chunk;
	const r = this.findPattern();

	// reset the buffer if a match is found or keep only the last
	// `(patternLength - 1)` characters
	this.buffer = r ? "" : this.buffer.slice(-this.patternLength + 1);

	return r;
	}

	/**
	* implements the core Boyer-Moore algorithm to find the pattern in the
	* current buffer
	*
	* @returns `true` if the pattern is found in the current buffer, otherwise
	* `false`
	* @private
	*/
	private findPattern(): boolean {
	const len = this.buffer.length;
	let i = this.patternLength - 1;

	while (i < len) {
	let j = this.patternLength - 1;

	// compare characters from right to left
	while (j >= 0 && this.buffer[i - (this.patternLength - j - 1)] === this.pattern[j]) {
	j--;
	}

	if (j < 0) {
	return true; // pattern found
	}

	// use the bad character heuristic to skip characters
	const skip = this.badCharMap.get(this.buffer[i]!) ?? this.patternLength;
	i += Math.max(1, skip);
	}

	return false; // pattern not found in this chunk
	}
	}

	/**
	* implements a Bad Character Map for the Boyer-Moore string searching algorithm
	*
	* used to preprocess the search pattern and create a lookup table for the bad
	* character heuristic. it allows for efficient skipping of characters during
	* the search process, improving the overall performance of the algorithm
	*
	* the map uses hashed character codes as keys and their rightmost positions in
	* the pattern as values, allowing for constant-time lookups
	*/
	export class BadCharMap extends Map<number, number> {
	/**
	* constructs a new BadCharMap for the given pattern
	*
	* @param pattern - the search pattern to create the map for
	* @constructor
	*/
	constructor(pattern: string) {
	super();
	const chars = [...pattern]; // handle Unicode characters correctly
	const len = chars.length;
	let hash: number;

	// iterate through the pattern from left to right
	for (let i = 0; i < len; i++) {
	// hash each character and store its rightmost position
	hash = hashString(chars.at(i)!);
	// the value is the distance from the rightmost occurrence to the
	// end of the pattern
	this.set(hash, len - i - 1);
	}
	}

	/**
	* retrieves the skip value for a given character
	*
	* overrides the default {@link Map.prototype.get} method to allow lookups
	* using either a string (which will be hashed) or a pre-computed hash value
	*
	* @param i - the character (as a string) or its hash value (as a number) to
	* look up
	* @returns the skip value for the character, or `undefined` if not found in
	* the map
	*/
	get(i: string \| number): number \| undefined {
	if (typeof i === "string") {
	return super.get(hashString(i));
	}
	return super.get(i);
	}
	}

	/**
	* generates a hash code for a string using the djb2 algorithm
	*
	* @param s - the input string to hash
	* @returns a 32-bit integer hash code
	*/
	export function hashString(s: string): number {
	let hash = 5381;
	const len = s.length;

	for (let i = 0; i < len; i++) {
	// bitwise left shift by 5 is equivalent to multiplying by 32
	hash = (hash << 5) + hash + s.charCodeAt(i);
	// convert to 32-bit integer
	hash \|= 0;
	}

	return hash >>> 0; // make sure it's an unsigned 32-bit integer
	}