Last active
February 6, 2025 15:40
-
-
Save trvswgnr/48e46dd6a60de1e9d8e8571d914ef2b0 to your computer and use it in GitHub Desktop.
check if a file contains a specified phrase, without loading the entire file into memory
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { describe, it, expect, beforeAll, afterAll } from "bun:test"; | |
import { fileContains, BoyerMooreSearcher } from "./fileContains"; | |
import fs from "fs"; | |
import path from "path"; | |
describe("fileContains", () => { | |
const testDir = path.join(__dirname, "test-files"); | |
beforeAll(() => { | |
if (!fs.existsSync(testDir)) { | |
fs.mkdirSync(testDir); | |
} | |
}); | |
afterAll(() => { | |
fs.rmSync(testDir, { recursive: true, force: true }); | |
}); | |
it("should find a phrase in a small file", async () => { | |
const filePath = path.join(testDir, "small.txt"); | |
fs.writeFileSync(filePath, "Hello, world!"); | |
expect(await fileContains(filePath, "world")).toBe(true); | |
expect(await fileContains(filePath, "universe")).toBe(false); | |
}); | |
it("should find a phrase spanning multiple chunks", async () => { | |
const filePath = path.join(testDir, "multi-chunk.txt"); | |
const content = "a".repeat(65536) + "target" + "b".repeat(65536); | |
fs.writeFileSync(filePath, content); | |
expect(await fileContains(filePath, "target")).toBe(true); | |
}); | |
it("should handle multi-byte characters", async () => { | |
const filePath = path.join(testDir, "multi-byte.txt"); | |
fs.writeFileSync(filePath, "你好,世界!"); | |
expect(await fileContains(filePath, "世界")).toBe(true); | |
expect(await fileContains(filePath, "宇宙")).toBe(false); | |
}); | |
it("should handle large files efficiently", async () => { | |
const filePath = path.join(testDir, "large.txt"); | |
const content = "a".repeat(10 * 1024 * 1024) + "needle" + "b".repeat(10 * 1024 * 1024); | |
fs.writeFileSync(filePath, content); | |
const start = Date.now(); | |
const result = await fileContains(filePath, "needle"); | |
const end = Date.now(); | |
expect(result).toBe(true); | |
expect(end - start).toBeLessThan(1000); // Should complete in less than 1 second | |
}); | |
}); | |
describe("StreamSearcher", () => { | |
it("should find a phrase in a single chunk", () => { | |
const searcher = new BoyerMooreSearcher("world"); | |
expect(searcher.search("Hello, world!")).toBe(true); | |
expect(searcher.search("Hello, universe!")).toBe(false); | |
}); | |
it("should find a phrase spanning multiple chunks", () => { | |
const searcher = new BoyerMooreSearcher("target"); | |
expect(searcher.search("Some text before the tar")).toBe(false); | |
expect(searcher.search("get phrase")).toBe(true); | |
}); | |
it("should handle multi-byte characters", () => { | |
const searcher = new BoyerMooreSearcher("世界"); | |
expect(searcher.search("你好,")).toBe(false); | |
expect(searcher.search("世界!")).toBe(true); | |
}); | |
}); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { createReadStream } from "fs"; | |
/** | |
* the default chunk size to read from the file, in bytes | |
*/ | |
const DEFAULT_CHUNK_SIZE = 16 * 1024; | |
/** | |
* searches for a specific phrase in a file using the Boyer-Moore algorithm | |
* | |
* reads the file in chunks and uses a stream-based approach to efficiently | |
* search for the given phrase without loading the entire file into memory. it | |
* utilizes the Boyer-Moore string searching algorithm for improved performance | |
* | |
* @param filePath - the path to the file to be searched | |
* @param phrase - the phrase to search for within the file | |
* @param chunkSize - the size of each chunk to read from the file, in bytes | |
* defaults to 16KB (16 * 1024 bytes) | |
* | |
* @returns a Promise that resolves to `true` if the phrase is found in the | |
* file, otherwise `false` | |
* | |
* @throws rejects with an error if there's an issue reading the file | |
* | |
* @example | |
* const found = await searchFileForPhrase('/path/to/file.txt', 'search phrase'); | |
* if (found) { | |
* console.log('phrase found in file'); | |
* } else { | |
* console.log('not found'); | |
* } | |
*/ | |
export async function fileContains( | |
filePath: string, | |
phrase: string, | |
chunkSize = DEFAULT_CHUNK_SIZE, | |
): Promise<boolean> { | |
return new Promise((resolve, reject) => { | |
const searcher = new BoyerMooreSearcher(phrase); | |
const stream = createReadStream(filePath, { highWaterMark: chunkSize }); | |
stream.on("data", (chunk: Buffer) => { | |
if (searcher.search(chunk.toString("utf8"))) { | |
stream.destroy(); | |
resolve(true); | |
} | |
}); | |
stream.on("end", () => resolve(false)); | |
stream.on("error", reject); | |
}); | |
} | |
/** | |
* implements the Boyer-Moore string searching algorithm | |
* | |
* designed to efficiently search for a pattern within a stream of characters | |
*/ | |
export class BoyerMooreSearcher { | |
private badCharMap: BadCharMap; | |
private pattern: string; | |
private buffer: string; | |
private patternLength: number; | |
/** | |
* initializes a new instance of the BooyerMooreSearcher | |
* | |
* @param pattern - the pattern to search for | |
* @constructor | |
*/ | |
constructor(pattern: string) { | |
this.pattern = pattern; | |
this.badCharMap = new BadCharMap(pattern); | |
this.buffer = ""; | |
this.patternLength = [...pattern].length; // handle multi-byte characters correctly | |
} | |
/** | |
* searches for the pattern in the given chunk. | |
* | |
* designed to be called repeatedly with each chunk in the stream | |
* | |
* @param chunk - the next chunk in the stream to search. | |
* @returns `true` if the pattern is found, otherwise `false` | |
* @note modifies the internal buffer and returns a boolean | |
*/ | |
search(chunk: string): boolean { | |
this.buffer += chunk; | |
const r = this.findPattern(); | |
// reset the buffer if a match is found or keep only the last | |
// `(patternLength - 1)` characters | |
this.buffer = r ? "" : this.buffer.slice(-this.patternLength + 1); | |
return r; | |
} | |
/** | |
* implements the core Boyer-Moore algorithm to find the pattern in the | |
* current buffer | |
* | |
* @returns `true` if the pattern is found in the current buffer, otherwise | |
* `false` | |
* @private | |
*/ | |
private findPattern(): boolean { | |
const len = this.buffer.length; | |
let i = this.patternLength - 1; | |
while (i < len) { | |
let j = this.patternLength - 1; | |
// compare characters from right to left | |
while (j >= 0 && this.buffer[i - (this.patternLength - j - 1)] === this.pattern[j]) { | |
j--; | |
} | |
if (j < 0) { | |
return true; // pattern found | |
} | |
// use the bad character heuristic to skip characters | |
const skip = this.badCharMap.get(this.buffer[i]!) ?? this.patternLength; | |
i += Math.max(1, skip); | |
} | |
return false; // pattern not found in this chunk | |
} | |
} | |
/** | |
* implements a Bad Character Map for the Boyer-Moore string searching algorithm | |
* | |
* used to preprocess the search pattern and create a lookup table for the bad | |
* character heuristic. it allows for efficient skipping of characters during | |
* the search process, improving the overall performance of the algorithm | |
* | |
* the map uses hashed character codes as keys and their rightmost positions in | |
* the pattern as values, allowing for constant-time lookups | |
*/ | |
export class BadCharMap extends Map<number, number> { | |
/** | |
* constructs a new BadCharMap for the given pattern | |
* | |
* @param pattern - the search pattern to create the map for | |
* @constructor | |
*/ | |
constructor(pattern: string) { | |
super(); | |
const chars = [...pattern]; // handle Unicode characters correctly | |
const len = chars.length; | |
let hash: number; | |
// iterate through the pattern from left to right | |
for (let i = 0; i < len; i++) { | |
// hash each character and store its rightmost position | |
hash = hashString(chars.at(i)!); | |
// the value is the distance from the rightmost occurrence to the | |
// end of the pattern | |
this.set(hash, len - i - 1); | |
} | |
} | |
/** | |
* retrieves the skip value for a given character | |
* | |
* overrides the default {@link Map.prototype.get} method to allow lookups | |
* using either a string (which will be hashed) or a pre-computed hash value | |
* | |
* @param i - the character (as a string) or its hash value (as a number) to | |
* look up | |
* @returns the skip value for the character, or `undefined` if not found in | |
* the map | |
*/ | |
get(i: string | number): number | undefined { | |
if (typeof i === "string") { | |
return super.get(hashString(i)); | |
} | |
return super.get(i); | |
} | |
} | |
/** | |
* generates a hash code for a string using the djb2 algorithm | |
* | |
* @param s - the input string to hash | |
* @returns a 32-bit integer hash code | |
*/ | |
export function hashString(s: string): number { | |
let hash = 5381; | |
const len = s.length; | |
for (let i = 0; i < len; i++) { | |
// bitwise left shift by 5 is equivalent to multiplying by 32 | |
hash = (hash << 5) + hash + s.charCodeAt(i); | |
// convert to 32-bit integer | |
hash |= 0; | |
} | |
return hash >>> 0; // make sure it's an unsigned 32-bit integer | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment