Last active
November 7, 2024 21:06
-
-
Save notriddle/aa8edfe3c1b0a4f5e777d6db179464a2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// need to escape specials | |
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions | |
const REGEX_SPECIALS = /[\$\(\)\*\+\.\/\?\[\\\]\^\{\|\}]/; | |
class EditDistanceCalculator { | |
constructor(string, limit) { | |
this.string = string; | |
this.current = []; | |
this.prev = []; | |
this.prevPrev = []; | |
this.matcherRegExp = new RegExp("^" + this.buildMatcherRegExp(string, limit) + "$"); | |
} | |
buildMatcherRegExp(string, limit) { | |
let prev = []; | |
let current = []; | |
const l = string.length; | |
const esc = []; | |
for (let i = 0; i < l; i += 1) { | |
esc[i] = string[i].replace(REGEX_SPECIALS, x => `\\${x}`); | |
prev[i] = string.substring(i).replace(REGEX_SPECIALS, x => `\\${x}`); | |
} | |
prev[l] = ""; | |
for (let dist = 1; dist <= limit; dist += 1) { | |
for (let i = 0; i <= l; i += 1) { | |
if (dist <= 2 && l - i === 1) { | |
// Fast path were we already know the correct regex | |
const c = esc[i]; | |
current[i] = dist === 2 ? | |
("(?:(:?" + c + ".{0,2})|(?:." + c + ".?)|(?:.{0,2}" + c + "?))") : | |
("(?:(:?" + c + ".)|(?:." + c + ")|.?)"); | |
continue; | |
} | |
let result = "("; | |
let close = ")"; | |
for (let j = i; j < l; j += 1) { | |
// substitution | deletion | |
result += "(?:.?" + prev[j + 1] + ")|"; | |
// insertion | |
result += "(?:." + prev[j] + ")|"; | |
// transposition | |
if (j + 1 < l && string[j + 1] !== string[j]) { | |
result += "(?:" + esc[j + 1] + esc[j] + prev[j + 2] + ")|"; | |
} | |
// verbatim | |
result += "(?:" + esc[j] + "(?:"; | |
close += "))"; | |
} | |
// insertion after verbatim | |
if (dist > 1) { | |
result += ".{0," + dist + "}"; | |
} else { | |
result += ".?"; | |
} | |
result += close; | |
current[i] = result; | |
} | |
const tmp = prev; | |
prev = current; | |
current = prev; | |
} | |
return prev[0]; | |
} | |
/* | |
* This function was translated, mostly line-for-line, from | |
* https://github.com/rust-lang/rust/blob/ff4b772f805ec1e/compiler/rustc_span/src/edit_distance.rs | |
* | |
* The current implementation is the restricted Damerau-Levenshtein algorithm. It is restricted | |
* because it does not permit modifying characters that have already been transposed. The specific | |
* algorithm should not matter to the caller of the methods, which is why it is not noted in the | |
* documentation. | |
*/ | |
calculate(b, limit) { | |
let a = this.string; | |
// Ensure that `b` is the shorter string, minimizing memory use. | |
if (a.length < b.length) { | |
const aTmp = a; | |
a = b; | |
b = aTmp; | |
} | |
const minDist = a.length - b.length; | |
// If we know the limit will be exceeded, we can return early. | |
if (minDist > limit) { | |
return limit + 1; | |
} | |
// Strip common prefix. | |
// We know that `b` is the shorter string, so we don't need to check | |
// `a.length`. | |
while (b.length > 0 && b[0] === a[0]) { | |
a = a.substring(1); | |
b = b.substring(1); | |
} | |
// Strip common suffix. | |
while (b.length > 0 && b[b.length - 1] === a[a.length - 1]) { | |
a = a.substring(0, a.length - 1); | |
b = b.substring(0, b.length - 1); | |
} | |
// If either string is empty, the distance is the length of the other. | |
// We know that `b` is the shorter string, so we don't need to check `a`. | |
if (b.length === 0) { | |
return minDist; | |
} | |
const aLength = a.length; | |
const bLength = b.length; | |
for (let i = 0; i <= bLength; ++i) { | |
this.current[i] = 0; | |
this.prev[i] = i; | |
this.prevPrev[i] = Number.MAX_VALUE; | |
} | |
// row by row | |
for (let i = 1; i <= aLength; ++i) { | |
this.current[0] = i; | |
const aIdx = i - 1; | |
// column by column | |
for (let j = 1; j <= bLength; ++j) { | |
const bIdx = j - 1; | |
// There is no cost to substitute a character with itself. | |
const substitutionCost = a[aIdx] === b[bIdx] ? 0 : 1; | |
this.current[j] = Math.min( | |
// deletion | |
this.prev[j] + 1, | |
// insertion | |
this.current[j - 1] + 1, | |
// substitution | |
this.prev[j - 1] + substitutionCost, | |
); | |
if ((i > 1) && (j > 1) && (a[aIdx] === b[bIdx - 1]) && (a[aIdx - 1] === b[bIdx])) { | |
// transposition | |
this.current[j] = Math.min( | |
this.current[j], | |
this.prevPrev[j - 2] + 1, | |
); | |
} | |
} | |
// Rotate the buffers, reusing the memory | |
const prevPrevTmp = this.prevPrev; | |
this.prevPrev = this.prev; | |
this.prev = this.current; | |
this.current = prevPrevTmp; | |
} | |
// `prev` because we already rotated the buffers. | |
const distance = this.prev[bLength]; | |
return distance <= limit ? distance : (limit + 1); | |
} | |
} | |
function buildString(seed) { | |
let string = ""; | |
let alphabet = "ABCD"; | |
let seed_builder = seed; | |
while (seed_builder > 0) { | |
string += alphabet[seed_builder & 3]; | |
seed_builder = seed_builder >> 2; | |
} | |
return string; | |
} | |
for (let limit = 0; limit < 4; limit += 1) { | |
console.log(limit); | |
for (let seed = 0; seed < 1024; seed += 1) { | |
let string1 = buildString(seed); | |
for (let seed2 = 0; seed2 < 1024; seed2 += 1) { | |
let string2 = buildString(seed2); | |
let calculator = new EditDistanceCalculator(string1, limit); | |
let matcherPasses = calculator.matcherRegExp.test(string2); | |
let calculatePasses = calculator.calculate(string2, limit) < limit + 1; | |
if (matcherPasses !== calculatePasses) { | |
console.log(calculator.matcherRegExp); | |
throw new Error("failed string1=" + string1 + " string2=" + string2 + " limit=" + limit + " matcherPasses=" + matcherPasses + " calculatePasses=" + calculatePasses); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment