Skip to content

Instantly share code, notes, and snippets.

@notriddle
Last active November 7, 2024 21:06
Show Gist options
  • Save notriddle/aa8edfe3c1b0a4f5e777d6db179464a2 to your computer and use it in GitHub Desktop.
Save notriddle/aa8edfe3c1b0a4f5e777d6db179464a2 to your computer and use it in GitHub Desktop.
// need to escape specials
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions
const REGEX_SPECIALS = /[\$\(\)\*\+\.\/\?\[\\\]\^\{\|\}]/;
class EditDistanceCalculator {
constructor(string, limit) {
this.string = string;
this.current = [];
this.prev = [];
this.prevPrev = [];
this.matcherRegExp = new RegExp("^" + this.buildMatcherRegExp(string, limit) + "$");
}
buildMatcherRegExp(string, limit) {
let prev = [];
let current = [];
const l = string.length;
const esc = [];
for (let i = 0; i < l; i += 1) {
esc[i] = string[i].replace(REGEX_SPECIALS, x => `\\${x}`);
prev[i] = string.substring(i).replace(REGEX_SPECIALS, x => `\\${x}`);
}
prev[l] = "";
for (let dist = 1; dist <= limit; dist += 1) {
for (let i = 0; i <= l; i += 1) {
if (dist <= 2 && l - i === 1) {
// Fast path were we already know the correct regex
const c = esc[i];
current[i] = dist === 2 ?
("(?:(:?" + c + ".{0,2})|(?:." + c + ".?)|(?:.{0,2}" + c + "?))") :
("(?:(:?" + c + ".)|(?:." + c + ")|.?)");
continue;
}
let result = "(";
let close = ")";
for (let j = i; j < l; j += 1) {
// substitution | deletion
result += "(?:.?" + prev[j + 1] + ")|";
// insertion
result += "(?:." + prev[j] + ")|";
// transposition
if (j + 1 < l && string[j + 1] !== string[j]) {
result += "(?:" + esc[j + 1] + esc[j] + prev[j + 2] + ")|";
}
// verbatim
result += "(?:" + esc[j] + "(?:";
close += "))";
}
// insertion after verbatim
if (dist > 1) {
result += ".{0," + dist + "}";
} else {
result += ".?";
}
result += close;
current[i] = result;
}
const tmp = prev;
prev = current;
current = prev;
}
return prev[0];
}
/*
* This function was translated, mostly line-for-line, from
* https://github.com/rust-lang/rust/blob/ff4b772f805ec1e/compiler/rustc_span/src/edit_distance.rs
*
* The current implementation is the restricted Damerau-Levenshtein algorithm. It is restricted
* because it does not permit modifying characters that have already been transposed. The specific
* algorithm should not matter to the caller of the methods, which is why it is not noted in the
* documentation.
*/
calculate(b, limit) {
let a = this.string;
// Ensure that `b` is the shorter string, minimizing memory use.
if (a.length < b.length) {
const aTmp = a;
a = b;
b = aTmp;
}
const minDist = a.length - b.length;
// If we know the limit will be exceeded, we can return early.
if (minDist > limit) {
return limit + 1;
}
// Strip common prefix.
// We know that `b` is the shorter string, so we don't need to check
// `a.length`.
while (b.length > 0 && b[0] === a[0]) {
a = a.substring(1);
b = b.substring(1);
}
// Strip common suffix.
while (b.length > 0 && b[b.length - 1] === a[a.length - 1]) {
a = a.substring(0, a.length - 1);
b = b.substring(0, b.length - 1);
}
// If either string is empty, the distance is the length of the other.
// We know that `b` is the shorter string, so we don't need to check `a`.
if (b.length === 0) {
return minDist;
}
const aLength = a.length;
const bLength = b.length;
for (let i = 0; i <= bLength; ++i) {
this.current[i] = 0;
this.prev[i] = i;
this.prevPrev[i] = Number.MAX_VALUE;
}
// row by row
for (let i = 1; i <= aLength; ++i) {
this.current[0] = i;
const aIdx = i - 1;
// column by column
for (let j = 1; j <= bLength; ++j) {
const bIdx = j - 1;
// There is no cost to substitute a character with itself.
const substitutionCost = a[aIdx] === b[bIdx] ? 0 : 1;
this.current[j] = Math.min(
// deletion
this.prev[j] + 1,
// insertion
this.current[j - 1] + 1,
// substitution
this.prev[j - 1] + substitutionCost,
);
if ((i > 1) && (j > 1) && (a[aIdx] === b[bIdx - 1]) && (a[aIdx - 1] === b[bIdx])) {
// transposition
this.current[j] = Math.min(
this.current[j],
this.prevPrev[j - 2] + 1,
);
}
}
// Rotate the buffers, reusing the memory
const prevPrevTmp = this.prevPrev;
this.prevPrev = this.prev;
this.prev = this.current;
this.current = prevPrevTmp;
}
// `prev` because we already rotated the buffers.
const distance = this.prev[bLength];
return distance <= limit ? distance : (limit + 1);
}
}
function buildString(seed) {
let string = "";
let alphabet = "ABCD";
let seed_builder = seed;
while (seed_builder > 0) {
string += alphabet[seed_builder & 3];
seed_builder = seed_builder >> 2;
}
return string;
}
for (let limit = 0; limit < 4; limit += 1) {
console.log(limit);
for (let seed = 0; seed < 1024; seed += 1) {
let string1 = buildString(seed);
for (let seed2 = 0; seed2 < 1024; seed2 += 1) {
let string2 = buildString(seed2);
let calculator = new EditDistanceCalculator(string1, limit);
let matcherPasses = calculator.matcherRegExp.test(string2);
let calculatePasses = calculator.calculate(string2, limit) < limit + 1;
if (matcherPasses !== calculatePasses) {
console.log(calculator.matcherRegExp);
throw new Error("failed string1=" + string1 + " string2=" + string2 + " limit=" + limit + " matcherPasses=" + matcherPasses + " calculatePasses=" + calculatePasses);
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment