Skip to content

Instantly share code, notes, and snippets.

@nickfargo
Created December 28, 2018 02:40
Show Gist options
  • Save nickfargo/ae14c5a381722b9b1b0529913835e3b8 to your computer and use it in GitHub Desktop.
Save nickfargo/ae14c5a381722b9b1b0529913835e3b8 to your computer and use it in GitHub Desktop.
type SubseqAsyncResult = {
sequence: string;
matches: Array<{
string: string,
score: number,
positions: number[]
}>;
};
type SubseqStringTableEntry = {
index: number,
score: number,
positions: number[]
};
type SubseqStringTable = {
[key: string]: null | SubseqStringTableEntry;
};
// Table of char codes 0-255, where codes of all alphabetic characters
// are mapped to those of their corresponding lowercase characters with
// diacritics removed. E.g.: {`A`, `a`, `À`, ...} → `a`
// Special case: AE ligature {`Æ`, `æ`} → `e`
const charCodesGeneralized: number[] = (() => {
const a = [];
for (let i = 0; i < 256; i++) a.push(i);
for (let i = 65; i < 91; i++) a[i] = i + 32;
for (let i = 224; i < 230; i++) a[i] = 97;
a[230] = 101;
a[231] = 99;
for (let i = 232; i < 236; i++) a[i] = 101;
for (let i = 236; i < 240; i++) a[i] = 105;
a[241] = 110;
for (let i = 242; i < 247; i++) a[i] = 111;
a[248] = 111;
for (let i = 249; i < 253; i++) a[i] = 117;
a[253] = 121;
a[255] = 121;
for (let i = 192; i < 215; i++) a[i] = a[i + 32];
for (let i = 216; i < 224; i++) a[i] = a[i + 32];
return a;
})();
// Returns the sum of positions within `str` of each char in `sub` sequence.
// Ranges from `0` if `str.startsWith(sub)` to `a*b - b*(b+1)/2` if
// `str.endsWith(sub)`. Returns `alt` if `sub` is not a subsequence of `str`.
function subseq (str: string, sub: string, alt = -1): number {
const a = str.length;
const b = sub.length;
if (b < a) {
let score = 0;
iter_sub: for (let i = 0, j = 0; i < b; i++) {
for (const code = sub.charCodeAt(i); j < a; j++) {
if (code === charCodesGeneralized[str.charCodeAt(j)]) {
score += j - i;
continue iter_sub;
}
}
return alt;
}
return score;
}
else if (b > a) return alt;
else return sub === str ? 0 : alt;
}
// Given a set of `strings`, yield a subset, such that the concatenated sequence of
// strings received from `source` is a valid subsequence of each string in the subset.
async function* subseqAsync (strings: string[], source: Iterable<string>) {
// Table of data associated with each string in `strings`.
const stringTable: SubseqStringTable = {};
// Subset of `strings` still valid under the subsequence received so far from `source`.
// Initializes to an ordered set of all unique `strings`.
let validStrings: string[] = [];
strings.forEach(s => {
if (!(s in stringTable)) {
stringTable[s] = {index: 0, score: 0, positions: []};
validStrings.push(s);
}
});
// For sorting the set of valid strings prior to each `yield`.
const compareByScore = (a: string, b: string): number => {
const entryA = stringTable[a] as SubseqStringTableEntry;
const entryB = stringTable[b] as SubseqStringTableEntry;
return entryA.score - entryB.score;
}
let sequence = '';
for await (const part of source) {
if (part === '') continue;
const concat = sequence + part;
const survivors: string[] = [];
for (const string of validStrings) {
const entry = stringTable[string];
if (entry == null) throw Error;
let {index, score} = entry;
if (concat.length < string.length) {
iterating_part:
for (let i = 0; i < part.length; i++) {
for (const code = part.charCodeAt(i); index < string.length; index++) {
if (code === charCodesGeneralized[string.charCodeAt(index)]) {
entry.positions.push(index);
score += index - i - sequence.length;
continue iterating_part;
}
}
stringTable[string] = null;
break;
}
entry.index = index;
entry.score = score;
survivors.push(string);
} else if (concat.length > string.length) {
stringTable[string] = null;
} else if (string === concat) {
if (entry.score !== 0) throw Error;
survivors.push(string);
} else {
stringTable[string] = null;
}
}
sequence = concat;
yield survivors
.sort(compareByScore)
.reduce<SubseqAsyncResult>((result, string) => {
const entry = stringTable[string];
if (entry == null) throw Error;
const {score, positions} = entry;
result.matches.push({string, score, positions});
return result;
}, {sequence, matches: []});
validStrings = survivors;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment