Created
December 28, 2018 02:40
-
-
Save nickfargo/ae14c5a381722b9b1b0529913835e3b8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
type SubseqAsyncResult = { | |
sequence: string; | |
matches: Array<{ | |
string: string, | |
score: number, | |
positions: number[] | |
}>; | |
}; | |
type SubseqStringTableEntry = { | |
index: number, | |
score: number, | |
positions: number[] | |
}; | |
type SubseqStringTable = { | |
[key: string]: null | SubseqStringTableEntry; | |
}; | |
// Table of char codes 0-255, where codes of all alphabetic characters | |
// are mapped to those of their corresponding lowercase characters with | |
// diacritics removed. E.g.: {`A`, `a`, `À`, ...} → `a` | |
// Special case: AE ligature {`Æ`, `æ`} → `e` | |
const charCodesGeneralized: number[] = (() => { | |
const a = []; | |
for (let i = 0; i < 256; i++) a.push(i); | |
for (let i = 65; i < 91; i++) a[i] = i + 32; | |
for (let i = 224; i < 230; i++) a[i] = 97; | |
a[230] = 101; | |
a[231] = 99; | |
for (let i = 232; i < 236; i++) a[i] = 101; | |
for (let i = 236; i < 240; i++) a[i] = 105; | |
a[241] = 110; | |
for (let i = 242; i < 247; i++) a[i] = 111; | |
a[248] = 111; | |
for (let i = 249; i < 253; i++) a[i] = 117; | |
a[253] = 121; | |
a[255] = 121; | |
for (let i = 192; i < 215; i++) a[i] = a[i + 32]; | |
for (let i = 216; i < 224; i++) a[i] = a[i + 32]; | |
return a; | |
})(); | |
// Returns the sum of positions within `str` of each char in `sub` sequence. | |
// Ranges from `0` if `str.startsWith(sub)` to `a*b - b*(b+1)/2` if | |
// `str.endsWith(sub)`. Returns `alt` if `sub` is not a subsequence of `str`. | |
function subseq (str: string, sub: string, alt = -1): number { | |
const a = str.length; | |
const b = sub.length; | |
if (b < a) { | |
let score = 0; | |
iter_sub: for (let i = 0, j = 0; i < b; i++) { | |
for (const code = sub.charCodeAt(i); j < a; j++) { | |
if (code === charCodesGeneralized[str.charCodeAt(j)]) { | |
score += j - i; | |
continue iter_sub; | |
} | |
} | |
return alt; | |
} | |
return score; | |
} | |
else if (b > a) return alt; | |
else return sub === str ? 0 : alt; | |
} | |
// Given a set of `strings`, yield a subset, such that the concatenated sequence of | |
// strings received from `source` is a valid subsequence of each string in the subset. | |
async function* subseqAsync (strings: string[], source: Iterable<string>) { | |
// Table of data associated with each string in `strings`. | |
const stringTable: SubseqStringTable = {}; | |
// Subset of `strings` still valid under the subsequence received so far from `source`. | |
// Initializes to an ordered set of all unique `strings`. | |
let validStrings: string[] = []; | |
strings.forEach(s => { | |
if (!(s in stringTable)) { | |
stringTable[s] = {index: 0, score: 0, positions: []}; | |
validStrings.push(s); | |
} | |
}); | |
// For sorting the set of valid strings prior to each `yield`. | |
const compareByScore = (a: string, b: string): number => { | |
const entryA = stringTable[a] as SubseqStringTableEntry; | |
const entryB = stringTable[b] as SubseqStringTableEntry; | |
return entryA.score - entryB.score; | |
} | |
let sequence = ''; | |
for await (const part of source) { | |
if (part === '') continue; | |
const concat = sequence + part; | |
const survivors: string[] = []; | |
for (const string of validStrings) { | |
const entry = stringTable[string]; | |
if (entry == null) throw Error; | |
let {index, score} = entry; | |
if (concat.length < string.length) { | |
iterating_part: | |
for (let i = 0; i < part.length; i++) { | |
for (const code = part.charCodeAt(i); index < string.length; index++) { | |
if (code === charCodesGeneralized[string.charCodeAt(index)]) { | |
entry.positions.push(index); | |
score += index - i - sequence.length; | |
continue iterating_part; | |
} | |
} | |
stringTable[string] = null; | |
break; | |
} | |
entry.index = index; | |
entry.score = score; | |
survivors.push(string); | |
} else if (concat.length > string.length) { | |
stringTable[string] = null; | |
} else if (string === concat) { | |
if (entry.score !== 0) throw Error; | |
survivors.push(string); | |
} else { | |
stringTable[string] = null; | |
} | |
} | |
sequence = concat; | |
yield survivors | |
.sort(compareByScore) | |
.reduce<SubseqAsyncResult>((result, string) => { | |
const entry = stringTable[string]; | |
if (entry == null) throw Error; | |
const {score, positions} = entry; | |
result.matches.push({string, score, positions}); | |
return result; | |
}, {sequence, matches: []}); | |
validStrings = survivors; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment