Last active
February 10, 2023 21:54
-
-
Save samkcarlile/4d5c88716f7fd425d2d1d8f735844ad4 to your computer and use it in GitHub Desktop.
TypeScript Tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** A token definition that also converts the matched text into a value. */
type TokenSpec<T = any> = { match: RegExp; value: (s: string) => T };

/** Maps each token type name to a bare pattern or to a pattern plus converter. */
type TokenConfig = Record<string, RegExp | TokenSpec>;

/**
 * Union of the token objects a given config can produce: `value` is the
 * converter's return type for `TokenSpec` entries and `string` for bare
 * `RegExp` entries.
 */
type TokenTypes<T extends TokenConfig> = {
  [Type in keyof T]: T[Type] extends TokenSpec
    ? { type: Type; value: ReturnType<T[Type]['value']> }
    : { type: Type; value: string };
}[keyof T];

/**
 * Splits `str` into tokens by combining every token pattern into a single
 * alternation of named capture groups (one group per token type) and scanning
 * the input left to right.
 *
 * Notes:
 * - Token type names become regex group names, so they must be valid
 *   identifiers; otherwise the `RegExp` constructor throws.
 * - Only the `source` of each pattern is used. Flags on the individual
 *   patterns are dropped; the combined regex always runs with `gm`.
 * - When several patterns could match at the same position, the one declared
 *   first in `tokens` wins.
 * - In non-strict mode, text between tokens is emitted as
 *   `{ type: undefined, value: <text> }`. The declared return type does not
 *   model these entries.
 *
 * @param str The input string
 * @param tokens An object of token types and their regex
 * @param strict If true, throws an error on unmatched text. By default unmatched text has a token type of `undefined`
 * @returns The tokens in input order.
 * @throws Error if `strict` is set and any text (including text after the last
 *   token) is not matched, or if a token pattern matches an empty string.
 */
export function tokenize<T extends Record<string, RegExp | TokenSpec>>(
  str: string,
  tokens: T,
  strict?: boolean
): TokenTypes<T>[] {
  const entries = Object.entries(tokens);
  const results: { type: string | undefined; value: unknown }[] = [];

  // Records (or, in strict mode, rejects) a run of text no token matched.
  const pushUnmatched = (text: string, position: number): void => {
    if (!text) return;
    if (strict)
      throw new Error(`unknown token at position ${position}: "${text}"`);
    results.push({ type: undefined, value: text });
  };

  // An empty alternation would match the empty string everywhere and expose
  // no named groups, so treat "no token types" as "everything is unmatched".
  if (entries.length === 0) {
    pushUnmatched(str, 0);
    return results as unknown as TokenTypes<T>[];
  }

  const names = entries.map(([name]) => name);
  const parser = new RegExp(
    entries
      .map(([name, def]) => {
        const pattern = def instanceof RegExp ? def : def.match;
        return `(?<${name}>${pattern.source})`;
      })
      .join('|'),
    'gm'
  );

  // A Map only ever returns converters that were explicitly configured; a
  // plain-object lookup would also find inherited members such as `toString`.
  const transformers = new Map<string, (s: string) => unknown>();
  for (const [name, def] of entries) {
    if (!(def instanceof RegExp)) transformers.set(name, def.value);
  }

  let lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = parser.exec(str)) !== null) {
    const text = match[0];
    const groups = match.groups ?? {};
    // Exactly one top-level alternative participates in a match; nested groups
    // inside user patterns are ignored by only looking at configured names.
    const type = names.find((name) => groups[name] !== undefined);

    // A zero-length match does not advance `lastIndex`, so the scan would never
    // make progress. Fail loudly instead of looping or emitting empty tokens.
    if (text === '' || type === undefined) {
      throw new Error(
        `token type "${String(type)}" matched an empty string at position ${match.index}; ` +
          'token patterns must consume at least one character'
      );
    }

    pushUnmatched(str.slice(lastIndex, match.index), lastIndex);

    const transform = transformers.get(type);
    results.push({ type, value: transform ? transform(text) : text });
    lastIndex = parser.lastIndex;
  }

  // Text after the final token is unmatched too and must obey `strict`.
  pushUnmatched(str.slice(lastIndex), lastIndex);

  // The entries are built dynamically from regex groups, so the precise mapped
  // type cannot be proven to the compiler.
  return results as unknown as TokenTypes<T>[];
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment