Created
December 2, 2022 15:56
-
-
Save JakeCoxon/51f72be4eb278e1450c3aa2e5772492b to your computer and use it in GitHub Desktop.
Tokenize Python-like whitespace language
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Tokenizes source text of a Python-like, indentation-sensitive language.
 *
 * Every token records its text (`value`), its `type`, the zero-based
 * `lineNumber` it starts on and its character offset in `input`
 * (`tokenIndex`). Whitespace and comments are dropped. A NEWLINE token is kept
 * at the end of each logical line, but newlines inside `()`, `[]` or `{}` are
 * dropped so a bracketed expression can span several lines.
 *
 * Leading spaces are tracked with a stack of indentation widths: a deeper line
 * yields one INDENT token, a shallower line yields one OUTDENT token per level
 * it closes, and any levels still open at the end of the input are closed with
 * trailing OUTDENT tokens. Whitespace-only and comment-only lines never change
 * the indentation level.
 *
 * @param input Full source text to tokenize.
 * @returns The tokens in source order.
 * @throws Error when no token pattern matches at the current position; the
 *   message contains the offending line and a caret under the failing column.
 */
function tokenize(input: string) {
  type Token = { value: string; type: string; lineNumber: number; tokenIndex: number };

  // Patterns are tried in declaration order, so earlier entries take precedence.
  const regexes: Record<string, RegExp> = {
    // The trailing \b stops identifiers such as `format` or `index` from being
    // split into a keyword plus a remainder.
    KEYWORD:
      /^(?:and|assert|as|break|class|continue|def|elif|else|false|for|if|import|in|is|lambda|null|not|or|pass|return|try|while|with)\b/,
    IDENTIFIER: /^[a-zA-Z_][a-zA-Z_0-9]*/,
    LITERAL: /^(?:"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')/,
    SPECIALNUMBER: /^0[xXbB][0-9a-zA-Z_]+/,
    NUMBER: /^-?[0-9][0-9_]*(\.[0-9_]+)?/,
    // Runs to the end of the line; also matches an empty comment and a comment
    // on the last line of input that has no trailing newline.
    COMMENT: /^#.*/,
    OPENPAREN: /^[\[\{\(]/,
    CLOSEPAREN: /^[\]\}\)]/,
    PUNCTUATION: /^(?:==|!=|[:,=])/,
    NEWLINE: /^\n/,
    // Horizontal whitespace only, so a newline is always seen as NEWLINE.
    WHITESPACE: /^[^\S\n]+/,
  };
  // Leading spaces of a line; matched explicitly at the start of each line.
  const indentRegex = /^ +/;
  // A line holding nothing but whitespace and/or a comment.
  const blankLineRegex = /^[^\S\n]*(?:#|\n|$)/;

  const tokens: Token[] = [];
  // Widths of the currently open indentation levels; the base level is 0.
  const indentStack: number[] = [0];
  let lineNumber = 0;
  let lineStart = 0;
  let tokenIndex = 0;
  let parentheses = 0;
  // Text that has not been consumed yet.
  let rest = input;

  const currentIndent = (): number => indentStack[indentStack.length - 1] ?? 0;

  // Tries `regex` at the current position; on success consumes the match,
  // records a token of the given type and returns it.
  const exec = (regex: RegExp, type: string): Token | undefined => {
    const match = regex.exec(rest);
    if (!match) return undefined;
    const token: Token = { value: match[0], type, lineNumber, tokenIndex };
    rest = rest.substring(match[0].length);
    tokenIndex += match[0].length;
    tokens.push(token);
    return token;
  };

  // Each iteration of this loop starts at the beginning of a line.
  while (rest.length > 0) {
    if (exec(regexes.NEWLINE, "NEWLINE")) {
      lineNumber++;
      lineStart = tokenIndex;
      continue;
    }

    if (!blankLineRegex.test(rest)) {
      const width = indentRegex.exec(rest)?.[0].length ?? 0;

      // Close every level that is deeper than this line.
      let closed = 0;
      while (currentIndent() > width) {
        indentStack.pop();
        closed++;
      }
      const opens = width > currentIndent();
      if (opens) indentStack.push(width);

      // The leading-whitespace token itself becomes the INDENT, or the last
      // OUTDENT; any further closed levels get empty-valued OUTDENT tokens.
      const carrier = opens ? "INDENT" : closed > 0 && width > 0 ? "OUTDENT" : undefined;
      const synthetic = carrier === "OUTDENT" ? closed - 1 : closed;
      for (let i = 0; i < synthetic; i++) {
        tokens.push({ value: "", type: "OUTDENT", lineNumber, tokenIndex });
      }
      if (carrier) exec(indentRegex, carrier);
      // Unchanged indentation is consumed below as ordinary whitespace.
    }

    // Tokens after the indentation, or within a grouped expression
    while (rest.length > 0 && (parentheses > 0 || rest[0] !== "\n")) {
      let token: Token | undefined;
      for (const [type, regex] of Object.entries(regexes)) {
        token = exec(regex, type);
        if (token) break;
      }
      if (!token) {
        // The failing line may be the last one and have no trailing newline.
        const lineEnd = input.indexOf("\n", lineStart);
        const line = input.substring(lineStart, lineEnd === -1 ? input.length : lineEnd);
        const repeat = " ".repeat(tokenIndex - lineStart);
        const message = `Unable to tokenize line ${lineNumber} \n${line}\n${repeat}^-- here`;
        throw new Error(message);
      }
      switch (token.type) {
        case "WHITESPACE":
        case "COMMENT":
          tokens.pop();
          break;
        case "NEWLINE":
          // Only reachable inside brackets: the line break is not significant.
          lineNumber++;
          lineStart = tokenIndex;
          tokens.pop();
          break;
        case "OPENPAREN":
          parentheses++;
          break;
        case "CLOSEPAREN":
          // Clamp so a stray closer cannot cancel out a later opener.
          parentheses = Math.max(0, parentheses - 1);
          break;
      }
    }
  }

  // Close whatever indentation levels are still open at the end of the input.
  while (indentStack.length > 1) {
    indentStack.pop();
    tokens.push({ value: "", type: "OUTDENT", lineNumber, tokenIndex });
  }
  return tokens;
}
// Demo: tokenizes a small sample program and renders it into the `#app`
// element twice — once as the source text with each token highlighted by
// type, and once as the pretty-printed JSON token list.
(() => {
  // NOTE(review): the sample has no leading indentation as it stands;
  // presumably the nested lines were indented originally — confirm.
  const input = `
assert false, "unexp\\"ected thing"
if something == true:
do(something) # this is some cool
foo(
bar, baz, baw)
foo(1,2, 0xff)
`;
  const out = tokenize(input);

  const htmlEntities: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    "'": "&#39;",
    '"': "&quot;",
  };
  // Makes arbitrary text safe to embed in HTML markup.
  const escapeHtml = (text: string): string =>
    text.replace(/[&<>'"]/g, (ch) => htmlEntities[ch] ?? ch);

  const colors: Record<string, string> = {
    KEYWORD: "#ff6767",
    IDENTIFIER: "pink",
    LITERAL: "#8b8bff",
    SPECIALNUMBER: "#b0ffb0",
    NUMBER: "#b0ffb0",
    COMMENT: "lightgreen",
    OPENPAREN: "#ff6eff",
    CLOSEPAREN: "#ff6eff",
    PUNCTUATION: "#ff91ff",
    NEWLINE: "white",
    WHITESPACE: "lightgrey",
    INDENT: "lightgrey",
    OUTDENT: "lightgrey",
  };

  const html = (() => {
    let markup = "";
    let last = 0;
    for (const token of out) {
      // Tokens without a source position or without text (such as the OUTDENT
      // tokens appended at the end of the input) have nothing to highlight.
      if (typeof token.tokenIndex !== "number" || token.value === "") continue;
      // Text skipped by the tokenizer (whitespace, comments) is shown as-is.
      markup += escapeHtml(input.substring(last, token.tokenIndex));
      last = token.tokenIndex + token.value.length;
      const color = colors[token.type] ?? "transparent";
      markup += `<span style="background-color: ${color}">${escapeHtml(token.value)}</span>`;
    }
    // Keep whatever follows the last token.
    markup += escapeHtml(input.substring(last));
    return markup;
  })();

  const str = escapeHtml(JSON.stringify(out, null, 2));

  const app = document.querySelector("#app");
  if (!app) throw new Error('Demo target element "#app" not found');
  app.innerHTML = `<pre>${html}</pre><br><pre>${str}</pre>`;
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment