Simple tokenizer for Bun - basically a nicer, more general-purpose version of the StringReader functionality I made years back
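In a nutshell: createTokenizer(input) returns a generator factory that takes a list of delimiters and an optional token matcher. Delimiters are yielded as tokens themselves, and every token carries its line/column/index position. A minimal usage sketch (my addition, assuming the createTokenizer.ts below sits in the same directory):

import { createTokenizer } from "./createTokenizer";

// Split on spaces and semicolons; the delimiters themselves are also yielded.
const tokenize = createTokenizer("let x = 1;");
for (const token of tokenize([" ", ";"])) {
  console.log(token.value, token.position);
}
// "let" { line: 1, column: 1, index: 0 }
// " "   { line: 1, column: 4, index: 3 }
// ...
// ";"   { line: 1, column: 10, index: 9 }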
// Test suite for createTokenizer (Bun's built-in test runner; run with: bun test)
import { describe, it, expect } from "bun:test";
import { createTokenizer } from "./createTokenizer";
import type { Token } from "./createTokenizer";

describe("createTokenizer", () => {
  const simpleInput = "let x = 1;";

  it("should return the input if no delimiters are provided", () => {
    const tokenizer = createTokenizer(simpleInput);
    const tokens = tokenizer([]);
    expect(Array.from(tokens)).toEqual([
      { value: simpleInput, position: { line: 1, column: 1, index: 0 } },
    ]);
  });

  it("should return the input if the delimiter is not found", () => {
    const tokenizer = createTokenizer(simpleInput);
    const tokens = tokenizer(["_"]);
    expect(Array.from(tokens)).toEqual([
      { value: simpleInput, position: { line: 1, column: 1, index: 0 } },
    ]);
  });

  it("should tokenize a string", () => {
    const tokenizer = createTokenizer(simpleInput);
    const tokens = tokenizer([" ", ";", "\n", "\r", "\t"]);
    expect(Array.from(tokens)).toEqual([
      { value: "let", position: { line: 1, column: 1, index: 0 } },
      { value: " ", position: { line: 1, column: 4, index: 3 } },
      { value: "x", position: { line: 1, column: 5, index: 4 } },
      { value: " ", position: { line: 1, column: 6, index: 5 } },
      { value: "=", position: { line: 1, column: 7, index: 6 } },
      { value: " ", position: { line: 1, column: 8, index: 7 } },
      { value: "1", position: { line: 1, column: 9, index: 8 } },
      { value: ";", position: { line: 1, column: 10, index: 9 } },
    ]);
  });

  it("allows defining a custom matcher", () => {
    type CustomToken = {
      value: string;
      position: { line: number; column: number; index: number };
      type: "one" | "two" | "three" | "whitespace";
    };

    const tokenizer = createTokenizer<CustomToken>("1 2 3");
    const tokens = tokenizer([" "], (token: Token) => {
      if (token.value === "1") {
        return { ...token, type: "one" };
      }
      if (token.value === "2") {
        return { ...token, type: "two" };
      }
      if (token.value === "3") {
        return { ...token, type: "three" };
      }
      return { ...token, type: "whitespace" };
    });

    expect(Array.from(tokens)).toEqual([
      { value: "1", type: "one", position: { line: 1, column: 1, index: 0 } },
      {
        value: " ",
        type: "whitespace",
        position: { line: 1, column: 2, index: 1 },
      },
      { value: "2", type: "two", position: { line: 1, column: 3, index: 2 } },
      {
        value: " ",
        type: "whitespace",
        position: { line: 1, column: 4, index: 3 },
      },
      { value: "3", type: "three", position: { line: 1, column: 5, index: 4 } },
    ]);
  });

  it("should take the order of the delimiters into account", () => {
    const tokenizer = createTokenizer("/ 1 // 2");

    // "/" listed before "//": the two-character delimiter never matches.
    const tokens1 = tokenizer([" ", "/", "//"]);
    expect(Array.from(tokens1)).toEqual([
      { value: "/", position: { line: 1, column: 1, index: 0 } },
      { value: " ", position: { line: 1, column: 2, index: 1 } },
      { value: "1", position: { line: 1, column: 3, index: 2 } },
      { value: " ", position: { line: 1, column: 4, index: 3 } },
      { value: "/", position: { line: 1, column: 5, index: 4 } },
      { value: "/", position: { line: 1, column: 6, index: 5 } },
      { value: " ", position: { line: 1, column: 7, index: 6 } },
      { value: "2", position: { line: 1, column: 8, index: 7 } },
    ]);

    // "//" listed before "/": the two-character delimiter wins.
    const tokens2 = tokenizer([" ", "//", "/"]);
    expect(Array.from(tokens2)).toEqual([
      { value: "/", position: { line: 1, column: 1, index: 0 } },
      { value: " ", position: { line: 1, column: 2, index: 1 } },
      { value: "1", position: { line: 1, column: 3, index: 2 } },
      { value: " ", position: { line: 1, column: 4, index: 3 } },
      { value: "//", position: { line: 1, column: 5, index: 4 } },
      { value: " ", position: { line: 1, column: 7, index: 6 } },
      { value: "2", position: { line: 1, column: 8, index: 7 } },
    ]);
  });
});
// createTokenizer.ts
// Splits a string into tokens based on a list of delimiters and an
// optional token matcher.
export type Token = {
  value: string;
  position: { line: number; column: number; index: number };
};

export function createTokenizer<T extends Token>(input: string) {
  return function* tokens(
    delimiters: string[],
    tokenMatcher: (token: Token) => T = (token) => token as T
  ): Generator<T> {
    let index = 0;
    let line = 1;
    let column = 1;
    let buffer = "";
    let bufferStartIndex = 0;
    let bufferStartLine = 1;
    let bufferStartColumn = 1;

    while (index < input.length) {
      // Find a matching delimiter based on its order in the array
      let matchedDelimiter: string | null = null;
      for (const delimiter of delimiters) {
        // Check if the input at the current position matches this delimiter
        if (input.substring(index, index + delimiter.length) === delimiter) {
          matchedDelimiter = delimiter;
          break; // Stop at the first match to respect order priority
        }
      }

      if (matchedDelimiter) {
        // If we have accumulated a buffer, yield it as a token
        if (buffer) {
          yield tokenMatcher({
            value: buffer,
            position: {
              line: bufferStartLine,
              column: bufferStartColumn,
              index: bufferStartIndex,
            },
          });
          buffer = "";
        }

        // Yield the delimiter itself as a token
        yield tokenMatcher({
          value: matchedDelimiter,
          position: { line, column, index },
        });

        // Update position tracking
        if (matchedDelimiter.includes("\n")) {
          // Delimiter contains newlines: advance the line count and reset
          // the column to the offset after the last newline.
          line += matchedDelimiter.split("\n").length - 1;
          column = matchedDelimiter.length - matchedDelimiter.lastIndexOf("\n");
        } else {
          column += matchedDelimiter.length;
        }
        index += matchedDelimiter.length;

        // Reset the buffer start position for the next token
        bufferStartIndex = index;
        bufferStartLine = line;
        bufferStartColumn = column;
      } else {
        // No delimiter found, add the character to the buffer
        const char = input[index];
        buffer += char;

        // If this is the first character in the buffer, record its position
        if (buffer.length === 1) {
          bufferStartIndex = index;
          bufferStartLine = line;
          bufferStartColumn = column;
        }

        // Update position tracking
        if (char === "\n") {
          line++;
          column = 1;
        } else {
          column++;
        }
        index++;
      }
    }

    // Don't forget to yield the last buffer if it isn't empty
    if (buffer) {
      yield tokenMatcher({
        value: buffer,
        position: {
          line: bufferStartLine,
          column: bufferStartColumn,
          index: bufferStartIndex,
        },
      });
    }
  };
}
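A quick sketch of the position tracking (my addition, not part of the gist): a newline delimiter bumps the line counter and resets the column, so tokens after it get accurate multi-line coordinates.

import { createTokenizer } from "./createTokenizer";

const multiline = createTokenizer("a\nb");
for (const token of multiline(["\n"])) {
  console.log(token);
}
// { value: "a",  position: { line: 1, column: 1, index: 0 } }
// { value: "\n", position: { line: 1, column: 2, index: 1 } }
// { value: "b",  position: { line: 2, column: 1, index: 2 } }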
// Example: a (very) minimal Pascal tokenizer built on createTokenizer
import { createTokenizer } from "./createTokenizer";
import type { Token } from "./createTokenizer";

type PascalToken = {
  type:
    | "keyword"
    | "identifier"
    | "number"
    | "string"
    | "operator"
    | "punctuation"
    | "comment";
  value: string;
  position: { line: number; column: number; index: number };
};

const tokenizer = createTokenizer<PascalToken>(`
program test;
begin
WriteLn('Hello, world!');
end.
`);

const keywords = ["program", "begin", "end"];
const operators = ["+", "-", "*", "/", "=", "<", ">", "<=", ">="];
const punctuation = [";", ".", ",", "(", ")", "[", "]", "{", "}"];
const stringDelimiters = ["'"];

const tokenMatcher = (token: Token): PascalToken => {
  if (keywords.includes(token.value)) {
    return { ...token, type: "keyword" };
  }
  if (operators.includes(token.value)) {
    return { ...token, type: "operator" };
  }
  if (punctuation.includes(token.value)) {
    return { ...token, type: "punctuation" };
  }
  if (stringDelimiters.includes(token.value)) {
    return { ...token, type: "string" };
  }
  // "number" and "comment" are declared but not classified here;
  // everything else falls through to "identifier".
  return { ...token, type: "identifier" };
};

for (const token of tokenizer(
  [
    " ",
    "//", // longer delimiters first, so "//" isn't split into two "/" tokens
    "/",
    ";",
    ".",
    ",",
    "(",
    ")",
    "[",
    "]",
    "{",
    "}",
    "'",
    "\n",
    "\r",
    "\t",
  ],
  tokenMatcher
)) {
  console.log(token);
}
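Note the matcher sees each token in isolation, so the text between the ' delimiters is still split on spaces and punctuation; real string-literal support would need state across tokens. The matcher also never produces the "number" type declared in PascalToken; a hedged sketch of layering a numeric rule on top (withNumbers is my name, not part of the gist):

// Hypothetical extension: classify digit-only tokens as numbers,
// delegating everything else to the original matcher.
const withNumbers = (token: Token): PascalToken =>
  /^\d+$/.test(token.value) ? { ...token, type: "number" } : tokenMatcher(token);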