@frostney
Last active March 5, 2025 00:15
Simple tokenizer for Bun - Basically a nicer and more general-purpose version of the StringReader functionality I made years back
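
A quick hand-written sketch of the API before the files below (not part of the gist's own tests): createTokenizer takes the input string and returns a generator factory, so tokens are produced lazily as you iterate.

import { createTokenizer } from "./createTokenizer";

// Split on spaces and semicolons; every token carries line/column/index info.
const demo = createTokenizer("let x = 1;");
for (const token of demo([" ", ";"])) {
  console.log(token.value, token.position);
}
// e.g. "let" { line: 1, column: 1, index: 0 }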
import { describe, it, expect } from "bun:test";
import { createTokenizer } from "./createTokenizer";
import type { Token } from "./createTokenizer";

describe("createTokenizer", () => {
  const simpleInput = "let x = 1;";

  it("should return the input if no delimiters are provided", () => {
    const tokenizer = createTokenizer(simpleInput);
    const tokens = tokenizer([]);

    expect(Array.from(tokens)).toEqual([
      { value: simpleInput, position: { line: 1, column: 1, index: 0 } },
    ]);
  });
it("should return the input if the value is not found", () => {
const tokenizer = createTokenizer(simpleInput);
const tokens = tokenizer(["_"]);
expect(Array.from(tokens)).toEqual([
{ value: simpleInput, position: { line: 1, column: 1, index: 0 } },
]);
});
it("should tokenize a string", () => {
const tokenizer = createTokenizer(simpleInput);
const tokens = tokenizer([" ", ";", "\n", "\r", "\t"]);
expect(Array.from(tokens)).toEqual([
{ value: "let", position: { line: 1, column: 1, index: 0 } },
{ value: " ", position: { line: 1, column: 4, index: 3 } },
{ value: "x", position: { line: 1, column: 5, index: 4 } },
{ value: " ", position: { line: 1, column: 6, index: 5 } },
{ value: "=", position: { line: 1, column: 7, index: 6 } },
{ value: " ", position: { line: 1, column: 8, index: 7 } },
{ value: "1", position: { line: 1, column: 9, index: 8 } },
{ value: ";", position: { line: 1, column: 10, index: 9 } },
]);
});
it("allows to define a custom matcher", () => {
type CustomToken = {
value: string;
position: { line: number; column: number; index: number };
type: "one" | "two" | "three" | "whitespace";
};
const tokenizer = createTokenizer<CustomToken>("1 2 3");
const tokens = tokenizer([" "], (token: Token) => {
if (token.value === "1") {
return {
...token,
type: "one",
};
}
if (token.value === "2") {
return {
...token,
type: "two",
};
}
if (token.value === "3") {
return {
...token,
type: "three",
};
}
return { ...token, type: "whitespace" };
});

    expect(Array.from(tokens)).toEqual([
      { value: "1", type: "one", position: { line: 1, column: 1, index: 0 } },
      {
        value: " ",
        type: "whitespace",
        position: { line: 1, column: 2, index: 1 },
      },
      { value: "2", type: "two", position: { line: 1, column: 3, index: 2 } },
      {
        value: " ",
        type: "whitespace",
        position: { line: 1, column: 4, index: 3 },
      },
      { value: "3", type: "three", position: { line: 1, column: 5, index: 4 } },
    ]);
  });
it("should take the order of the delimiter into account", () => {
const tokenizer = createTokenizer("/ 1 // 2");
const tokens1 = tokenizer([" ", "/", "//"]);
expect(Array.from(tokens1)).toEqual([
{ value: "/", position: { line: 1, column: 1, index: 0 } },
{ value: " ", position: { line: 1, column: 2, index: 1 } },
{ value: "1", position: { line: 1, column: 3, index: 2 } },
{ value: " ", position: { line: 1, column: 4, index: 3 } },
{ value: "/", position: { line: 1, column: 5, index: 4 } },
{ value: "/", position: { line: 1, column: 6, index: 5 } },
{ value: " ", position: { line: 1, column: 7, index: 6 } },
{ value: "2", position: { line: 1, column: 8, index: 7 } },
]);
const tokens2 = tokenizer([" ", "//", "/"]);
expect(Array.from(tokens2)).toEqual([
{ value: "/", position: { line: 1, column: 1, index: 0 } },
{ value: " ", position: { line: 1, column: 2, index: 1 } },
{ value: "1", position: { line: 1, column: 3, index: 2 } },
{ value: " ", position: { line: 1, column: 4, index: 3 } },
{ value: "//", position: { line: 1, column: 5, index: 4 } },
{ value: " ", position: { line: 1, column: 7, index: 6 } },
{ value: "2", position: { line: 1, column: 8, index: 7 } },
]);
});
});

// Splits a string into tokens based on an ordered list of delimiters;
// an optional token matcher maps each raw token to a richer token type.
export type Token = {
  value: string;
  position: { line: number; column: number; index: number };
};

export function createTokenizer<T extends Token>(input: string) {
  return function* tokens(
    delimiters: string[],
    tokenMatcher: (token: Token) => T = (token) => token as T
  ): Generator<T> {
    let index = 0;
    let line = 1;
    let column = 1;
    let buffer = "";
    let bufferStartIndex = 0;
    let bufferStartLine = 1;
    let bufferStartColumn = 1;

    while (index < input.length) {
      // Find a matching delimiter based on its order in the array
      let matchedDelimiter: string | null = null;
      for (const delimiter of delimiters) {
        // Check if the input at the current position matches this delimiter
        if (input.substring(index, index + delimiter.length) === delimiter) {
          matchedDelimiter = delimiter;
          break; // Stop at the first match to respect order priority
        }
      }

      if (matchedDelimiter) {
        // If we have accumulated a buffer, yield it as a token
        if (buffer) {
          yield tokenMatcher({
            value: buffer,
            position: {
              line: bufferStartLine,
              column: bufferStartColumn,
              index: bufferStartIndex,
            },
          });
          buffer = "";
        }

        // Yield the delimiter as a token
        yield tokenMatcher({
          value: matchedDelimiter,
          position: { line, column, index },
        });

        // Update position tracking
        if (matchedDelimiter.includes("\n")) {
          // Handle newlines in multi-character delimiters: advance the line
          // count and restart the column after the last newline
          line += matchedDelimiter.split("\n").length - 1;
          column = matchedDelimiter.length - matchedDelimiter.lastIndexOf("\n");
        } else {
          column += matchedDelimiter.length;
        }
        index += matchedDelimiter.length;

        // Reset the buffer start position for the next token
        bufferStartIndex = index;
        bufferStartLine = line;
        bufferStartColumn = column;
      } else {
        // No delimiter found, add the character to the buffer
        const char = input[index];
        buffer += char;

        // If this is the first character in the buffer, record its position
        if (buffer.length === 1) {
          bufferStartIndex = index;
          bufferStartLine = line;
          bufferStartColumn = column;
        }

        // Update position tracking
        if (char === "\n") {
          line++;
          column = 1;
        } else {
          column++;
        }
        index++;
      }
    }

    // Don't forget to yield the last buffer if it is not empty
    if (buffer) {
      yield tokenMatcher({
        value: buffer,
        position: {
          line: bufferStartLine,
          column: bufferStartColumn,
          index: bufferStartIndex,
        },
      });
    }
  };
}
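
One detail worth calling out: when a delimiter contains a newline, the tokenizer bumps the line counter and restarts the column after the last newline. A small hand-traced sketch (not part of the gist) of what that looks like:

// "\n" as a delimiter resets the column for everything on the next line.
const lineTokens = createTokenizer("a\nb")(["\n"]);
for (const token of lineTokens) {
  console.log(token);
}
// Traced by hand:
// { value: "a",  position: { line: 1, column: 1, index: 0 } }
// { value: "\n", position: { line: 1, column: 2, index: 1 } }
// { value: "b",  position: { line: 2, column: 1, index: 2 } }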

import { createTokenizer } from "./createTokenizer";
import type { Token } from "./createTokenizer";

type PascalToken = {
  type:
    | "keyword"
    | "identifier"
    | "number"
    | "string"
    | "operator"
    | "punctuation"
    | "comment";
  value: string;
  position: { line: number; column: number; index: number };
};

const tokenizer = createTokenizer<PascalToken>(`
program test;
begin
WriteLn('Hello, world!');
end.
`);

const keywords = ["program", "begin", "end"];
const operators = ["+", "-", "*", "/", "=", "<", ">", "<=", ">="];
const punctuation = [";", ".", ",", "(", ")", "[", "]", "{", "}"];
const stringDelimiters = ["'"];

const tokenMatcher = (token: Token): PascalToken => {
  if (keywords.includes(token.value)) {
    return { ...token, type: "keyword" };
  }
  if (operators.includes(token.value)) {
    return { ...token, type: "operator" };
  }
  if (punctuation.includes(token.value)) {
    return { ...token, type: "punctuation" };
  }
  if (stringDelimiters.includes(token.value)) {
    return { ...token, type: "string" };
  }
  // Fallback: anything else, including whitespace and the characters
  // inside quoted strings, ends up typed as an identifier
  return { ...token, type: "identifier" };
};

for (const token of tokenizer(
  [
    " ",
    "/",
    "//",
    ";",
    ".",
    ",",
    "(",
    ")",
    "[",
    "]",
    "{",
    "}",
    "'",
    "\n",
    "\r",
    "\t",
  ],
  tokenMatcher
)) {
  console.log(token);
}
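
For reference, the first few tokens this loop prints, traced by hand: the template literal's leading newline is itself a delimiter token, and since the matcher has no whitespace arm, whitespace (and the characters between the quote delimiters) fall through to "identifier".

// { value: "\n", type: "identifier", position: { line: 1, column: 1, index: 0 } }
// { value: "program", type: "keyword", position: { line: 2, column: 1, index: 1 } }
// { value: " ", type: "identifier", position: { line: 2, column: 8, index: 8 } }
// { value: "test", type: "identifier", position: { line: 2, column: 9, index: 9 } }
// { value: ";", type: "punctuation", position: { line: 2, column: 13, index: 13 } }
// ...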