Simple tokenizer for Bun - basically a nicer, more general-purpose version of the StringReader functionality I made years back
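In a nutshell: createTokenizer(input) returns a generator factory that takes a list of delimiters and an optional token matcher. Delimiters are yielded as tokens themselves, and every token carries its line/column/index position. A minimal usage sketch (my addition, assuming the createTokenizer.ts below sits in the same directory):

import { createTokenizer } from "./createTokenizer";

// Split on spaces and semicolons; the delimiters themselves are also yielded.
const tokenize = createTokenizer("let x = 1;");
for (const token of tokenize([" ", ";"])) {
  console.log(token.value, token.position);
}
// "let" { line: 1, column: 1, index: 0 }
// " "   { line: 1, column: 4, index: 3 }
// ...
// ";"   { line: 1, column: 10, index: 9 }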
// Test suite for createTokenizer (Bun's built-in test runner; run with: bun test)
import { describe, it, expect } from "bun:test";
import { createTokenizer } from "./createTokenizer";
import type { Token } from "./createTokenizer";

describe("createTokenizer", () => {
  const simpleInput = "let x = 1;";

  it("should return the input if no delimiters are provided", () => {
    const tokenizer = createTokenizer(simpleInput);
    const tokens = tokenizer([]);
    expect(Array.from(tokens)).toEqual([
      { value: simpleInput, position: { line: 1, column: 1, index: 0 } },
    ]);
  });

  it("should return the input if the delimiter is not found", () => {
    const tokenizer = createTokenizer(simpleInput);
    const tokens = tokenizer(["_"]);
    expect(Array.from(tokens)).toEqual([
      { value: simpleInput, position: { line: 1, column: 1, index: 0 } },
    ]);
  });

  it("should tokenize a string", () => {
    const tokenizer = createTokenizer(simpleInput);
    const tokens = tokenizer([" ", ";", "\n", "\r", "\t"]);
    expect(Array.from(tokens)).toEqual([
      { value: "let", position: { line: 1, column: 1, index: 0 } },
      { value: " ", position: { line: 1, column: 4, index: 3 } },
      { value: "x", position: { line: 1, column: 5, index: 4 } },
      { value: " ", position: { line: 1, column: 6, index: 5 } },
      { value: "=", position: { line: 1, column: 7, index: 6 } },
      { value: " ", position: { line: 1, column: 8, index: 7 } },
      { value: "1", position: { line: 1, column: 9, index: 8 } },
      { value: ";", position: { line: 1, column: 10, index: 9 } },
    ]);
  });

  it("allows defining a custom matcher", () => {
    type CustomToken = {
      value: string;
      position: { line: number; column: number; index: number };
      type: "one" | "two" | "three" | "whitespace";
    };

    const tokenizer = createTokenizer<CustomToken>("1 2 3");
    const tokens = tokenizer([" "], (token: Token) => {
      if (token.value === "1") {
        return { ...token, type: "one" };
      }
      if (token.value === "2") {
        return { ...token, type: "two" };
      }
      if (token.value === "3") {
        return { ...token, type: "three" };
      }
      return { ...token, type: "whitespace" };
    });

    expect(Array.from(tokens)).toEqual([
      { value: "1", type: "one", position: { line: 1, column: 1, index: 0 } },
      {
        value: " ",
        type: "whitespace",
        position: { line: 1, column: 2, index: 1 },
      },
      { value: "2", type: "two", position: { line: 1, column: 3, index: 2 } },
      {
        value: " ",
        type: "whitespace",
        position: { line: 1, column: 4, index: 3 },
      },
      { value: "3", type: "three", position: { line: 1, column: 5, index: 4 } },
    ]);
  });

  it("should take the order of the delimiters into account", () => {
    const tokenizer = createTokenizer("/ 1 // 2");

    // "/" listed before "//": the two-character delimiter never matches.
    const tokens1 = tokenizer([" ", "/", "//"]);
    expect(Array.from(tokens1)).toEqual([
      { value: "/", position: { line: 1, column: 1, index: 0 } },
      { value: " ", position: { line: 1, column: 2, index: 1 } },
      { value: "1", position: { line: 1, column: 3, index: 2 } },
      { value: " ", position: { line: 1, column: 4, index: 3 } },
      { value: "/", position: { line: 1, column: 5, index: 4 } },
      { value: "/", position: { line: 1, column: 6, index: 5 } },
      { value: " ", position: { line: 1, column: 7, index: 6 } },
      { value: "2", position: { line: 1, column: 8, index: 7 } },
    ]);

    // "//" listed before "/": the two-character delimiter wins.
    const tokens2 = tokenizer([" ", "//", "/"]);
    expect(Array.from(tokens2)).toEqual([
      { value: "/", position: { line: 1, column: 1, index: 0 } },
      { value: " ", position: { line: 1, column: 2, index: 1 } },
      { value: "1", position: { line: 1, column: 3, index: 2 } },
      { value: " ", position: { line: 1, column: 4, index: 3 } },
      { value: "//", position: { line: 1, column: 5, index: 4 } },
      { value: " ", position: { line: 1, column: 7, index: 6 } },
      { value: "2", position: { line: 1, column: 8, index: 7 } },
    ]);
  });
});
// createTokenizer.ts
// Splits a string into tokens based on a list of delimiters and an
// optional token matcher.
export type Token = {
  value: string;
  position: { line: number; column: number; index: number };
};

export function createTokenizer<T extends Token>(input: string) {
  return function* tokens(
    delimiters: string[],
    tokenMatcher: (token: Token) => T = (token) => token as T
  ): Generator<T> {
    let index = 0;
    let line = 1;
    let column = 1;
    let buffer = "";
    let bufferStartIndex = 0;
    let bufferStartLine = 1;
    let bufferStartColumn = 1;

    while (index < input.length) {
      // Find a matching delimiter based on its order in the array
      let matchedDelimiter: string | null = null;
      for (const delimiter of delimiters) {
        // Check if the input at the current position matches this delimiter
        if (input.substring(index, index + delimiter.length) === delimiter) {
          matchedDelimiter = delimiter;
          break; // Stop at the first match to respect order priority
        }
      }

      if (matchedDelimiter) {
        // If we have accumulated a buffer, yield it as a token
        if (buffer) {
          yield tokenMatcher({
            value: buffer,
            position: {
              line: bufferStartLine,
              column: bufferStartColumn,
              index: bufferStartIndex,
            },
          });
          buffer = "";
        }

        // Yield the delimiter itself as a token
        yield tokenMatcher({
          value: matchedDelimiter,
          position: { line, column, index },
        });

        // Update position tracking
        if (matchedDelimiter.includes("\n")) {
          // Delimiter contains newlines: advance the line count and reset
          // the column to the offset after the last newline.
          line += matchedDelimiter.split("\n").length - 1;
          column = matchedDelimiter.length - matchedDelimiter.lastIndexOf("\n");
        } else {
          column += matchedDelimiter.length;
        }
        index += matchedDelimiter.length;

        // Reset the buffer start position for the next token
        bufferStartIndex = index;
        bufferStartLine = line;
        bufferStartColumn = column;
      } else {
        // No delimiter found, add the character to the buffer
        const char = input[index];
        buffer += char;

        // If this is the first character in the buffer, record its position
        if (buffer.length === 1) {
          bufferStartIndex = index;
          bufferStartLine = line;
          bufferStartColumn = column;
        }

        // Update position tracking
        if (char === "\n") {
          line++;
          column = 1;
        } else {
          column++;
        }
        index++;
      }
    }

    // Don't forget to yield the last buffer if it isn't empty
    if (buffer) {
      yield tokenMatcher({
        value: buffer,
        position: {
          line: bufferStartLine,
          column: bufferStartColumn,
          index: bufferStartIndex,
        },
      });
    }
  };
}
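A quick sketch of the position tracking (my addition, not part of the gist): a newline delimiter bumps the line counter and resets the column, so tokens after it get accurate multi-line coordinates.

import { createTokenizer } from "./createTokenizer";

const multiline = createTokenizer("a\nb");
for (const token of multiline(["\n"])) {
  console.log(token);
}
// { value: "a",  position: { line: 1, column: 1, index: 0 } }
// { value: "\n", position: { line: 1, column: 2, index: 1 } }
// { value: "b",  position: { line: 2, column: 1, index: 2 } }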
// Example: a (very) minimal Pascal tokenizer built on createTokenizer
import { createTokenizer } from "./createTokenizer";
import type { Token } from "./createTokenizer";

type PascalToken = {
  type:
    | "keyword"
    | "identifier"
    | "number"
    | "string"
    | "operator"
    | "punctuation"
    | "comment";
  value: string;
  position: { line: number; column: number; index: number };
};

const tokenizer = createTokenizer<PascalToken>(`
program test;
begin
WriteLn('Hello, world!');
end.
`);

const keywords = ["program", "begin", "end"];
const operators = ["+", "-", "*", "/", "=", "<", ">", "<=", ">="];
const punctuation = [";", ".", ",", "(", ")", "[", "]", "{", "}"];
const stringDelimiters = ["'"];

const tokenMatcher = (token: Token): PascalToken => {
  if (keywords.includes(token.value)) {
    return { ...token, type: "keyword" };
  }
  if (operators.includes(token.value)) {
    return { ...token, type: "operator" };
  }
  if (punctuation.includes(token.value)) {
    return { ...token, type: "punctuation" };
  }
  if (stringDelimiters.includes(token.value)) {
    return { ...token, type: "string" };
  }
  // "number" and "comment" are declared but not classified here;
  // everything else falls through to "identifier".
  return { ...token, type: "identifier" };
};

for (const token of tokenizer(
  [
    " ",
    "//", // longer delimiters first, so "//" isn't split into two "/" tokens
    "/",
    ";",
    ".",
    ",",
    "(",
    ")",
    "[",
    "]",
    "{",
    "}",
    "'",
    "\n",
    "\r",
    "\t",
  ],
  tokenMatcher
)) {
  console.log(token);
}
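Note the matcher sees each token in isolation, so the text between the ' delimiters is still split on spaces and punctuation; real string-literal support would need state across tokens. The matcher also never produces the "number" type declared in PascalToken; a hedged sketch of layering a numeric rule on top (withNumbers is my name, not part of the gist):

// Hypothetical extension: classify digit-only tokens as numbers,
// delegating everything else to the original matcher.
const withNumbers = (token: Token): PascalToken =>
  /^\d+$/.test(token.value) ? { ...token, type: "number" } : tokenMatcher(token);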