JakeCoxon · December 2, 2022 15:56
diff --git a/tokeniser.ts b/tokeniser.ts

 /**
  * Splits indentation-sensitive source text into a flat list of tokens.
  *
  * Every token carries its matched text (`value`), its `type`, the 0-based
  * `lineNumber` it starts on and its character offset in `input` (`tokenIndex`).
  *
  * - Whitespace and comments are consumed but never emitted.
  * - A line break outside brackets is emitted as `NEWLINE`; line breaks inside
  *   `(`, `[` or `{` groups are swallowed so a grouped expression may span lines.
  * - Leading spaces of a line are compared with the enclosing indentation:
  *   deeper indentation emits one `INDENT`, shallower indentation emits one
  *   `OUTDENT` per closed level (the first one carries the line's leading
  *   spaces as its value, further ones are empty). Blank and comment-only
  *   lines do not affect indentation.
  * - Levels still open at end of input are closed with empty `OUTDENT` tokens.
  *
  * @param input Source text to tokenize.
  * @returns The tokens in source order.
  * @throws Error when no token pattern matches at the current position, or when
  *   a line dedents to a column that matches no open indentation level. The
  *   message contains the offending line and a caret under the position.
  */
 function tokenize(input: string) {
  type Token = { value: string; type: string; lineNumber: number; tokenIndex: number };

  // Tried in declaration order at every position; the first pattern that matches wins.
  const regexes: Record<string, RegExp> = {
    // The trailing \b stops keywords from matching a prefix of an identifier ("as" in "assign").
    KEYWORD:
      /^(?:and|assert|as|break|class|continue|def|elif|else|false|for|if|import|in|is|lambda|null|not|or|pass|return|try|while|with)\b/,
    IDENTIFIER: /^[a-zA-Z_][a-zA-Z_0-9]*/,
    LITERAL: /^(?:"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')/,
    SPECIALNUMBER: /^0[xXbB][0-9a-zA-Z_]+/,
    NUMBER: /^-?[0-9][0-9_]*(\.[0-9_]+)?/,
    // Runs to the end of the line; may be empty and may end at end of input.
    COMMENT: /^#[^\n]*/,
    OPENPAREN: /^[\[\{\(]/,
    CLOSEPAREN: /^[\]\}\)]/,
    PUNCTUATION: /^(?:==|!=|[:,=])/,
    NEWLINE: /^\n/, // Only reached inside a bracketed group
    // Horizontal whitespace only, so a line break is never swallowed as whitespace.
    WHITESPACE: /^[^\S\n]+/
  };

  // Matches a line that holds nothing but whitespace and/or a comment.
  const blankLine = /^[^\S\n]*(?:#|\n|$)/;

  const tokens: Token[] = [];
  // Columns of the currently open indentation levels; the bottom entry is column 0.
  const indentStack: number[] = [0];
  let lineNumber = 0;
  let lineStart = 0;
  let tokenIndex = 0;
  let parentheses = 0;
  let rest = input; // Not-yet-consumed tail of the input

  const currentIndent = (): number => indentStack[indentStack.length - 1] ?? 0;

  const emit = (value: string, type: string): void => {
    tokens.push({ value, type, lineNumber, tokenIndex });
  };

  const consume = (length: number): void => {
    rest = rest.substring(length);
    tokenIndex += length;
  };

  const startNextLine = (): void => {
    lineNumber++;
    lineStart = tokenIndex;
  };

  const errorAt = (reason: string): Error => {
    // indexOf yields -1 on the last line when the input has no trailing newline.
    const lineEnd = input.indexOf("\n", lineStart);
    const text = input.substring(lineStart, lineEnd === -1 ? input.length : lineEnd);
    const caret = " ".repeat(tokenIndex - lineStart);
    return new Error(`${reason} line ${lineNumber} \n${text}\n${caret}^-- here`);
  };

  // Each iteration handles one line break or one logical line.
  while (rest.length > 0) {
    if (rest[0] === "\n") {
      emit("\n", "NEWLINE");
      consume(1);
      startNextLine();
      continue;
    }

    if (!blankLine.test(rest)) {
      let numSpaces = 0;
      while (rest[numSpaces] === " ") numSpaces++;
      const spaces = rest.substring(0, numSpaces);

      if (numSpaces > currentIndent()) {
        indentStack.push(numSpaces);
        emit(spaces, "INDENT");
        consume(numSpaces);
      } else if (numSpaces < currentIndent()) {
        // Close every level deeper than this line; only the first OUTDENT owns the spaces.
        let value = spaces;
        while (numSpaces < currentIndent()) {
          indentStack.pop();
          emit(value, "OUTDENT");
          consume(value.length);
          value = "";
        }
        if (numSpaces !== currentIndent()) throw errorAt("Inconsistent indentation on");
      } else {
        consume(numSpaces);
      }
    }

    // Tokens after the indentation, or within a grouped expression
    while (rest.length > 0 && (parentheses > 0 || rest[0] !== "\n")) {
      let type = "";
      let value = "";
      for (const [name, regex] of Object.entries(regexes)) {
        const match = regex.exec(rest);
        if (match) {
          type = name;
          value = match[0];
          break;
        }
      }

      // No pattern can match the empty string, so an empty value means "no match".
      if (value === "") throw errorAt("Unable to tokenize");

      switch (type) {
        case "WHITESPACE":
        case "COMMENT":
          consume(value.length);
          break;
        case "NEWLINE":
          consume(value.length);
          startNextLine();
          break;
        default:
          emit(value, type);
          consume(value.length);
          if (type === "OPENPAREN") parentheses++;
          // Never drop below zero: a stray closer must not cancel a later opener.
          else if (type === "CLOSEPAREN" && parentheses > 0) parentheses--;
      }
    }
  }

  // Close whatever indentation levels are still open at end of input.
  while (indentStack.length > 1) {
    indentStack.pop();
    emit("", "OUTDENT");
  }

  return tokens;
 }

 // Demo: tokenizes a sample program and renders both a colour-highlighted view of
 // the source and the raw token list (as JSON) into the "#app" element.
 (() => {
  const input = `
 assert false, "unexp\\"ected thing"
 if something == true:
  do(something) # this is some cool
  foo(
        bar, baz, baw) 
  foo(1,2, 0xff)
 `;
  const out = tokenize(input);

  // Escapes text for insertion into HTML; "&" goes first so later entities are not double-escaped.
  const escapeHtml = (text: string): string =>
    text
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/'/g, "&#39;")
      .replace(/"/g, "&quot;");

  const colors: Record<string, string> = {
    KEYWORD: "#ff6767",
    IDENTIFIER: "pink",
    LITERAL: "#8b8bff",
    SPECIALNUMBER: "#b0ffb0",
    NUMBER: "#b0ffb0",
    COMMENT: "lightgreen",
    OPENPAREN: "#ff6eff",
    CLOSEPAREN: "#ff6eff",
    PUNCTUATION: "#ff91ff",
    NEWLINE: "white",
    WHITESPACE: "lightgrey",
    INDENT: "lightgrey",
    OUTDENT: "lightgrey"
  };

  const html = (() => {
    let html = ``;
    let last = 0;
    for (const token of out) {
      // Tokens without a source position cannot be placed in the rendered text.
      if (typeof token.tokenIndex !== "number") continue;
      // Source text between tokens (dropped whitespace/comments) is shown unstyled.
      html += escapeHtml(input.substring(last, token.tokenIndex));
      last = token.tokenIndex + token.value.length;
      const color = colors[token.type] ?? "transparent";
      html += `<span style="background-color: ${color}">${escapeHtml(token.value)}</span>`;
    }
    // Keep whatever follows the last token so the whole source is rendered.
    html += escapeHtml(input.substring(last));
    return html;
  })();

  const str = escapeHtml(JSON.stringify(out, null, 2));

  const app = document.querySelector("#app");
  if (!app) throw new Error('Missing "#app" element');
  app.innerHTML = `<pre>${html}</pre><br><pre>${str}</pre>`;
 })();

	/**
	 * Splits indentation-sensitive source text into a flat list of tokens.
	 *
	 * Every token carries its matched text (`value`), its `type`, the 0-based
	 * `lineNumber` it starts on and its character offset in `input` (`tokenIndex`).
	 *
	 * - Whitespace and comments are consumed but never emitted.
	 * - A line break outside brackets is emitted as `NEWLINE`; line breaks inside
	 *   `(`, `[` or `{` groups are swallowed so a grouped expression may span lines.
	 * - Leading spaces of a line are compared with the enclosing indentation:
	 *   deeper indentation emits one `INDENT`, shallower indentation emits one
	 *   `OUTDENT` per closed level (the first one carries the line's leading
	 *   spaces as its value, further ones are empty). Blank and comment-only
	 *   lines do not affect indentation.
	 * - Levels still open at end of input are closed with empty `OUTDENT` tokens.
	 *
	 * @param input Source text to tokenize.
	 * @returns The tokens in source order.
	 * @throws Error when no token pattern matches at the current position, or when
	 *   a line dedents to a column that matches no open indentation level. The
	 *   message contains the offending line and a caret under the position.
	 */
	function tokenize(input: string) {
	  type Token = { value: string; type: string; lineNumber: number; tokenIndex: number };

	  // Tried in declaration order at every position; the first pattern that matches wins.
	  const regexes: Record<string, RegExp> = {
	    // The trailing \b stops keywords from matching a prefix of an identifier ("as" in "assign").
	    KEYWORD:
	      /^(?:and|assert|as|break|class|continue|def|elif|else|false|for|if|import|in|is|lambda|null|not|or|pass|return|try|while|with)\b/,
	    IDENTIFIER: /^[a-zA-Z_][a-zA-Z_0-9]*/,
	    LITERAL: /^(?:"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')/,
	    SPECIALNUMBER: /^0[xXbB][0-9a-zA-Z_]+/,
	    NUMBER: /^-?[0-9][0-9_]*(\.[0-9_]+)?/,
	    // Runs to the end of the line; may be empty and may end at end of input.
	    COMMENT: /^#[^\n]*/,
	    OPENPAREN: /^[\[\{\(]/,
	    CLOSEPAREN: /^[\]\}\)]/,
	    PUNCTUATION: /^(?:==|!=|[:,=])/,
	    NEWLINE: /^\n/, // Only reached inside a bracketed group
	    // Horizontal whitespace only, so a line break is never swallowed as whitespace.
	    WHITESPACE: /^[^\S\n]+/
	  };

	  // Matches a line that holds nothing but whitespace and/or a comment.
	  const blankLine = /^[^\S\n]*(?:#|\n|$)/;

	  const tokens: Token[] = [];
	  // Columns of the currently open indentation levels; the bottom entry is column 0.
	  const indentStack: number[] = [0];
	  let lineNumber = 0;
	  let lineStart = 0;
	  let tokenIndex = 0;
	  let parentheses = 0;
	  let rest = input; // Not-yet-consumed tail of the input

	  const currentIndent = (): number => indentStack[indentStack.length - 1] ?? 0;

	  const emit = (value: string, type: string): void => {
	    tokens.push({ value, type, lineNumber, tokenIndex });
	  };

	  const consume = (length: number): void => {
	    rest = rest.substring(length);
	    tokenIndex += length;
	  };

	  const startNextLine = (): void => {
	    lineNumber++;
	    lineStart = tokenIndex;
	  };

	  const errorAt = (reason: string): Error => {
	    // indexOf yields -1 on the last line when the input has no trailing newline.
	    const lineEnd = input.indexOf("\n", lineStart);
	    const text = input.substring(lineStart, lineEnd === -1 ? input.length : lineEnd);
	    const caret = " ".repeat(tokenIndex - lineStart);
	    return new Error(`${reason} line ${lineNumber} \n${text}\n${caret}^-- here`);
	  };

	  // Each iteration handles one line break or one logical line.
	  while (rest.length > 0) {
	    if (rest[0] === "\n") {
	      emit("\n", "NEWLINE");
	      consume(1);
	      startNextLine();
	      continue;
	    }

	    if (!blankLine.test(rest)) {
	      let numSpaces = 0;
	      while (rest[numSpaces] === " ") numSpaces++;
	      const spaces = rest.substring(0, numSpaces);

	      if (numSpaces > currentIndent()) {
	        indentStack.push(numSpaces);
	        emit(spaces, "INDENT");
	        consume(numSpaces);
	      } else if (numSpaces < currentIndent()) {
	        // Close every level deeper than this line; only the first OUTDENT owns the spaces.
	        let value = spaces;
	        while (numSpaces < currentIndent()) {
	          indentStack.pop();
	          emit(value, "OUTDENT");
	          consume(value.length);
	          value = "";
	        }
	        if (numSpaces !== currentIndent()) throw errorAt("Inconsistent indentation on");
	      } else {
	        consume(numSpaces);
	      }
	    }

	    // Tokens after the indentation, or within a grouped expression
	    while (rest.length > 0 && (parentheses > 0 || rest[0] !== "\n")) {
	      let type = "";
	      let value = "";
	      for (const [name, regex] of Object.entries(regexes)) {
	        const match = regex.exec(rest);
	        if (match) {
	          type = name;
	          value = match[0];
	          break;
	        }
	      }

	      // No pattern can match the empty string, so an empty value means "no match".
	      if (value === "") throw errorAt("Unable to tokenize");

	      switch (type) {
	        case "WHITESPACE":
	        case "COMMENT":
	          consume(value.length);
	          break;
	        case "NEWLINE":
	          consume(value.length);
	          startNextLine();
	          break;
	        default:
	          emit(value, type);
	          consume(value.length);
	          if (type === "OPENPAREN") parentheses++;
	          // Never drop below zero: a stray closer must not cancel a later opener.
	          else if (type === "CLOSEPAREN" && parentheses > 0) parentheses--;
	      }
	    }
	  }

	  // Close whatever indentation levels are still open at end of input.
	  while (indentStack.length > 1) {
	    indentStack.pop();
	    emit("", "OUTDENT");
	  }

	  return tokens;
	}

	// Demo: tokenizes a sample program and renders both a colour-highlighted view of
	// the source and the raw token list (as JSON) into the "#app" element.
	(() => {
	const input = `
	assert false, "unexp\\"ected thing"
	if something == true:
	do(something) # this is some cool
	foo(
	bar, baz, baw)
	foo(1,2, 0xff)
	`;
	  const out = tokenize(input);

	  // Escapes text for insertion into HTML; "&" goes first so later entities are not double-escaped.
	  const escapeHtml = (text: string): string =>
	    text
	      .replace(/&/g, "&amp;")
	      .replace(/</g, "&lt;")
	      .replace(/>/g, "&gt;")
	      .replace(/'/g, "&#39;")
	      .replace(/"/g, "&quot;");

	  const colors: Record<string, string> = {
	    KEYWORD: "#ff6767",
	    IDENTIFIER: "pink",
	    LITERAL: "#8b8bff",
	    SPECIALNUMBER: "#b0ffb0",
	    NUMBER: "#b0ffb0",
	    COMMENT: "lightgreen",
	    OPENPAREN: "#ff6eff",
	    CLOSEPAREN: "#ff6eff",
	    PUNCTUATION: "#ff91ff",
	    NEWLINE: "white",
	    WHITESPACE: "lightgrey",
	    INDENT: "lightgrey",
	    OUTDENT: "lightgrey"
	  };

	  const html = (() => {
	    let html = ``;
	    let last = 0;
	    for (const token of out) {
	      // Tokens without a source position cannot be placed in the rendered text.
	      if (typeof token.tokenIndex !== "number") continue;
	      // Source text between tokens (dropped whitespace/comments) is shown unstyled.
	      html += escapeHtml(input.substring(last, token.tokenIndex));
	      last = token.tokenIndex + token.value.length;
	      const color = colors[token.type] ?? "transparent";
	      html += `<span style="background-color: ${color}">${escapeHtml(token.value)}</span>`;
	    }
	    // Keep whatever follows the last token so the whole source is rendered.
	    html += escapeHtml(input.substring(last));
	    return html;
	  })();

	  const str = escapeHtml(JSON.stringify(out, null, 2));

	  const app = document.querySelector("#app");
	  if (!app) throw new Error('Missing "#app" element');
	  app.innerHTML = `<pre>${html}</pre><br><pre>${str}</pre>`;
	})();