ndesmic · October 16, 2024 22:38
diff --git a/match.js b/match.js
 import { Tokenizer } from "./tokenizer.js";

 // Compares ./output/doc.txt against the template in ./expected/doc.txt.
 // Template text outside `{{{ ... }}}` must appear literally in the output;
 // text between `{{{` and `}}}` is compiled as a regular expression and must
 // match the output at the current position.
 const textOutput = Deno.readTextFileSync("./output/doc.txt");
 const textExpected = Deno.readTextFileSync("./expected/doc.txt");

 // Rule order matters: the triple-brace delimiters are listed before the
 // single-character "string" rule, whose character class also contains braces.
 const tokenizer = new Tokenizer([
 	{ matcher: /\{\{\{/, type: "regex-start" },
 	{ matcher: /\}\}\}/, type: "regex-end" },
 	{ matcher: /[a-zA-Z0-9\[\]\(\)\{\}\.\$\-:\!\\ \t]/, type: "string", valueExtractor: x => x }
 ]);

 let i = 0; // cursor into textOutput
 let isRegex = false; // true while inside a `{{{ ... }}}` section
 let regexBuffer = ""; // regex source collected since the last `{{{`

 // Tokenize eagerly so that template syntax errors surface before any matching.
 const tokens = [...tokenizer.tokenize(textExpected)];
 for (const token of tokens) {
 	switch (token.type) {
 		case "string": {
 			if (isRegex) {
 				regexBuffer += token.value;
 			} else if (textOutput[i] === token.value) {
 				i++;
 			} else {
 				throw new Error(`Text did not match at index ${i}`);
 			}
 			break;
 		}
 		case "regex-start": {
 			isRegex = true;
 			break;
 		}
 		case "regex-end": {
 			isRegex = false;
 			// Sticky flag anchors the match exactly at the cursor position.
 			const regex = new RegExp(regexBuffer, "y");
 			regex.lastIndex = i;
 			const matched = regex.exec(textOutput);
 			if (matched === null) {
 				// exec returns null on failure; report it instead of crashing on matched[0].
 				throw new Error(`Regex section /${regexBuffer}/ did not match at index ${i}`);
 			}
 			i += matched[0].length;
 			regexBuffer = "";
 			break;
 		}
 		// Any other token type (such as the tokenizer's end marker) needs no action.
 	}
 }

 if (isRegex) {
 	// A `{{{` was opened but never closed, so its pattern was never checked.
 	throw new Error("Unterminated {{{ regex section in expected text");
 }

 // NOTE(review): output text beyond index `i` is not inspected, so trailing
 // extra output still counts as a match — confirm this is intended.
 console.log("Matched!");
diff --git a/tokenizer.js b/tokenizer.js
 /** Token type of the final token yielded once the whole input is consumed. */
 export const END = Symbol("END");

 /**
  * Rule-based lexer. Each rule is `{ matcher, type, valueExtractor }`:
  * - `matcher`: RegExp tried at the current position (sticky match).
  * - `type`: token type to emit; when null/undefined the matched text is
  *   consumed without emitting a token.
  * - `valueExtractor`: optional function mapping the matched text to the
  *   token's `value`.
  * Rules are tried in the order given; the first rule that matches wins.
  */
 export class Tokenizer {
 	#tokenTypes;

 	/**
 	 * @param {Iterable<{matcher: RegExp, type?: *, valueExtractor?: (text: string) => *}>} tokenTypes
 	 *   Rules in priority order.
 	 */
 	constructor(tokenTypes) {
 		// Compile each matcher once as a sticky regex, keeping the author's own
 		// flags (e.g. `i`, `u`) so the pattern keeps its intended meaning.
 		this.#tokenTypes = Array.from(tokenTypes, ({ matcher, type, valueExtractor }) => {
 			const flags = `${(matcher.flags ?? "").replace(/[gy]/g, "")}y`;
 			return { matcher: new RegExp(matcher.source, flags), type, valueExtractor };
 		});
 	}

 	/**
 	 * Lazily splits `text` into tokens.
 	 *
 	 * @param {string} text Input to tokenize.
 	 * @yields {{type: *, value?: *}} One token per match of a typed rule,
 	 *   followed by a final `{ type: END }`.
 	 * @throws {Error} When no rule consumes any text at the current index.
 	 */
 	*tokenize(text) {
 		let index = 0;
 		while (index < text.length) {
 			let hasMatch = false;

 			for (const { matcher, type, valueExtractor } of this.#tokenTypes) {
 				// lastIndex is reset before every exec, so sharing the compiled
 				// regex between calls is safe.
 				matcher.lastIndex = index;
 				const matched = matcher.exec(text);
 				// A zero-length match would never advance the cursor; treat it as
 				// "no match" so tokenizing always terminates.
 				if (matched === null || matched[0].length === 0) {
 					continue;
 				}

 				index += matched[0].length;
 				hasMatch = true;
 				if (type != null) {
 					const token = { type };
 					if (valueExtractor) {
 						token.value = valueExtractor(matched[0]);
 					}
 					yield token;
 				}
 				// Restart from the highest-priority rule at the new position.
 				break;
 			}

 			if (!hasMatch) {
 				throw new Error(`Unexpected token at index ${index}`);
 			}
 		}
 		yield { type: END };
 	}
 }
	import { Tokenizer } from "./tokenizer.js";

	// Compares ./output/doc.txt against the template in ./expected/doc.txt.
	// Template text outside `{{{ ... }}}` must appear literally in the output;
	// text between `{{{` and `}}}` is compiled as a regular expression and must
	// match the output at the current position.
	const textOutput = Deno.readTextFileSync("./output/doc.txt");
	const textExpected = Deno.readTextFileSync("./expected/doc.txt");

	// Rule order matters: the triple-brace delimiters are listed before the
	// single-character "string" rule, whose character class also contains braces.
	const tokenizer = new Tokenizer([
		{ matcher: /\{\{\{/, type: "regex-start" },
		{ matcher: /\}\}\}/, type: "regex-end" },
		{ matcher: /[a-zA-Z0-9\[\]\(\)\{\}\.\$\-:\!\\ \t]/, type: "string", valueExtractor: x => x }
	]);

	let i = 0; // cursor into textOutput
	let isRegex = false; // true while inside a `{{{ ... }}}` section
	let regexBuffer = ""; // regex source collected since the last `{{{`

	// Tokenize eagerly so that template syntax errors surface before any matching.
	const tokens = [...tokenizer.tokenize(textExpected)];
	for (const token of tokens) {
		switch (token.type) {
			case "string": {
				if (isRegex) {
					regexBuffer += token.value;
				} else if (textOutput[i] === token.value) {
					i++;
				} else {
					throw new Error(`Text did not match at index ${i}`);
				}
				break;
			}
			case "regex-start": {
				isRegex = true;
				break;
			}
			case "regex-end": {
				isRegex = false;
				// Sticky flag anchors the match exactly at the cursor position.
				const regex = new RegExp(regexBuffer, "y");
				regex.lastIndex = i;
				const matched = regex.exec(textOutput);
				if (matched === null) {
					// exec returns null on failure; report it instead of crashing on matched[0].
					throw new Error(`Regex section /${regexBuffer}/ did not match at index ${i}`);
				}
				i += matched[0].length;
				regexBuffer = "";
				break;
			}
			// Any other token type (such as the tokenizer's end marker) needs no action.
		}
	}

	if (isRegex) {
		// A `{{{` was opened but never closed, so its pattern was never checked.
		throw new Error("Unterminated {{{ regex section in expected text");
	}

	// NOTE(review): output text beyond index `i` is not inspected, so trailing
	// extra output still counts as a match — confirm this is intended.
	console.log("Matched!");
	/** Token type of the final token yielded once the whole input is consumed. */
	export const END = Symbol("END");

	/**
	 * Rule-based lexer. Each rule is `{ matcher, type, valueExtractor }`:
	 * - `matcher`: RegExp tried at the current position (sticky match).
	 * - `type`: token type to emit; when null/undefined the matched text is
	 *   consumed without emitting a token.
	 * - `valueExtractor`: optional function mapping the matched text to the
	 *   token's `value`.
	 * Rules are tried in the order given; the first rule that matches wins.
	 */
	export class Tokenizer {
		#tokenTypes;

		/**
		 * @param {Iterable<{matcher: RegExp, type?: *, valueExtractor?: (text: string) => *}>} tokenTypes
		 *   Rules in priority order.
		 */
		constructor(tokenTypes) {
			// Compile each matcher once as a sticky regex, keeping the author's own
			// flags (e.g. `i`, `u`) so the pattern keeps its intended meaning.
			this.#tokenTypes = Array.from(tokenTypes, ({ matcher, type, valueExtractor }) => {
				const flags = `${(matcher.flags ?? "").replace(/[gy]/g, "")}y`;
				return { matcher: new RegExp(matcher.source, flags), type, valueExtractor };
			});
		}

		/**
		 * Lazily splits `text` into tokens.
		 *
		 * @param {string} text Input to tokenize.
		 * @yields {{type: *, value?: *}} One token per match of a typed rule,
		 *   followed by a final `{ type: END }`.
		 * @throws {Error} When no rule consumes any text at the current index.
		 */
		*tokenize(text) {
			let index = 0;
			while (index < text.length) {
				let hasMatch = false;

				for (const { matcher, type, valueExtractor } of this.#tokenTypes) {
					// lastIndex is reset before every exec, so sharing the compiled
					// regex between calls is safe.
					matcher.lastIndex = index;
					const matched = matcher.exec(text);
					// A zero-length match would never advance the cursor; treat it as
					// "no match" so tokenizing always terminates.
					if (matched === null || matched[0].length === 0) {
						continue;
					}

					index += matched[0].length;
					hasMatch = true;
					if (type != null) {
						const token = { type };
						if (valueExtractor) {
							token.value = valueExtractor(matched[0]);
						}
						yield token;
					}
					// Restart from the highest-priority rule at the new position.
					break;
				}

				if (!hasMatch) {
					throw new Error(`Unexpected token at index ${index}`);
				}
			}
			yield { type: END };
		}
	}