Skip to content

Instantly share code, notes, and snippets.

@102
Last active October 20, 2020 07:03
Show Gist options
  • Save 102/dc9149ecda01e45090a8e5b8a915e7e7 to your computer and use it in GitHub Desktop.
Save 102/dc9149ecda01e45090a8e5b8a915e7e7 to your computer and use it in GitHub Desktop.
pseudo-HTML Lexer and Parser (without attributes support)
/**
* @typedef LexemType
* @type {'data' | 'startTagStart' | 'endTagStart' | 'selfClosedTagEnd' | 'tagEnd' | 'tagName'}
*/
/** @type {{ [lexem: string]: LexemType }} */
const Lexem = {
startTagStart: "startTagStart", // `<` in `<div>`
endTagStart: "endTagStart", // `</` in `</div>
selfClosedTagEnd: "selfClosedTagEnd", // `/>` in `<br />`
tagEnd: "tagEnd", // `>` in `<div>`
data: "data" // ` foo ` in `<div> foo </div>`
};
/**
* @typedef Lexem
* @type {object}
* @property {LexemType} type
* @property {string} [data]
*/
/**
* @param {string} input
* @returns {Lexem[]}
*/
const lexer = (input) => {
/** @type {Lexem[]} */
let result = [];
let i = 0;
let data = "";
const flushData = () => {
result.push({ type: "data", data });
data = "";
};
while (i < input.length) {
if (input[i] === "<") {
flushData();
if (input[i + 1] === "/") {
result.push({ type: Lexem.endTagStart });
i += 2;
} else {
result.push({ type: Lexem.startTagStart });
i += 1;
}
continue;
}
if (input[i] === "/" && input[i + 1] === ">") {
flushData();
result.push({ type: "selfClosedTagEnd" });
i += 2;
continue;
}
if (input[i] === ">") {
flushData();
result.push({ type: "tagEnd" });
i += 1;
continue;
}
data += input[i];
i += 1;
continue;
}
return result.filter((lexem) => lexem.type !== "data" || Boolean(lexem.data));
};
/**
* @typedef ASTNodeType
* @type {'data' | 'node' | 'selfClosedNode'}
*/
/**
* @typedef ASTNode
* @type {object}
* @property {ASTNodeType} type
* @property {ASTNode[]} [children]
* @property {string} [data]
*/
/**
* @typedef NiceLexems
* @type {{type: 'startTag'|'selfClosedTag'|'endTag'|'data', data?: string}[]}
*/
/**
* @param {Lexem[]} _lexems
* @param {NiceLexems} [__lexems]
* @returns {ASTNode[]}
*/
const parser = (_lexems, __lexems) => {
/** @type {NiceLexems} */
const lexems = __lexems || [];
let i = 0;
if (!lexems.length) {
while (i < _lexems.length) {
const lexem = _lexems[i];
if (lexem.type === "data") {
lexems.push({ type: "data", data: lexem.data });
i += 1;
continue;
}
if (lexem.type === "startTagStart") {
if (_lexems[i + 1].type === "data") {
if (_lexems[i + 2].type === "selfClosedTagEnd") {
lexems.push({ type: "selfClosedTag", data: _lexems[i + 1].data });
i += 3;
continue;
} else if (_lexems[i + 2].type === "tagEnd") {
lexems.push({ type: "startTag", data: _lexems[i + 1].data });
i += 3;
continue;
} else {
throw new Error("Failed to pre-parse");
}
}
}
if (lexem.type === "endTagStart") {
if (_lexems[i + 1].type === "data") {
if (_lexems[i + 2].type === "tagEnd") {
lexems.push({ type: "endTag", data: _lexems[i + 1].data });
i += 3;
continue;
}
}
}
throw new Error("You should not be here");
}
}
/** @type {ASTNode[]} */
let result = [];
i = 0;
while (i < lexems.length) {
const lexem0 = lexems[i];
if (lexem0.type === "data") {
result.push({ type: "data", data: lexem0.data });
i += 1;
continue;
} else if (lexem0.type === "selfClosedTag") {
result.push({ type: "selfClosedNode", data: lexem0.data });
i += 1;
continue;
} else if (lexem0.type === "startTag") {
let stack = [lexem0.data.trim()];
let j = i + 1;
let open = true;
while (open && j < lexems.length) {
if (lexems[j].type === "startTag") {
stack.push(lexems[j].data.trim());
j += 1;
} else if (
lexems[j].type === "endTag" &&
lexems[j].data.trim() === stack[stack.length - 1]
) {
stack.pop();
if (stack.length === 0) {
open = false;
result.push({
type: "node",
children: parser([], lexems.slice(i + 1, j)),
data: lexem0.data
});
i = j + 1;
continue;
} else {
j += 1;
continue;
}
} else {
j += 1;
continue;
}
}
if (stack.length) {
throw new Error(
`Unexpected non-closed tag \`${stack[stack.length - 1]}\``
);
}
} else {
throw new Error(
`Unexpected lexem ${lexem0}, 'data', 'selfClosedTag', or 'startTag' expected`
);
}
}
return result;
};
/**
* @param {string} input
* @returns {boolean}
*/
const validator = (input) => {
try {
parser(lexer(input));
return true;
} catch (error) {
return false;
}
};
module.exports.htmlValidator = validator;
module.exports.lexer = lexer;
module.exports.parser = parser;
// @ts-ignore
const util = require("util");
const { lexer, parser } = require("./");
const lexems = lexer("<div ><br/><div >asd <br/> kek</div></div >");
console.log(lexems);
const ast = parser(lexems);
console.log(util.inspect(ast, false, 5));
const { htmlValidator } = require("./");
/**
* @type {[string, boolean][]}
*/
const cases = [
["<tag>I love coding <Component />!</tag>", true],
["<div>", false],
["<div><div><div/></div></div>", true],
["test", true],
["<p/>", true],
["<div ></div>", true],
["<div></div></div>", false],
["<a><b></a></b>", false]
];
cases.forEach(([input, expected]) => {
const result = htmlValidator(input);
if (result !== expected) {
console.error(`FAIL: ${input} is "${result}", "${expected}" expected`);
}
});
{
"compilerOptions": {
"allowJs": true,
"checkJs": true,
"noImplicitAny": true,
"noEmit": true
},
"include": ["./index.js", "./test.js", "./smoke.js"]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment