Last active
October 20, 2020 07:03
-
-
Save 102/dc9149ecda01e45090a8e5b8a915e7e7 to your computer and use it in GitHub Desktop.
pseudo-HTML Lexer and Parser (without attributes support)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* @typedef LexemType | |
* @type {'data' | 'startTagStart' | 'endTagStart' | 'selfClosedTagEnd' | 'tagEnd' | 'tagName'} | |
*/ | |
/** @type {{ [lexem: string]: LexemType }} */ | |
const Lexem = { | |
startTagStart: "startTagStart", // `<` in `<div>` | |
endTagStart: "endTagStart", // `</` in `</div> | |
selfClosedTagEnd: "selfClosedTagEnd", // `/>` in `<br />` | |
tagEnd: "tagEnd", // `>` in `<div>` | |
data: "data" // ` foo ` in `<div> foo </div>` | |
}; | |
/** | |
* @typedef Lexem | |
* @type {object} | |
* @property {LexemType} type | |
* @property {string} [data] | |
*/ | |
/** | |
* @param {string} input | |
* @returns {Lexem[]} | |
*/ | |
const lexer = (input) => { | |
/** @type {Lexem[]} */ | |
let result = []; | |
let i = 0; | |
let data = ""; | |
const flushData = () => { | |
result.push({ type: "data", data }); | |
data = ""; | |
}; | |
while (i < input.length) { | |
if (input[i] === "<") { | |
flushData(); | |
if (input[i + 1] === "/") { | |
result.push({ type: Lexem.endTagStart }); | |
i += 2; | |
} else { | |
result.push({ type: Lexem.startTagStart }); | |
i += 1; | |
} | |
continue; | |
} | |
if (input[i] === "/" && input[i + 1] === ">") { | |
flushData(); | |
result.push({ type: "selfClosedTagEnd" }); | |
i += 2; | |
continue; | |
} | |
if (input[i] === ">") { | |
flushData(); | |
result.push({ type: "tagEnd" }); | |
i += 1; | |
continue; | |
} | |
data += input[i]; | |
i += 1; | |
continue; | |
} | |
return result.filter((lexem) => lexem.type !== "data" || Boolean(lexem.data)); | |
}; | |
/** | |
* @typedef ASTNodeType | |
* @type {'data' | 'node' | 'selfClosedNode'} | |
*/ | |
/** | |
* @typedef ASTNode | |
* @type {object} | |
* @property {ASTNodeType} type | |
* @property {ASTNode[]} [children] | |
* @property {string} [data] | |
*/ | |
/** | |
* @typedef NiceLexems | |
* @type {{type: 'startTag'|'selfClosedTag'|'endTag'|'data', data?: string}[]} | |
*/ | |
/** | |
* @param {Lexem[]} _lexems | |
* @param {NiceLexems} [__lexems] | |
* @returns {ASTNode[]} | |
*/ | |
const parser = (_lexems, __lexems) => { | |
/** @type {NiceLexems} */ | |
const lexems = __lexems || []; | |
let i = 0; | |
if (!lexems.length) { | |
while (i < _lexems.length) { | |
const lexem = _lexems[i]; | |
if (lexem.type === "data") { | |
lexems.push({ type: "data", data: lexem.data }); | |
i += 1; | |
continue; | |
} | |
if (lexem.type === "startTagStart") { | |
if (_lexems[i + 1].type === "data") { | |
if (_lexems[i + 2].type === "selfClosedTagEnd") { | |
lexems.push({ type: "selfClosedTag", data: _lexems[i + 1].data }); | |
i += 3; | |
continue; | |
} else if (_lexems[i + 2].type === "tagEnd") { | |
lexems.push({ type: "startTag", data: _lexems[i + 1].data }); | |
i += 3; | |
continue; | |
} else { | |
throw new Error("Failed to pre-parse"); | |
} | |
} | |
} | |
if (lexem.type === "endTagStart") { | |
if (_lexems[i + 1].type === "data") { | |
if (_lexems[i + 2].type === "tagEnd") { | |
lexems.push({ type: "endTag", data: _lexems[i + 1].data }); | |
i += 3; | |
continue; | |
} | |
} | |
} | |
throw new Error("You should not be here"); | |
} | |
} | |
/** @type {ASTNode[]} */ | |
let result = []; | |
i = 0; | |
while (i < lexems.length) { | |
const lexem0 = lexems[i]; | |
if (lexem0.type === "data") { | |
result.push({ type: "data", data: lexem0.data }); | |
i += 1; | |
continue; | |
} else if (lexem0.type === "selfClosedTag") { | |
result.push({ type: "selfClosedNode", data: lexem0.data }); | |
i += 1; | |
continue; | |
} else if (lexem0.type === "startTag") { | |
let stack = [lexem0.data.trim()]; | |
let j = i + 1; | |
let open = true; | |
while (open && j < lexems.length) { | |
if (lexems[j].type === "startTag") { | |
stack.push(lexems[j].data.trim()); | |
j += 1; | |
} else if ( | |
lexems[j].type === "endTag" && | |
lexems[j].data.trim() === stack[stack.length - 1] | |
) { | |
stack.pop(); | |
if (stack.length === 0) { | |
open = false; | |
result.push({ | |
type: "node", | |
children: parser([], lexems.slice(i + 1, j)), | |
data: lexem0.data | |
}); | |
i = j + 1; | |
continue; | |
} else { | |
j += 1; | |
continue; | |
} | |
} else { | |
j += 1; | |
continue; | |
} | |
} | |
if (stack.length) { | |
throw new Error( | |
`Unexpected non-closed tag \`${stack[stack.length - 1]}\`` | |
); | |
} | |
} else { | |
throw new Error( | |
`Unexpected lexem ${lexem0}, 'data', 'selfClosedTag', or 'startTag' expected` | |
); | |
} | |
} | |
return result; | |
}; | |
/** | |
* @param {string} input | |
* @returns {boolean} | |
*/ | |
const validator = (input) => { | |
try { | |
parser(lexer(input)); | |
return true; | |
} catch (error) { | |
return false; | |
} | |
}; | |
module.exports.htmlValidator = validator; | |
module.exports.lexer = lexer; | |
module.exports.parser = parser; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// @ts-ignore | |
const util = require("util"); | |
const { lexer, parser } = require("./"); | |
const lexems = lexer("<div ><br/><div >asd <br/> kek</div></div >"); | |
console.log(lexems); | |
const ast = parser(lexems); | |
console.log(util.inspect(ast, false, 5)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { htmlValidator } = require("./"); | |
/** | |
* @type {[string, boolean][]} | |
*/ | |
const cases = [ | |
["<tag>I love coding <Component />!</tag>", true], | |
["<div>", false], | |
["<div><div><div/></div></div>", true], | |
["test", true], | |
["<p/>", true], | |
["<div ></div>", true], | |
["<div></div></div>", false], | |
["<a><b></a></b>", false] | |
]; | |
cases.forEach(([input, expected]) => { | |
const result = htmlValidator(input); | |
if (result !== expected) { | |
console.error(`FAIL: ${input} is "${result}", "${expected}" expected`); | |
} | |
}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Show hidden characters
{ | |
"compilerOptions": { | |
"allowJs": true, | |
"checkJs": true, | |
"noImplicitAny": true, | |
"noEmit": true | |
}, | |
"include": ["./index.js", "./test.js", "./smoke.js"] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment