Created
May 16, 2019 18:13
-
-
Save dan-dr/01a87f2d1671a5e40306d0efec83beb1 to your computer and use it in GitHub Desktop.
Validate HTML and save to markdown table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"use strict" | |
const glob = require("glob") | |
const fs = require("fs") | |
/**
 * Turns a comma-separated list of names into a lookup table.
 *
 * The table has no prototype, so inherited property names such as
 * "constructor" or "toString" are never reported as members.
 *
 * @param {string} str - Comma-separated names, e.g. "br,hr,img".
 * @returns {Object<string, boolean>} Table mapping every non-empty name to `true`.
 */
function unwrap(str) {
  const o = Object.create(null)
  for (const val of str.split(",")) {
    // Skip empty segments ("a,,b") instead of aborting the whole scan.
    if (val) {
      o[val] = true
    }
  }
  return o
}
/**
 * Builds an error descriptor, filling `{n}` placeholders in the message.
 *
 * `{0}` is replaced by the first extra argument, `{1}` by the second, and so
 * on. A placeholder preceded by a backslash (`\{0}`) is left untouched.
 * Placeholders without a matching argument (or with a non-numeric name) are
 * replaced by the empty string.
 *
 * @param {string} status - Short machine-readable error code.
 * @param {string} msg - Message template containing `{n}` placeholders.
 * @param {...*} args - Values substituted into the template by position.
 * @returns {{status: string, message: string}} The formatted error descriptor.
 */
function ERROR(status, msg, ...args) {
  // A lookbehind keeps the preceding character out of the match, so adjacent
  // placeholders such as "{0}{1}" are both substituted.
  const message = msg.replace(/(?<!\\)\{(\w+)\}/g, (match, key) => {
    // Only numeric names index into args; this keeps "{length}" from
    // resolving to the argument array's own length.
    const value = /^\d+$/.test(key) ? args[Number(key)] : undefined
    return value !== undefined ? String(value) : ""
  })
  return {
    status,
    message
  }
}
// HTML 4 and 5 void tags: elements that never take an end tag.
const voidTags = unwrap(
  "area,base,basefont,br,col,command,embed,frame,hr,img,input,keygen,link,meta,param,source,track,wbr"
)

// Elements whose content is skipped up to their end tag instead of being
// parsed as markup.
const singlelevel = unwrap("script,style")

// Regex source fragments shared by the tag patterns below.
const regxstr = {
  tagname: "[\\-A-Za-z0-9_:]+",
  attrname: "[\\w\\-]+",
  attrvalue: /(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+)/.source //quoted and unquoted strings
}

// All patterns are anchored at the start of the remaining input and skip any
// text before the first "<". The "rest of input" parts use [\s\S] rather than
// (?:.|\n) because "." does not match U+2028/U+2029, which would cut the
// capture short at such a character.
const regx = {
  // Start tag. Groups: 1 = tag name, 2 = leftover text before ">",
  // 3 = everything after the tag.
  opentag: new RegExp(
    `^[^<]*?<(${regxstr.tagname})` +
      `(?:\\s+${regxstr.attrname}(?:\\s*=\\s*${regxstr.attrvalue})?)*` +
      `([^>]*?)>([\\s\\S]*)`
  ),
  // Close tags, doctype, comments, cdata. Same group layout as opentag.
  othertag: /^[^<]*?<([!\-\[\]\/A-Za-z0-9_:]+)([^>]*?)>([\s\S]*)/,
  // A complete comment, up to and including the first "-->".
  comment: /^[^<]*?<!--[\s\S]*?-->/,
  // A complete CDATA section, up to and including the first "]]>".
  cdata: /^[^<]*?<!\[CDATA\[[\s\S]*?\]\]>/
}
/**
 * Checks that the tags in an HTML string are balanced and properly nested.
 *
 * The input is scanned tag by tag using the `regx` patterns. Comments, CDATA
 * sections and other "<!...>" declarations are skipped, tags listed in
 * `voidTags` are ignored, and the content of tags listed in `singlelevel`
 * is skipped up to the matching end tag. Every other start tag is pushed on
 * a stack and must be closed by the next unmatched end tag.
 *
 * Limitations: every non-void start tag needs an explicit end tag, and a
 * trailing "/" (self-closing syntax) is not treated as closing the element.
 *
 * @param {string} html - The markup to check.
 * @returns {true|{status: string, message: string}} `true` when no problem is
 *   found, otherwise the first problem as built by `ERROR` (status is one of
 *   "CDATANotClosed", "CommentNotClosed", "MissingEndTag", "ExtraTag",
 *   "WrongTag"; the message includes the 1-based line number).
 */
function ValidateHtml(html) {
  const countNewlines = (text) => (text.match(/\n/g) || []).length
  // Own-property lookup, so tag names like "constructor" never hit Object.prototype.
  const inSet = (set, name) => Object.prototype.hasOwnProperty.call(set, name)

  // Leading whitespace is kept on purpose: trimming it would shift every
  // reported line number when the document starts with blank lines.
  let str = html.replace(/[\r]/g, "")
  // 1-based line number of the first character still held in `str`.
  let line = 1
  let broken = ""
  // Currently open elements, innermost last: { tag, line }.
  const stack = []

  while (str) {
    const matches = str.match(regx.opentag) || str.match(regx.othertag)
    if (!matches) {
      const pos = str.indexOf("<")
      if (pos < 0) {
        break
      }
      // A "<" that does not start a tag: step over it and keep scanning.
      line += countNewlines(str.slice(0, pos))
      str = str.slice(pos + 1)
      continue
    }

    const rawTag = matches[1]
    const isCloseTag = rawTag[0] === "/"
    const tag = (isCloseTag ? rawTag.slice(1) : rawTag).toLowerCase() //html is case insensitive
    // Text from the start of `str` through the ">" that ends this tag. Derived
    // from match lengths so nothing is lost if group 3 stops short of the end.
    const consumed = matches[0].slice(0, matches[0].length - matches[3].length)
    const lineNumber =
      line + countNewlines(consumed.slice(0, consumed.indexOf("<")))

    if (tag[0] === "!") {
      const isCdata = tag.indexOf("![cdata[") === 0
      const isComment = !isCdata && tag.indexOf("!--") === 0
      if (isCdata || isComment) {
        // The first ">" may sit inside the section, so re-match the whole
        // section and skip all of it.
        const section = str.match(isCdata ? regx.cdata : regx.comment)
        if (!section) {
          broken = isCdata
            ? ERROR(
                "CDATANotClosed",
                "Line {0}: CDATA section not closed properly.",
                lineNumber
              )
            : ERROR(
                "CommentNotClosed",
                "Line {0}: HTML comment not closed properly.",
                lineNumber
              )
          break
        }
        line += countNewlines(section[0])
        str = str.slice(section[0].length)
      } else {
        // Doctype or another declaration: ignore it.
        line += countNewlines(consumed)
        str = str.slice(consumed.length)
      }
      continue
    }

    // Every remaining path consumes exactly the tag itself first.
    line += countNewlines(consumed)
    str = str.slice(consumed.length)

    if (inSet(voidTags, tag)) {
      continue
    }

    if (!isCloseTag && inSet(singlelevel, tag)) {
      // Skip everything up to and including the end tag. The match is
      // case-insensitive so "</SCRIPT>" closes "<script>".
      const endTagRegex = new RegExp(`^[\\s\\S]*?</${tag}[^>]*>`, "i")
      const rawContent = str.match(endTagRegex)
      if (!rawContent) {
        broken = ERROR(
          "MissingEndTag",
          "Line {0}: {1} start tag missing corresponding end tag.",
          lineNumber,
          `<${tag}>`
        )
        break
      }
      line += countNewlines(rawContent[0])
      str = str.slice(rawContent[0].length)
      continue
    }

    if (!isCloseTag) {
      stack.push({
        tag,
        line: lineNumber
      })
      continue
    }

    const last = stack[stack.length - 1]
    if (!last) {
      // End tag with nothing open.
      broken = ERROR(
        "ExtraTag",
        "Line {0}: Extra end tag found: {1}",
        lineNumber,
        `<${rawTag}>`
      )
      break
    }
    if (last.tag !== tag) {
      // If the element is open further out, the innermost element was left
      // unclosed; otherwise this end tag matches nothing at all.
      const isOpenFurtherOut = stack.some((o) => o.tag === tag)
      broken = isOpenFurtherOut
        ? ERROR(
            "WrongTag",
            "Line {0}: {1} start tag from line {2} should be closed before {3}.",
            lineNumber,
            `<${last.tag}>`,
            last.line,
            `<${rawTag}>`
          )
        : ERROR(
            "ExtraTag",
            "Line {0}: Extra end tag found: {1}",
            lineNumber,
            `<${rawTag}>`
          )
      break
    }
    stack.pop()
  }

  if (!broken && stack.length > 0) {
    const unclosed = stack[stack.length - 1]
    broken = ERROR(
      "MissingEndTag",
      "Line {0}: {1} start tag missing corresponding end tag.",
      unclosed.line,
      `<${unclosed.tag}>`
    )
  }
  return broken ? broken : true
}
/**
 * Renders an array of flat objects as a Markdown table.
 *
 * @param {Object[]} array - Rows to render.
 * @param {string} [columns] - Comma-separated property names to use as
 *   columns. When omitted, the keys of the first row are used.
 * @returns {string} The table with "\r\n" after every line; an empty string
 *   when there are no rows and no explicit columns.
 */
function jsonToMarkdownTable (array, columns) {
  const rows = array || []
  if (!columns && rows.length === 0) {
    // No rows to derive a header from.
    return ""
  }
  const cols = columns
    ? columns.split(",")
    : Object.keys(rows[0])
  // Cells follow the header columns, so every row lines up with the header
  // even when `columns` is a subset or reordering of the item's keys.
  const renderRow = (item) =>
    cols
      .map((key) => (item[key] === undefined ? "" : String(item[key])))
      .join(" | ")
  const lines = [
    cols.join(" | "),
    cols.map(() => "---").join(" | "),
    ...rows.map(renderRow)
  ]
  return lines.map((text) => `${text}\r\n`).join("")
}
/**
 * Escapes the characters &, >, < and " as HTML entities.
 *
 * @param {string} text - Text to escape.
 * @returns {string} The text with those four characters replaced by entities.
 */
function html_escape(text){
  // "&" goes first so the ampersands of the entities added below are not
  // escaped a second time.
  return text
    .replace(/&/g, '&amp;')
    .replace(/>/g, '&gt;')
    .replace(/</g, '&lt;')
    .replace(/"/g, '&quot;')
}
// Validates every .html file below this script's directory (node_modules
// excluded) and writes the failures to badhtml.md as an HTML-escaped
// Markdown table.
glob(`${__dirname}/**/*.html`, {ignore: [`**/node_modules/**`]}, async (err, files) => {
  if (err) {
    console.error(err)
    process.exitCode = 1
    return
  }
  try {
    const results = await Promise.all(
      files.map(
        (f) =>
          new Promise((resolve, reject) => {
            fs.readFile(f, 'utf-8', (readErr, data) => {
              if (readErr) {
                reject(readErr)
                return
              }
              try {
                // ValidateHtml returns `true` for valid files; spreading it adds
                // no properties, so only failures carry a `status`.
                resolve({ file: f.replace(__dirname, '') , ...ValidateHtml(data) })
              } catch (error) {
                // Thrown inside a callback: route it into the promise instead
                // of crashing the process.
                reject(error)
              }
            })
          })
      )
    )
    const failures = results.filter(({status}) => status)
    // Guarded so an all-valid run produces an empty report.
    const table = failures.length > 0 ? jsonToMarkdownTable(failures) : ''
    fs.writeFileSync('badhtml.md', html_escape(table))
  } catch (error) {
    console.error(error)
    process.exitCode = 1
  }
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment