Skip to content

Instantly share code, notes, and snippets.

@dan-dr
Created May 16, 2019 18:13
Show Gist options
  • Save dan-dr/01a87f2d1671a5e40306d0efec83beb1 to your computer and use it in GitHub Desktop.
Save dan-dr/01a87f2d1671a5e40306d0efec83beb1 to your computer and use it in GitHub Desktop.
Validate HTML and save to markdown table
"use strict"
const glob = require("glob")
const fs = require("fs")
/**
 * Turns a comma-separated list of names into a lookup table.
 *
 * Every non-empty segment of `str` becomes a key whose value is `true`.
 * Empty segments (e.g. from "a,,b" or a trailing comma) are skipped rather
 * than ending the conversion early. The table has no prototype, so names
 * such as "constructor" or "__proto__" are not reported as members unless
 * they were listed explicitly.
 *
 * @param {string} str - Comma-separated names, e.g. "br,hr,img".
 * @returns {Object<string, boolean>} Prototype-less object usable as `table[name]`.
 */
function unwrap(str) {
  const lookup = Object.create(null)
  for (const name of str.split(",")) {
    if (name) {
      lookup[name] = true
    }
  }
  return lookup
}
/**
 * Builds a validation result object with a formatted message.
 *
 * Placeholders of the form `{0}`, `{1}`, ... in `msg` are replaced with the
 * corresponding extra argument; a placeholder with no matching argument is
 * replaced with an empty string. A placeholder preceded by a backslash is
 * left untouched.
 *
 * @param {string} status - Short machine-readable error code.
 * @param {string} msg - Message template containing `{n}` placeholders.
 * @param {...*} args - Values substituted for the placeholders, by index.
 * @returns {{status: string, message: string}} The error descriptor.
 */
function ERROR(status, msg, ...args) {
  // A lookbehind (instead of consuming the preceding character) lets
  // adjacent placeholders such as "{0}{1}" both be substituted.
  const message = msg.replace(/(?<!\\)\{(\w+)\}/g, (match, index) => {
    const value = args[index]
    return value !== undefined ? value : ""
  })
  return {
    status,
    message
  }
}
// HTML 4 and 5 void tags: they never take an end tag, so ValidateHtml
// skips them instead of pushing them on its open-tag stack.
const voidTags = unwrap(
  "area,base,basefont,br,col,command,embed,frame,hr,img,input,keygen,link,meta,param,source,track,wbr"
)

// Tags whose content ValidateHtml does not parse: everything up to the
// matching end tag is skipped.
const singlelevel = unwrap("script,style")

// Regex source fragments shared by the tag patterns below.
const regxstr = {
  tagname: "[\\-A-Za-z0-9_:]+",
  attrname: "[\\w\\-]+",
  attrvalue: /(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+)/.source //quoted and unquoted strings
}

// Every pattern is anchored at the start of the remaining input and skips
// any text before the first "<". The tail uses [\s\S] rather than
// (?:.|\n) because "." does not match every line terminator (for example
// U+2028), which would silently cut the captured remainder short.
const regx = {
  // Start tag. Groups: 1 = tag name, 2 = leftover text before ">",
  // 3 = the rest of the input after the tag.
  opentag: new RegExp(
    `^[^<]*?<(${regxstr.tagname})` +
      `(?:\\s+${regxstr.attrname}(?:\\s*=\\s*${regxstr.attrvalue})?)*` +
      `([^>]*?)>([\\s\\S]*)`
  ),
  othertag: /^[^<]*?<([!\-\[\]\/A-Za-z0-9_:]+)([^>]*?)>([\s\S]*)/, //close tags, doctype, comments, cdata
  comment: /^[^<]*?<!--[\s\S]*?-->/,
  cdata: /^[^<]*?<!\[CDATA\[[\s\S]*?\]\]>/
}
/**
 * Checks that the tags of an HTML string are balanced and correctly nested.
 *
 * The input is consumed tag by tag. Comments, CDATA sections, doctype-style
 * declarations and void tags are skipped; the content of script/style
 * elements is skipped up to the matching end tag; every other start tag is
 * pushed on a stack and must be closed by the matching end tag in reverse
 * order. Scanning stops at the first problem found.
 *
 * @param {string} html - The HTML source to validate.
 * @returns {true|{status: string, message: string}} `true` when no problem
 *   was found, otherwise an error descriptor whose `status` is one of
 *   "CDATANotClosed", "CommentNotClosed", "MissingEndTag", "ExtraTag" or
 *   "WrongTag" and whose `message` names the offending line.
 */
function ValidateHtml(html) {
  // Counts the "\n" characters in a piece of text.
  const countNewlines = (text) => (text.match(/\n/g) || []).length

  let str = html.replace(/[\r]/g, "").trim()
  const stack = []
  // Line of the "<" of the current tag / line of its closing ">". Both are
  // kept equal between tags so the next tag's line can be derived from them.
  let tagStartLineNumber = 1
  let tagEndLineNumber = 1
  let broken = ""

  // Replacer that drops a comment/CDATA section and advances the counters
  // by the line breaks found from its "<" onwards.
  const replaceComment = (m) => {
    tagEndLineNumber =
      tagStartLineNumber + countNewlines(m.slice(m.indexOf("<")))
    //prepare for the next tag.
    tagStartLineNumber = tagEndLineNumber
    return ""
  }
  // Replacer that drops script/style content plus its end tag and advances
  // the counters by the line breaks it contained.
  const replaceSingleLevel = (m) => {
    tagEndLineNumber = tagStartLineNumber + countNewlines(m)
    //prepare for the next tag.
    tagStartLineNumber = tagEndLineNumber
    return ""
  }

  while (str) {
    const matches = str.match(regx.opentag) || str.match(regx.othertag)
    if (!matches) {
      const pos = str.indexOf("<")
      if (pos < 0) {
        break
      }
      // A "<" that does not start a tag: step over it, moving both line
      // counters so they stay in sync for the next tag.
      const skippedLines = countNewlines(str.slice(0, pos))
      tagStartLineNumber += skippedLines
      tagEndLineNumber += skippedLines
      str = str.slice(pos + 1)
      continue
    }

    const rawTag = matches[1]
    let tag = rawTag.toLowerCase() //html is case insensitive
    const whole = str
    // Length of the text consumed by this tag (everything up to its ">").
    const consumedLength = matches[0].length - matches[3].length
    tagStartLineNumber += countNewlines(whole.substring(0, whole.indexOf("<")))
    tagEndLineNumber += countNewlines(whole.slice(0, consumedLength))
    const lineNumber = tagStartLineNumber
    // Slice the remainder from the input itself instead of trusting the tail
    // capture, so nothing is dropped if that capture stops short.
    str = whole.slice(consumedLength)

    //Identify close tag
    const isCloseTag = tag[0] === "/"
    if (isCloseTag) {
      tag = tag.slice(1)
    }

    //Do something per tag
    if (tag[0] === "!") {
      //Either doctype or comment, so ignore them
      if (tag.indexOf("![cdata[") === 0) {
        if (!regx.cdata.test(whole)) {
          broken = ERROR(
            "CDATANotClosed",
            "Line {0}: CDATA section not closed properly.",
            lineNumber
          )
          break
        }
        str = whole.replace(regx.cdata, replaceComment)
      } else if (tag.indexOf("!--") === 0) {
        if (!regx.comment.test(whole)) {
          broken = ERROR(
            "CommentNotClosed",
            "Line {0}: HTML comment not closed properly.",
            lineNumber
          )
          break
        }
        str = whole.replace(regx.comment, replaceComment)
      } else {
        // Doctype or similar declaration: resync so a declaration spanning
        // several lines does not shift later line numbers.
        tagStartLineNumber = tagEndLineNumber
      }
      continue
    }

    if (voidTags[tag]) {
      // Resync so a void tag spanning several lines does not shift later
      // line numbers.
      tagStartLineNumber = tagEndLineNumber
      continue
    }

    if (singlelevel[tag]) {
      //prepare for counting the \n between start of tag and end angle bracket of end tag
      tagStartLineNumber = tagEndLineNumber
      //remove everything upto end tag; case-insensitive so that an end tag
      //written as </SCRIPT> is both detected and actually removed
      const specialEndTagRegex = new RegExp(
        `^([\\s\\S]*?)</${tag}[^>]*>`,
        "i"
      )
      if (!specialEndTagRegex.test(str)) {
        broken = ERROR(
          "MissingEndTag",
          "Line {0}: {1} start tag missing corresponding end tag.",
          lineNumber,
          `<${tag}>`
        )
        break
      }
      str = str.replace(specialEndTagRegex, replaceSingleLevel)
      continue
    }

    if (isCloseTag) {
      if (stack.length === 0) {
        // End tag with nothing open at all.
        broken = ERROR(
          "ExtraTag",
          "Line {0}: Extra end tag found: {1}",
          lineNumber,
          `<${rawTag}>`
        )
        break
      }
      const open = stack[stack.length - 1]
      if (open.tag !== tag) {
        // Mismatch: either this tag was never opened, or an inner tag is
        // still open and should have been closed first.
        const wasOpened = stack.some((o) => o.tag === tag)
        if (wasOpened) {
          broken = ERROR(
            "WrongTag",
            "Line {0}: {1} start tag from line {2} should be closed before {3}.",
            lineNumber,
            `<${open.tag}>`,
            open.line,
            `<${rawTag}>`
          )
        } else {
          broken = ERROR(
            "ExtraTag",
            "Line {0}: Extra end tag found: {1}",
            lineNumber,
            `<${rawTag}>`
          )
        }
        break
      }
      stack.pop()
    } else {
      stack.push({
        tag,
        line: lineNumber
      })
    }

    //Prepare for next tag.
    tagStartLineNumber = tagEndLineNumber
  }

  if (!broken && stack.length > 0) {
    // Input ended while tags were still open: report the innermost one.
    const unclosed = stack[stack.length - 1]
    broken = ERROR(
      "MissingEndTag",
      "Line {0}: {1} start tag missing corresponding end tag.",
      unclosed.line,
      `<${unclosed.tag}>`
    )
  }
  return broken || true
}
/**
 * Renders an array of flat objects as a Markdown table.
 *
 * The column list comes from `columns` when given, otherwise from the keys
 * of the first item. Every data row is built from that same column list, so
 * rows always line up with the header; a column an item does not have is
 * rendered as an empty cell. Lines are terminated with "\r\n".
 *
 * @param {Object[]} array - The rows to render.
 * @param {string} [columns] - Comma-separated column names; when omitted
 *   (or falsy) the keys of `array[0]` are used.
 * @returns {string} The table, or "" when there are no rows and no
 *   `columns` to derive a header from.
 */
function jsonToMarkdownTable (array, columns) {
  let cols
  if (columns) {
    cols = columns.split(",")
  } else if (array.length > 0) {
    cols = Object.keys(array[0])
  } else {
    // Nothing to derive a header from.
    return ""
  }

  const lines = [
    cols.join(" | "),
    cols.map(() => '---').join(' | ')
  ]
  for (const item of array) {
    const cells = cols.map((key) =>
      Object.prototype.hasOwnProperty.call(item, key) ? String(item[key]) : ""
    )
    lines.push(cells.join(" | "))
  }
  return lines.map((line) => `${line}\r\n`).join("")
}
/**
 * Escapes the characters &, >, < and " in a string as HTML entities.
 *
 * @param {string} text - The text to escape.
 * @returns {string} The text with those four characters replaced.
 */
function html_escape(text){
  // "&" goes first so the ampersands of the other entities are not re-escaped.
  const substitutions = [
    [/&/g, '&amp;'],
    [/>/g, '&gt;'],
    [/</g, '&lt;'],
    [/"/g, '&quot;']
  ]
  return substitutions.reduce(
    (escaped, [pattern, entity]) => escaped.replace(pattern, entity),
    text
  )
}
// Validates every .html file below this script's directory (node_modules
// excluded) and writes the files that failed validation to badhtml.md as an
// HTML-escaped Markdown table.
glob(`${__dirname}/**/*.html`, {ignore: [`**/node_modules/**`]}, async (err, files) => {
  try {
    if (err) {
      throw err
    }
    const results = await Promise.all(
      files.map(async (f) => {
        const data = await fs.promises.readFile(f, 'utf-8')
        // ValidateHtml returns `true` for valid input; spreading it adds no
        // keys, so valid files end up without a `status`.
        return { file: f.replace(__dirname, ''), ...ValidateHtml(data) }
      })
    )
    const failures = results.filter(({status}) => status)
    // The columns are named explicitly so the header can still be produced
    // when no file failed.
    fs.writeFileSync('badhtml.md', html_escape(jsonToMarkdownTable(failures, 'file,status,message')))
  } catch (error) {
    // Report glob/read/write failures instead of leaving an unhandled
    // rejection, and signal failure through the exit code.
    console.error(error)
    process.exitCode = 1
  }
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment