dan-dr · May 16, 2019 18:13
diff --git a/badhtml.js b/badhtml.js
 "use strict"

 const glob = require("glob")
 const fs = require("fs")

 /**
  * Builds a membership table from a comma-separated list of names.
  *
  * @param {string} str - Names separated by "," (no surrounding whitespace is stripped).
  * @returns {Object<string, true>} Object whose keys are the listed names, each mapped to `true`.
  */
 function unwrap(str) {
  // No prototype, so inherited names such as "constructor" or "__proto__"
  // are never mistaken for listed entries when callers do `table[name]`.
  const lookup = Object.create(null)
  for (const name of str.split(",")) {
    // Skip empty entries ("a,,b", trailing comma) instead of stopping at them.
    if (name) {
      lookup[name] = true
    }
  }
  return lookup
 }

 /**
  * Builds an error record, filling `{n}` placeholders in the message.
  *
  * A placeholder `{n}` is replaced by the n-th extra argument (0-based); a
  * placeholder with no corresponding argument is replaced by an empty string.
  * A placeholder preceded by a backslash is left untouched.
  *
  * @param {string} status - Short error code stored as `status`.
  * @param {string} msg - Message template containing `{0}`, `{1}`, ... placeholders.
  * @param {...*} args - Values substituted into the template.
  * @returns {{status: string, message: string}} The error record.
  */
 function ERROR(status, msg, ...args) {
  // The lookbehind checks for an escaping backslash without consuming the
  // preceding character, so adjacent placeholders ("{0}{1}") are all replaced.
  const message = msg.replace(/(?<!\\)\{(\w+)\}/g, (match, index) => {
    const value = args[index]
    return value !== undefined ? value : ""
  })
  return {
    status,
    message
  }
 }

 // Tags that are skipped entirely by the validator (HTML 4 and HTML 5 void tags).
 const voidTags = unwrap(
  "area,base,basefont,br,col,command,embed,frame,hr,img,input,keygen,link,meta,param,source,track,wbr"
 )

 // Tags whose content is skipped up to their own end tag instead of being parsed.
 const singlelevel = unwrap("script,style")

 // Regex source fragments used to assemble the start-tag pattern.
 const regxstr = {
  tagname: "[\\-A-Za-z0-9_:]+",
  attrname: "[\\w\\-]+",
  // Double-quoted, single-quoted, or unquoted attribute value.
  attrvalue: /(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+)/.source
 }

 // Patterns applied to the front of the remaining input. `opentag` and
 // `othertag` capture: (1) the tag name, (2) leftover text before ">",
 // (3) everything after the tag.
 const regx = {
  // Start tag: name followed by any number of well-formed attributes.
  opentag: new RegExp(
    [
      `^[^<]*?<(${regxstr.tagname})`,
      `(?:\\s+${regxstr.attrname}(?:\\s*=\\s*${regxstr.attrvalue})?)*`,
      `([^>]*?)>((?:.|\\n)*)`
    ].join("")
  ),
  // Close tags, doctype, comments, cdata.
  othertag: /^[^<]*?<([!\-\[\]\/A-Za-z0-9_:]+)([^>]*?)>((?:.|\n)*)/,
  // A complete comment at the front of the input.
  comment: /^[^<]*?<!--(?:.|\n)*?-->/,
  // A complete CDATA section at the front of the input.
  cdata: /^[^<]*?<!\[CDATA\[(?:.|\n)*?\]\]>/
 }

 /**
  * Checks that the tags of an HTML document are properly nested and closed.
  *
  * The input is consumed from the front, one tag at a time. Comments, CDATA
  * sections and other "<!...>" declarations are skipped, tags listed in
  * `voidTags` are ignored, and for tags listed in `singlelevel` everything up
  * to their end tag is skipped. Every other start tag is pushed on a stack and
  * must be closed by a matching end tag in reverse order.
  *
  * Possible `status` values of a reported problem: "CDATANotClosed",
  * "CommentNotClosed", "MissingEndTag", "ExtraTag", "WrongTag".
  *
  * @param {string} html - HTML source text.
  * @returns {true|{status: string, message: string}} `true` when no problem is
  *   found, otherwise the first problem found, as built by ERROR().
  */
 function ValidateHtml(html) {
  // Number of "\n" characters in a piece of text.
  const countNewlines = (text) => (text.match(/\n/g) || []).length
  // Own-property lookup, so tag names like "constructor" never hit Object.prototype.
  const isListed = (table, name) =>
    Object.prototype.hasOwnProperty.call(table, name) && Boolean(table[name])

  const source = html.replace(/[\r]/g, "")
  let str = source.trim()
  // `line` is always the 1-based line on which `str` begins. trim() removes
  // leading blank lines, so they are counted here to keep reported line
  // numbers aligned with the actual file.
  let line = 1 + countNewlines(source.match(/^\s*/)[0])
  const stack = []
  let broken = ""

  while (str) {
    const matches = str.match(regx.opentag) || str.match(regx.othertag)
    if (!matches) {
      const pos = str.indexOf("<")
      if (pos < 0) {
        break
      }
      // A "<" that does not start a tag: step over it, still counting lines.
      line += countNewlines(str.substr(0, pos))
      str = str.substr(pos + 1)
      continue
    }

    const rawTag = matches[1]
    const isCloseTag = rawTag[0] === "/"
    //html is case insensitive
    const tag = (isCloseTag ? rawTag.substr(1) : rawTag).toLowerCase()

    // Both tag patterns are anchored at the start of `str` and end with a
    // "rest of input" capture, so the match minus that capture is exactly the
    // text up to and including this tag's ">". Slicing `str` by that length
    // (rather than continuing with the capture) keeps any input the capture
    // could not span, e.g. text after a U+2028 line separator.
    const consumedLength = matches[0].length - matches[3].length
    const consumed = str.substr(0, consumedLength)
    const lineNumber = line + countNewlines(str.substr(0, str.indexOf("<")))

    if (tag[0] === "!") {
      // Comment, CDATA, or another declaration such as a doctype.
      let section = null
      if (tag.indexOf("![cdata[") === 0) {
        section = str.match(regx.cdata)
        if (!section) {
          broken = ERROR(
            "CDATANotClosed",
            "Line {0}: CDATA section not closed properly.",
            lineNumber
          )
          break
        }
      } else if (tag.indexOf("!--") === 0) {
        section = str.match(regx.comment)
        if (!section) {
          broken = ERROR(
            "CommentNotClosed",
            "Line {0}: HTML comment not closed properly.",
            lineNumber
          )
          break
        }
      }
      // Comments/CDATA are skipped through their terminator; any other
      // declaration is skipped through its ">".
      const skipped = section ? section[0] : consumed
      line += countNewlines(skipped)
      str = str.substr(skipped.length)
      continue
    }

    // From here on the tag itself is consumed.
    line += countNewlines(consumed)
    str = str.substr(consumedLength)

    if (isListed(voidTags, tag)) {
      continue
    }

    if (isListed(singlelevel, tag)) {
      //remove everything upto end tag. Case-insensitive so that an end tag
      //written as </SCRIPT> is found AND consumed.
      const specialEndTagRegex = new RegExp(`^[\\s\\S]*?</${tag}[^>]*>`, "i")
      const skipped = str.match(specialEndTagRegex)
      if (!skipped) {
        broken = ERROR(
          "MissingEndTag",
          "Line {0}: {1} start tag missing corresponding end tag.",
          lineNumber,
          `<${tag}>`
        )
        break
      }
      line += countNewlines(skipped[0])
      str = str.substr(skipped[0].length)
      continue
    }

    if (!isCloseTag) {
      stack.push({
        tag,
        line: lineNumber
      })
      continue
    }

    if (stack.length === 0) {
      broken = ERROR(
        "ExtraTag",
        "Line {0}: Extra end tag found: {1}",
        lineNumber,
        `<${rawTag}>`
      )
      break
    }

    const last = stack[stack.length - 1]
    if (last.tag !== tag) {
      // If the tag is open further up the stack, the innermost open tag was
      // left unclosed; otherwise this end tag has no start tag at all.
      if (stack.some((o) => o.tag === tag)) {
        broken = ERROR(
          "WrongTag",
          "Line {0}: {1} start tag from line {2} should be closed before {3}.",
          lineNumber,
          `<${last.tag}>`,
          last.line,
          `<${rawTag}>`
        )
      } else {
        broken = ERROR(
          "ExtraTag",
          "Line {0}: Extra end tag found: {1}",
          lineNumber,
          `<${rawTag}>`
        )
      }
      break
    }
    stack.pop()
  }

  if (!broken && stack.length > 0) {
    const last = stack[stack.length - 1]
    broken = ERROR(
      "MissingEndTag",
      "Line {0}: {1} start tag missing corresponding end tag.",
      last.line,
      `<${last.tag}>`
    )
  }
  return broken ? broken : true
 }

 /**
  * Renders an array of flat objects as a markdown table.
  *
  * @param {Object[]} array - Rows; each cell is `String(item[column])`.
  * @param {string} [columns] - Comma-separated column names. When omitted, the
  *   keys of the first row are used.
  * @returns {string} Header line, separator line and one line per row, each
  *   terminated by "\r\n"; an empty string when there are no columns at all
  *   (e.g. empty `array` and no `columns`).
  */
 function jsonToMarkdownTable (array, columns) {
  const cols = columns
    ? columns.split(",")
    : Object.keys(array[0] || {}) // tolerate an empty array

  if (cols.length === 0) {
    return ""
  }

  const lines = [
    cols.join(" | "),
    cols.map(() => "---").join(" | "),
    // Cells follow `cols`, so rows always line up with the header.
    ...array.map((item) => cols.map((key) => String(item[key])).join(" | "))
  ]

  return lines.map((line) => `${line}\r\n`).join("")
 }

 /**
  * Replaces the characters &, >, < and " with their HTML entities.
  *
  * @param {string} text - Text to escape.
  * @returns {string} The escaped text.
  */
 function html_escape(text){
  // "&" must stay first so the entities produced by later steps are not re-escaped.
  const replacements = [
    [/&/g, '&amp;'],
    [/>/g, '&gt;'],
    [/</g, '&lt;'],
    [/"/g, '&quot;']
  ]
  return replacements.reduce(
    (escaped, [pattern, entity]) => escaped.replace(pattern, entity),
    text
  )
 }

 // Entry point: validate every .html file below this script's directory
 // (node_modules excluded) and write the files that failed validation to
 // badhtml.md as an HTML-escaped markdown table.
 glob(`${__dirname}/**/*.html`, {ignore: [`**/node_modules/**`]}, async (err, files) => {
  if (err) {
    console.error(err)
    process.exitCode = 1
    return
  }

  try {
    const results = await Promise.all(
      files.map(
        (f) =>
          new Promise((resolve, reject) => {
            fs.readFile(f, 'utf-8', (readErr, data) => {
              if (readErr) {
                // Stop here: `data` is undefined when the read failed.
                reject(readErr)
                return
              }
              try {
                // ValidateHtml returns `true` for valid files; spreading it adds
                // no keys, so only failures carry `status` and `message`.
                resolve({ file: f.replace(__dirname, '') , ...ValidateHtml(data) })
              } catch (validateErr) {
                reject(validateErr)
              }
            })
          })
      )
    )

    const failures = results.filter(({status}) => status)
    // With no failures there is nothing to tabulate; write an empty report.
    const report = failures.length > 0
      ? html_escape(jsonToMarkdownTable(failures))
      : ''
    fs.writeFileSync('badhtml.md', report)
  } catch (error) {
    // Without this, a failure here would surface as an unhandled rejection.
    console.error(error)
    process.exitCode = 1
  }
 })
	"use strict"

	const glob = require("glob")
	const fs = require("fs")

	/**
	 * Builds a membership table from a comma-separated list of names.
	 *
	 * @param {string} str - Names separated by "," (no surrounding whitespace is stripped).
	 * @returns {Object<string, true>} Object whose keys are the listed names, each mapped to `true`.
	 */
	function unwrap(str) {
	  // No prototype, so inherited names such as "constructor" or "__proto__"
	  // are never mistaken for listed entries when callers do `table[name]`.
	  const lookup = Object.create(null)
	  for (const name of str.split(",")) {
	    // Skip empty entries ("a,,b", trailing comma) instead of stopping at them.
	    if (name) {
	      lookup[name] = true
	    }
	  }
	  return lookup
	}

	/**
	 * Builds an error record, filling `{n}` placeholders in the message.
	 *
	 * A placeholder `{n}` is replaced by the n-th extra argument (0-based); a
	 * placeholder with no corresponding argument is replaced by an empty string.
	 * A placeholder preceded by a backslash is left untouched.
	 *
	 * @param {string} status - Short error code stored as `status`.
	 * @param {string} msg - Message template containing `{0}`, `{1}`, ... placeholders.
	 * @param {...*} args - Values substituted into the template.
	 * @returns {{status: string, message: string}} The error record.
	 */
	function ERROR(status, msg, ...args) {
	  // The lookbehind checks for an escaping backslash without consuming the
	  // preceding character, so adjacent placeholders ("{0}{1}") are all replaced.
	  const message = msg.replace(/(?<!\\)\{(\w+)\}/g, (match, index) => {
	    const value = args[index]
	    return value !== undefined ? value : ""
	  })
	  return {
	    status,
	    message
	  }
	}

	// Tags that are skipped entirely by the validator (HTML 4 and HTML 5 void tags).
	const voidTags = unwrap(
	  "area,base,basefont,br,col,command,embed,frame,hr,img,input,keygen,link,meta,param,source,track,wbr"
	),
	  // Tags whose content is skipped up to their own end tag instead of being parsed.
	  singlelevel = unwrap("script,style"),
	  // Regex source fragments used to assemble the start-tag pattern.
	  regxstr = {
	    tagname: "[\\-A-Za-z0-9_:]+",
	    attrname: "[\\w\\-]+",
	    attrvalue: /(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+)/.toString().slice(1, -1) //quoted and unquoted strings
	  },
	  // Patterns applied to the front of the remaining input. `opentag` and
	  // `othertag` capture: (1) the tag name, (2) leftover text before ">",
	  // (3) everything after the tag.
	  regx = {
	    // Start tag: name followed by any number of well-formed attributes.
	    opentag: new RegExp(
	      `^[^<]*?<(${
	        regxstr.tagname
	      })` +
	        `(?:\\s+${
	          regxstr.attrname
	        }(?:\\s*=\\s*${
	          regxstr.attrvalue
	        })?` +
	        `)*` +
	        `([^>]*?)>((?:.|\\n)*)`
	    ),
	    othertag: /^[^<]*?<([!\-\[\]\/A-Za-z0-9_:]+)([^>]*?)>((?:.|\n)*)/, //close tags, doctype, comments, cdata
	    comment: /^[^<]*?<!--(?:.|\n)*?-->/,
	    cdata: /^[^<]*?<!\[CDATA\[(?:.|\n)*?\]\]>/
	  }

	/**
	 * Checks that the tags of an HTML document are properly nested and closed.
	 *
	 * The input is consumed from the front, one tag at a time. Comments, CDATA
	 * sections and other "<!...>" declarations are skipped, tags listed in
	 * `voidTags` are ignored, and for tags listed in `singlelevel` everything up
	 * to their end tag is skipped. Every other start tag is pushed on a stack and
	 * must be closed by a matching end tag in reverse order.
	 *
	 * Possible `status` values of a reported problem: "CDATANotClosed",
	 * "CommentNotClosed", "MissingEndTag", "ExtraTag", "WrongTag".
	 *
	 * @param {string} html - HTML source text.
	 * @returns {true|{status: string, message: string}} `true` when no problem is
	 *   found, otherwise the first problem found, as built by ERROR().
	 */
	function ValidateHtml(html) {
	  // Number of "\n" characters in a piece of text.
	  const countNewlines = (text) => (text.match(/\n/g) || []).length
	  // Own-property lookup, so tag names like "constructor" never hit Object.prototype.
	  const isListed = (table, name) =>
	    Object.prototype.hasOwnProperty.call(table, name) && Boolean(table[name])

	  const source = html.replace(/[\r]/g, "")
	  let str = source.trim()
	  // `line` is always the 1-based line on which `str` begins. trim() removes
	  // leading blank lines, so they are counted here to keep reported line
	  // numbers aligned with the actual file.
	  let line = 1 + countNewlines(source.match(/^\s*/)[0])
	  const stack = []
	  let broken = ""

	  while (str) {
	    const matches = str.match(regx.opentag) || str.match(regx.othertag)
	    if (!matches) {
	      const pos = str.indexOf("<")
	      if (pos < 0) {
	        break
	      }
	      // A "<" that does not start a tag: step over it, still counting lines.
	      line += countNewlines(str.substr(0, pos))
	      str = str.substr(pos + 1)
	      continue
	    }

	    const rawTag = matches[1]
	    const isCloseTag = rawTag[0] === "/"
	    //html is case insensitive
	    const tag = (isCloseTag ? rawTag.substr(1) : rawTag).toLowerCase()

	    // Both tag patterns are anchored at the start of `str` and end with a
	    // "rest of input" capture, so the match minus that capture is exactly the
	    // text up to and including this tag's ">". Slicing `str` by that length
	    // (rather than continuing with the capture) keeps any input the capture
	    // could not span, e.g. text after a U+2028 line separator.
	    const consumedLength = matches[0].length - matches[3].length
	    const consumed = str.substr(0, consumedLength)
	    const lineNumber = line + countNewlines(str.substr(0, str.indexOf("<")))

	    if (tag[0] === "!") {
	      // Comment, CDATA, or another declaration such as a doctype.
	      let section = null
	      if (tag.indexOf("![cdata[") === 0) {
	        section = str.match(regx.cdata)
	        if (!section) {
	          broken = ERROR(
	            "CDATANotClosed",
	            "Line {0}: CDATA section not closed properly.",
	            lineNumber
	          )
	          break
	        }
	      } else if (tag.indexOf("!--") === 0) {
	        section = str.match(regx.comment)
	        if (!section) {
	          broken = ERROR(
	            "CommentNotClosed",
	            "Line {0}: HTML comment not closed properly.",
	            lineNumber
	          )
	          break
	        }
	      }
	      // Comments/CDATA are skipped through their terminator; any other
	      // declaration is skipped through its ">".
	      const skipped = section ? section[0] : consumed
	      line += countNewlines(skipped)
	      str = str.substr(skipped.length)
	      continue
	    }

	    // From here on the tag itself is consumed.
	    line += countNewlines(consumed)
	    str = str.substr(consumedLength)

	    if (isListed(voidTags, tag)) {
	      continue
	    }

	    if (isListed(singlelevel, tag)) {
	      //remove everything upto end tag. Case-insensitive so that an end tag
	      //written as </SCRIPT> is found AND consumed.
	      const specialEndTagRegex = new RegExp(`^[\\s\\S]*?</${tag}[^>]*>`, "i")
	      const skipped = str.match(specialEndTagRegex)
	      if (!skipped) {
	        broken = ERROR(
	          "MissingEndTag",
	          "Line {0}: {1} start tag missing corresponding end tag.",
	          lineNumber,
	          `<${tag}>`
	        )
	        break
	      }
	      line += countNewlines(skipped[0])
	      str = str.substr(skipped[0].length)
	      continue
	    }

	    if (!isCloseTag) {
	      stack.push({
	        tag,
	        line: lineNumber
	      })
	      continue
	    }

	    if (stack.length === 0) {
	      broken = ERROR(
	        "ExtraTag",
	        "Line {0}: Extra end tag found: {1}",
	        lineNumber,
	        `<${rawTag}>`
	      )
	      break
	    }

	    const last = stack[stack.length - 1]
	    if (last.tag !== tag) {
	      // If the tag is open further up the stack, the innermost open tag was
	      // left unclosed; otherwise this end tag has no start tag at all.
	      if (stack.some((o) => o.tag === tag)) {
	        broken = ERROR(
	          "WrongTag",
	          "Line {0}: {1} start tag from line {2} should be closed before {3}.",
	          lineNumber,
	          `<${last.tag}>`,
	          last.line,
	          `<${rawTag}>`
	        )
	      } else {
	        broken = ERROR(
	          "ExtraTag",
	          "Line {0}: Extra end tag found: {1}",
	          lineNumber,
	          `<${rawTag}>`
	        )
	      }
	      break
	    }
	    stack.pop()
	  }

	  if (!broken && stack.length > 0) {
	    const last = stack[stack.length - 1]
	    broken = ERROR(
	      "MissingEndTag",
	      "Line {0}: {1} start tag missing corresponding end tag.",
	      last.line,
	      `<${last.tag}>`
	    )
	  }
	  return broken ? broken : true
	}

	/**
	 * Renders an array of flat objects as a markdown table.
	 *
	 * @param {Object[]} array - Rows; each cell is `String(item[column])`.
	 * @param {string} [columns] - Comma-separated column names. When omitted, the
	 *   keys of the first row are used.
	 * @returns {string} Header line, separator line and one line per row, each
	 *   terminated by "\r\n"; an empty string when there are no columns at all
	 *   (e.g. empty `array` and no `columns`).
	 */
	function jsonToMarkdownTable (array, columns) {
	  const cols = columns
	    ? columns.split(",")
	    : Object.keys(array[0] || {}) // tolerate an empty array

	  if (cols.length === 0) {
	    return ""
	  }

	  const lines = [
	    cols.join(" | "),
	    cols.map(() => "---").join(" | "),
	    // Cells follow `cols`, so rows always line up with the header.
	    ...array.map((item) => cols.map((key) => String(item[key])).join(" | "))
	  ]

	  return lines.map((line) => `${line}\r\n`).join("")
	}

	/**
	 * Replaces the characters &, >, < and " with their HTML entities.
	 *
	 * @param {string} text - Text to escape.
	 * @returns {string} The escaped text.
	 */
	function html_escape(text){
	  // "&" must be handled first so the entities produced below are not re-escaped.
	  return text.replace(/&/g, '&amp;')
	    .replace(/>/g, '&gt;')
	    .replace(/</g, '&lt;')
	    .replace(/"/g, '&quot;')
	}

	// Entry point: validate every .html file below this script's directory
	// (node_modules excluded) and write the files that failed validation to
	// badhtml.md as an HTML-escaped markdown table.
	glob(`${__dirname}/**/*.html`, {ignore: [`**/node_modules/**`]}, async (err, files) => {
	  if (err) {
	    console.error(err)
	    process.exitCode = 1
	    return
	  }

	  try {
	    const results = await Promise.all(
	      files.map(
	        (f) =>
	          new Promise((resolve, reject) => {
	            fs.readFile(f, 'utf-8', (readErr, data) => {
	              if (readErr) {
	                // Stop here: `data` is undefined when the read failed.
	                reject(readErr)
	                return
	              }
	              try {
	                // ValidateHtml returns `true` for valid files; spreading it adds
	                // no keys, so only failures carry `status` and `message`.
	                resolve({ file: f.replace(__dirname, '') , ...ValidateHtml(data) })
	              } catch (validateErr) {
	                reject(validateErr)
	              }
	            })
	          })
	      )
	    )

	    const failures = results.filter(({status}) => status)
	    // With no failures there is nothing to tabulate; write an empty report.
	    const report = failures.length > 0
	      ? html_escape(jsonToMarkdownTable(failures))
	      : ''
	    fs.writeFileSync('badhtml.md', report)
	  } catch (error) {
	    // Without this, a failure here would surface as an unhandled rejection.
	    console.error(error)
	    process.exitCode = 1
	  }
	})