Last active
April 18, 2022 07:38
-
-
Save Munawwar/9353965 to your computer and use it in GitHub Desktop.
Unbalanced HTML markup detection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Detect unsafe (and potentially unsafe) unbalanced tags in a given HTML snippet.
 * Hints taken from an HTML parser (https://gist.github.com/cburgmer/2877758).
 *
 * Example:
 * An unclosed div tag is considered unsafe, because if the snippet is pasted in between two div tags
 * then it could end up breaking the HTML document.
 * Tags whose end tag may be omitted (tags that you can intentionally leave open, like
 * <table><tr><td>some text</table>) are also considered unsafe, for the same reason.
 * However an unclosed void tag (like meta tag) is safe, because browsers will ignore it without any side effects.
 *
 * Usage: ValidateHtml('<html string>');
 *
 * Returns `true` when no problem is found, otherwise an object
 * `{status: string, message: string}` where status is one of
 * 'MissingEndTag', 'ExtraTag', 'WrongTag', 'CommentNotClosed', 'CDATANotClosed'
 * and message starts with the 1-based line number of the offending tag.
 */
(function (root, factory) {
    if (typeof define === "function" && define.amd) {
        define(factory);
    } else if (typeof exports === 'object') { //For NodeJS
        module.exports = factory();
    } else { //For browsers
        root.ValidateHtml = factory();
    }
// `this` is undefined in strict-mode / ES-module scope, so prefer the standard global object when available.
}(typeof globalThis !== 'undefined' ? globalThis : (typeof self !== 'undefined' ? self : this), function () {
    /**
     * Turn a comma separated list into a lookup table.
     * The table has no prototype, so tag names such as "constructor" or
     * "__proto__" are never mistaken for members of the list.
     */
    function unwrap(str) {
        var names = str.split(','), set = Object.create(null), i;
        for (i = 0; i < names.length; i += 1) {
            if (names[i]) {
                set[names[i]] = true;
            }
        }
        return set;
    }

    /**
     * Build the error object returned to the caller.
     * `{n}` in msg is replaced by the n-th extra argument (missing ones become '');
     * a placeholder preceded by a backslash is left untouched.
     */
    function ERROR(status, msg) {
        var args = Array.prototype.slice.call(arguments, 2);
        return {
            status: status,
            message: msg.replace(/(\\?)\{(\w+)\}/g, function (m, escaped, index) {
                var value;
                if (escaped) {
                    return m;
                }
                value = args[index];
                return value !== undefined ? value : '';
            })
        };
    }

    /** Count the "\n" characters in text. */
    function countNewlines(text) {
        var count = 0, at = text.indexOf('\n');
        while (at !== -1) {
            count += 1;
            at = text.indexOf('\n', at + 1);
        }
        return count;
    }

    //HTML 4 and 5 void tags
    var voidTags = unwrap('area,base,basefont,br,col,command,embed,frame,hr,img,input,keygen,link,meta,param,source,track,wbr'),
        //Tags whose content is skipped up to their end tag (content is not parsed as markup).
        rawTextEndTag = Object.create(null),
        regxstr = {
            tagname: "[\\-A-Za-z0-9_:]+",
            attrname: "[\\w\\-]+",
            attrvalue: (/(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+)/).source //quoted and unquoted strings
        },
        //All patterns are anchored at the start of the remaining input and match
        //"text before the tag" + the tag itself; [\s\S] is used instead of (?:.|\n)
        //so that U+2028/U+2029 do not cut the match short.
        regx = {
            opentag: new RegExp('^[^<]*?<(' + regxstr.tagname + ')' +
                '(?:\\s+' + regxstr.attrname +
                    '(?:\\s*=\\s*' + regxstr.attrvalue + ')?' +
                ')*' +
                '[^>]*?>'),
            othertag: /^[^<]*?<([!\-\[\]\/A-Za-z0-9_:]+)[^>]*?>/, //close tags, doctype, comments, cdata
            comment: /^[^<]*?<!--[\s\S]*?-->/,
            cdata: /^[^<]*?<!\[CDATA\[[\s\S]*?\]\]>/
        };
    //Case-insensitive, because tag names are (</SCRIPT> closes <script>).
    rawTextEndTag.script = /^[\s\S]*?<\/script[^>]*>/i;
    rawTextEndTag.style = /^[\s\S]*?<\/style[^>]*>/i;

    return function (html) {
        //Normalise CRLF and lone CR to LF (deleting CR could glue two tokens together).
        //The input is not trimmed, so reported line numbers refer to the text as given.
        var str = (html === null || html === undefined ? '' : String(html)).replace(/\r\n?/g, '\n'),
            stack = [],   //open tags not yet closed: {tag, line}
            broken = null,
            line = 1,     //line number at the first character of `str`
            matches, consumed, rawTag, tag, isCloseTag, tagLine, closing, pos, last, found, i;

        while (str) {
            matches = regx.opentag.exec(str) || regx.othertag.exec(str);
            if (!matches) {
                pos = str.indexOf('<');
                if (pos < 0) {
                    break;
                }
                line += countNewlines(str.slice(0, pos));
                //No tag could be matched at this '<', i.e. there is no '>' that could end it.
                //For a comment / CDATA opener that means it is never closed.
                if (str.substr(pos, 4) === '<!--') {
                    broken = ERROR('CommentNotClosed', 'Line {0}: HTML comment not closed properly.', line);
                    break;
                }
                if (str.substr(pos, 9).toLowerCase() === '<![cdata[') {
                    broken = ERROR('CDATANotClosed', 'Line {0}: CDATA section not closed properly.', line);
                    break;
                }
                //A stray '<' that does not start a tag: skip it.
                str = str.slice(pos + 1);
                continue;
            }

            consumed = matches[0];
            rawTag = matches[1];
            tag = rawTag.toLowerCase(); //html is case insensitive
            tagLine = line + countNewlines(consumed.slice(0, consumed.indexOf('<')));

            //Identify close tag
            isCloseTag = tag.charAt(0) === '/';
            if (isCloseTag) {
                tag = tag.slice(1);
            }

            if (tag.charAt(0) === '!') {
                //Doctype, comment or CDATA. Comments/CDATA may contain '>', so
                //re-match them up to their real terminator.
                closing = null;
                if (tag.indexOf('![cdata[') === 0) {
                    closing = regx.cdata.exec(str);
                    if (!closing) {
                        broken = ERROR('CDATANotClosed', 'Line {0}: CDATA section not closed properly.', tagLine);
                        break;
                    }
                } else if (tag.indexOf('!--') === 0) {
                    closing = regx.comment.exec(str);
                    if (!closing) {
                        broken = ERROR('CommentNotClosed', 'Line {0}: HTML comment not closed properly.', tagLine);
                        break;
                    }
                }
                if (closing) {
                    consumed = closing[0];
                }
                line += countNewlines(consumed);
                str = str.slice(consumed.length);
                continue;
            }

            //Advance past the tag; `line` now points at the text following it.
            line += countNewlines(consumed);
            str = str.slice(consumed.length);

            if (voidTags[tag]) {
                //Void tags (start or end form) never affect balance.
                continue;
            }

            if (!isCloseTag && rawTextEndTag[tag]) {
                //remove everything up to and including the end tag
                closing = rawTextEndTag[tag].exec(str);
                if (!closing) {
                    broken = ERROR('MissingEndTag', 'Line {0}: {1} start tag missing corresponding end tag.', tagLine, '<' + tag + '>');
                    break;
                }
                line += countNewlines(closing[0]);
                str = str.slice(closing[0].length);
                continue;
            }

            if (!isCloseTag) {
                stack.push({
                    tag: tag,
                    line: tagLine
                });
                continue;
            }

            //Close tag: it must match the most recently opened tag.
            if (stack.length === 0) {
                broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', tagLine, '<' + rawTag + '>');
                break;
            }
            last = stack[stack.length - 1];
            if (last.tag === tag) {
                stack.pop();
                continue;
            }
            found = false;
            for (i = 0; i < stack.length; i += 1) {
                if (stack[i].tag === tag) {
                    found = true;
                    break;
                }
            }
            if (found) {
                broken = ERROR('WrongTag', 'Line {0}: {1} start tag from line {2} should be closed before {3}.', tagLine,
                    '<' + last.tag + '>', last.line, '<' + rawTag + '>');
            } else {
                broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', tagLine, '<' + rawTag + '>');
            }
            break;
        }

        if (!broken && stack.length > 0) {
            last = stack[stack.length - 1];
            broken = ERROR('MissingEndTag', 'Line {0}: {1} start tag missing corresponding end tag.', last.line, '<' + last.tag + '>');
        }
        return broken ? broken : true;
    };
}));
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html> | |
<head> | |
<!-- | |
Test this a multi line comment. >> | |
--> | |
<meta title="unclosed meta is ok. unclosed div isn't ok."></meta> | |
<script> | |
var script = '<script>'; | |
</script> | |
<![CDATA[This is a cdata | |
section]]> | |
</head> | |
<body> | |
<input type=text data-extra="text"></input> | |
< | |
div> | |
<div> | |
<div></div > | |
</div> | |
<x:blah></x:blah> | |
<x-custom></x-custom> | |
</body> | |
</html> | |
</html> |
<!--
Sample markup with unusual constructs: a multi line comment containing
greater-than signs, a script whose text contains a script start tag, a CDATA
section, void elements written with explicit end tags, a less-than sign
separated from "div" by a line break, namespaced and hyphenated tag names,
and a second html end tag on the last line.
NOTE(review): presumably this is the test.html fetched by the test runner
page, and the duplicate html end tag looks deliberate; confirm before
changing it. This note sits at the end so no line number above moves.
-->
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html>
<head>
<script src="html-validator.js"></script>
</head>
<body>
<!-- Fetches test.html and logs the ValidateHtml result for it to the console. -->
<script>
    // The request is asynchronous: synchronous XMLHttpRequest on the main
    // thread is deprecated and freezes the page while the file loads.
    var xhr = new XMLHttpRequest();
    xhr.open('GET', 'test.html', true);
    xhr.onload = function () {
        // status 0 is what file:// loads report on success.
        if (xhr.status === 0 || (xhr.status >= 200 && xhr.status < 300)) {
            console.log(ValidateHtml(xhr.responseText));
        } else {
            console.error('Could not load test.html: HTTP ' + xhr.status);
        }
    };
    xhr.onerror = function () {
        console.error('Could not load test.html: network error');
    };
    xhr.send();
</script>
</body>
</html>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE HTML>
<html>
<head>
<script src="html-validator.js"></script>
</head>
<body>
<!-- Paste markup into #input, press the button, and the validation result is shown as JSON in #output. -->
<textarea id="input" rows="25" style="width:100%"></textarea>
<textarea id="output" readonly rows="5" style="width:100%"></textarea>
<button id="btn">Validate HTML</button>
<script type="text/javascript">
    (function () {
        var input = document.querySelector('#input'),
            output = document.querySelector('#output');
        document.querySelector('#btn').onclick = function () {
            // ValidateHtml returns true or a {status, message} object; both serialise cleanly.
            output.value = JSON.stringify(window.ValidateHtml(input.value), null, 2);
        };
    }());
</script>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment