Created
December 21, 2010 16:43
-
-
Save winhamwr/750190 to your computer and use it in GitHub Desktop.
HTML normalization to allow for consistent comparison of HTML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Serialize a DOM node to HTML with its attributes in a canonical order, so
 * that two DOM trees can be compared as strings regardless of the order in
 * which their attributes were written.
 *
 * Adapted from google-code-prettify
 * http://code.google.com/p/google-code-prettify/source/browse/trunk/src/prettify.js#311
 * Apache license, Copyright (C) 2006 Google Inc.
 *
 * Elements (nodeType 1) are emitted with a lower-cased tag name, attributes
 * sorted by lower-cased name, and recursively normalized children. Text and
 * CDATA nodes (nodeType 3 and 4) are escaped with textToHtml. Any other node
 * type (e.g. comments) contributes an empty string.
 *
 * @param {Node} node - DOM node to serialize.
 * @returns {string} Normalized HTML for the node and its descendants.
 */
function normalizeHtml(node) {
  var html = '';
  switch (node.nodeType) {
    case 1: // an element
      var name = node.tagName.toLowerCase();
      html += '<' + name;
      var attrs = node.attributes;
      var n = attrs.length;
      if (n) {
        // Copy into a real array: the attribute collection itself has no sort().
        var sortedAttrs = [];
        for (var i = n; --i >= 0;) { sortedAttrs[i] = attrs[i]; }
        // Compare the lower-cased names because that is the form written to
        // the output; sorting the raw names could leave mixed-case attribute
        // names out of order in the result.
        sortedAttrs.sort(function (a, b) {
          var aName = a.name.toLowerCase();
          var bName = b.name.toLowerCase();
          return (aName < bName) ? -1 : aName === bName ? 0 : 1;
        });
        for (var j = 0; j < n; ++j) {
          var attr = sortedAttrs[j];
          // Skip attributes whose `specified` flag is falsy.
          if (!attr.specified) { continue; }
          html += ' ' + attr.name.toLowerCase() +
              '="' + attribToHtml(attr.value) + '"';
        }
      }
      html += '>';
      for (var child = node.firstChild; child; child = child.nextSibling) {
        html += normalizeHtml(child);
      }
      // Void elements never take a closing tag; one is still written if the
      // DOM node unexpectedly has children, so the output stays balanced.
      if (node.firstChild ||
          !/^(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/.test(name)) {
        html += '<\/' + name + '>';
      }
      break;
    case 3:
    case 4: // text or CDATA
      html += textToHtml(node.nodeValue);
      break;
  }
  return html;
}
/**
 * Escape a string for use inside a double-quoted HTML attribute value.
 *
 * Replaces `&`, `<`, `>` and `"` with their character entities. The ampersand
 * is replaced first so the entities produced by the later replacements are
 * not escaped a second time.
 *
 * @param {string} str - Raw attribute value.
 * @returns {string} The value with HTML special characters escaped.
 */
function attribToHtml(str) {
  return str.replace(pr_amp, '&amp;')
      .replace(pr_lt, '&lt;')
      .replace(pr_gt, '&gt;')
      .replace(pr_quot, '&quot;');
}
// Patterns for the HTML special characters escaped by attribToHtml and
// textToHtml. They carry the global flag so that String.prototype.replace
// substitutes every occurrence, not just the first one.
var pr_amp = /&/g;
var pr_lt = /</g;
var pr_gt = />/g;
var pr_quot = /"/g;
/**
 * Escape html special characters.
 *
 * Replaces `&`, `<` and `>` with their character entities so the string can
 * be emitted as HTML text content. Quotes are left untouched. The ampersand
 * is replaced first so the entities produced by the later replacements are
 * not escaped a second time.
 *
 * @param {string} str - Raw text.
 * @returns {string} The text with `&`, `<` and `>` escaped.
 */
function textToHtml(str) {
  return str.replace(pr_amp, '&amp;')
      .replace(pr_lt, '&lt;')
      .replace(pr_gt, '&gt;');
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment