Last active
October 16, 2015 07:43
-
-
Save adon-at-work/5cd2c63e2430100706c7 to your computer and use it in GitHub Desktop.
rawText and styleTag Canonicalizations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Canonicalizes the content of a RAWTEXT element by appending '<!--<%%>-->'.
 *
 * The following works for all RAWTEXT tags, except for a minor usability issue
 * on the obsolete <xmp> tag. Taking <style> as the example:
 *
 * Canonicalization goal: the </style> that immediately follows the style
 * content must not end up inside an unclosed HTML comment; otherwise older
 * browsers (e.g., IE7) won't be able to close the style tag the way HTML5
 * parsing does.
 *
 * It does not matter whether the original style block is already enclosed in
 * a comment (<!-- -->):
 *  - If it is (or if there is no comment at all), the appended block simply
 *    opens and closes itself.
 *  - If a comment was opened but never closed, the trailing --> of the
 *    appended block ends it.
 * The <%%> part serves a similar purpose and is there for IE in particular.
 *
 * @param {string} content - raw text extracted from inside the RAWTEXT element
 * @return {string} the content followed by '<!--<%%>-->'
 */
function rawTextCanonicalization(content) {
    return content + '<!--<%%>-->';
}
var cssSensitivePattern = /(?:expression)/gi,
    // cssEscapeChar is simplified from http://www.w3.org/TR/css3-syntax/#escape-diagram
    // since we just need a quick check to ban the use of \ in statement areas (i.e., non-string)
    cssEscapeChar = /\\./g,
    // cssCommentPattern follows http://www.w3.org/TR/css3-syntax/#comment-diagram
    // regular expression extracted from http://www.w3.org/TR/2003/WD-css3-syntax-20030813/#SUBTOK-string
    cssCommentPattern = /\/\*[^\*]*\*(?:[^\/][^\*]*\*+)*\//g,
    // cssStrPattern is generally derived from the spec: http://www.w3.org/TR/css3-syntax/#string-token-diagram
    // difference 1: a newline inside a string is matched for replacement, as a kind of parse error handling (IE takes newlines in strings)
    // difference 2: based on our use case, it's okay for a string not to be properly closed at EOF
    // difference 3: prefixed with (?:^|[^\\])(?:\\.)* to ensure that the upcoming quote is NOT backslashed (hence, \"notStr"hereIsStr")
    cssStrPattern = /(?:^|[^\\])(?:\\.)*(?:(")(?:[^"\\]|\\(?:.|\n|\r\n?|\f))*?(?:"|$)|(')(?:[^'\\]|\\(?:.|\n|\r\n?|\f))*?(?:'|$))/g,
    // cssNonEscapedLinebreak matches linebreaks that are not escaped
    // it shares the same reasoning as difference 3 above for cssStrPattern (an escaped linebreak is not what we are interested in)
    cssNonEscapedLinebreak = /((?:^|[^\\])(?:\\.)*)(?:\n|\r\n?|\f)/g;

/**
 * styleTagCanonicalization differentiates CSS string contexts from non-string
 * (referred to as "statement") contexts.
 *
 * Purposes: (1) stop script execution (XSS); (2) canonicalize strings that
 * contain linebreaks.
 *
 * (1) To stop XSS, CSS expression() is suppressed in statement contexts unless
 *     preserveExpression is set. To achieve this, all CSS comments are removed
 *     from the statement first, and then "expression" is replaced, case
 *     insensitively, with "unsafeExpression". This version is limited in that
 *     it prohibits the use of escaped chars in statement contexts for easier
 *     matching (which rarely happens): when one is found, the whole content
 *     is replaced with a CSS comment saying so.
 * (2) At least IE accepts linebreaks inside strings, so string literals are
 *     canonicalized by removing their non-escaped linebreaks (see the comments
 *     on cssStrPattern), aligning the parsing experience across browsers.
 *
 * This is intentionally not a full tokenizer; it only separates quoted strings
 * from everything else.
 *
 * Note: this function does NOT call rawTextCanonicalization(); callers are
 * expected to wrap the result themselves.
 *
 * @param {string} styleContent - content extracted from the style tag
 * @param {boolean} preserveExpression - when truthy, statements are left
 *        untouched (no escape check, no comment removal, no expression
 *        suppression); strings still get their linebreaks removed
 * @return {string} the processed style content, or the literal
 *         '/&#42; Use of escaped chars are prohibited &#42;/' CSS comment when an
 *         escaped char is found in a statement context
 */
function styleTagCanonicalization(styleContent, preserveExpression) {
    var PROHIBITED_MESSAGE = '/* Use of escaped chars are prohibited */',
        res, stmt, strStartIndex, strEndIndex = 0, output = '';

    // Returns the statement with comments removed and expression voided,
    // or null when the statement contains an escaped char.
    function suppressExpression(stmt) {
        // cease processing if an escape char is found inside the statement.
        // search() is used instead of test() because cssEscapeChar is global:
        // test() would leave a stale lastIndex behind and make a later call
        // miss an escape.
        if (stmt.search(cssEscapeChar) !== -1) {
            return null;
        }
        // void the sensitive pattern AFTER comment removal; the order matters
        // so that e.g. expr/* x */ession is caught
        return stmt.replace(cssCommentPattern, '')
            .replace(cssSensitivePattern, 'unsafeExpression');
    }

    // cssStrPattern is global and therefore stateful: always start from the
    // beginning, even if a previous call bailed out in the middle of a scan
    cssStrPattern.lastIndex = 0;

    // void CSS expression with minimal context-sensitive replacement,
    // i.e., distinguish string from non-string, and treat only non-string context
    while ((res = cssStrPattern.exec(styleContent)) !== null) {
        // str starts from the matching position + location of the first quote
        strStartIndex = res.index + res[0].indexOf(res[1] || res[2]);

        // anything non-string is considered a statement
        stmt = styleContent.slice(strEndIndex, strStartIndex);
        if (!preserveExpression) {
            stmt = suppressExpression(stmt);
            if (stmt === null) {
                // leave the shared regex clean for the next caller
                cssStrPattern.lastIndex = 0;
                return PROHIBITED_MESSAGE;
            }
        }

        strEndIndex = cssStrPattern.lastIndex;

        // append the processed statement, followed by
        // the string literal with non-escaped linebreaks removed
        output += stmt + styleContent.slice(strStartIndex, strEndIndex).replace(cssNonEscapedLinebreak, '$1');
    }

    // whatever follows the last string (or everything, when no string was found)
    stmt = styleContent.slice(strEndIndex);
    if (!preserveExpression) {
        stmt = suppressExpression(stmt);
        if (stmt === null) {
            return PROHIBITED_MESSAGE;
        }
    }

    return output + stmt;
}
// Demo run: assume str is extracted from inside <style></style>.
// The first sample is immediately superseded by the second one; it is kept as
// an additional reference input.
var str = "<!-- body div:after{font-size:expression(alert(0));x:expres/*fwe a**f we/f f*/sion(alert(1));content:'<!-- asdfdf--> asdf expression(alert(2)) '} --> <!-- div{font:'asdf}'} --> div{background:url()} div:before{content:' expression(alert(3))";
str = "div{font-family:'\nArialExpression',expr/* well */ession(alert(35))}";
// the style content is processed first, then wrapped by the RAWTEXT canonicalization
rawTextCanonicalization(styleTagCanonicalization(str));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment