Last active
December 20, 2017 15:25
-
-
Save nikitastore/09c6c4a792d5d3f6c758 to your computer and use it in GitHub Desktop.
PHP: clean_html AND xss_clean
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Sanitise an HTML fragment against a fixed tag/attribute whitelist.
 *
 * Elements that are not whitelisted are unwrapped: the element itself is
 * dropped and its (already cleaned) children take its place. Attributes that
 * are not whitelisted for their element are removed; the values of the
 * remaining attributes are passed through xss_clean().
 *
 * @param string|DOMNode $html Markup to clean (UTF-8), or - for backward
 *                             compatibility - a DOM node to clean in place.
 *                             The parameter is still declared by reference so
 *                             existing callers keep working, but the caller's
 *                             variable is no longer overwritten.
 * @return string|DOMNode|null For string input: the cleaned markup ('' for
 *                             empty input, NULL if the markup could not be
 *                             parsed). For node input: the node that was
 *                             unwrapped, or NULL if the node was kept.
 */
function clean_html(&$html) {
    $whitelist = array(
        "#text",
        "a" => array("target", "href", "name", "type", "rel", "download"),
        "img" => array("src", "alt", "width", "height"),
        "table" => array("width"),
        "td" => array("rowspan", "colspan"),
        "ul" => array("type"),
        "ol" => array("type", "reversed", "start"),
        "li" => array("type", "value"),
        "span", "dd", "dl", "dt", "caption", "thead", "tbody", "tfoot", "th", "tr", "p", "div", "br", "hr", "pre", "blockquote", "center", "h1", "h2", "h3", "h4", "h5", "h6", "abbr", "b", "strong", "big", "small", "cite", "code", "listing", "del", "s", "ins", "u", "em", "i", "kbd", "mark", "nobr", "q", "sub", "sup", "time");

    // Node input: clean the subtree in place.
    if (is_object($html)) {
        return _clean_html_node($html, $whitelist);
    }

    // Work on a copy so the caller's variable is left untouched.
    $markup = (string) $html;
    if ($markup === '') {
        // DOMDocument::loadHTML() rejects empty input (warning on PHP 7,
        // ValueError on PHP 8), and there is nothing to clean anyway.
        return '';
    }

    // Collect libxml parse errors internally instead of emitting PHP
    // warnings, and put the previous setting back when we are done.
    $previous = libxml_use_internal_errors(true);
    try {
        $implementation = new DOMImplementation();
        $doctype = $implementation->createDocumentType('html',
            '-//W3C//DTD XHTML 1.1//EN',
            'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd');
        $document = $implementation->createDocument('http://www.w3.org/1999/xhtml',
            'html',
            $doctype);
        $document->encoding = "UTF-8";
        $document->formatOutput = TRUE;
        $document->recover = TRUE;

        // Express every non-ASCII character as a numeric entity so the HTML
        // parser does not have to guess the input encoding. This replaces the
        // deprecated mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8').
        $encoded = mb_encode_numericentity($markup, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

        $result = NULL;
        if ($document->loadHTML($encoded)) {
            if ($document->documentElement !== NULL) {
                _clean_html_node($document->documentElement, $whitelist);
            }
            $result = "";
            foreach ($document->childNodes as $child) {
                $result .= $document->saveXML($child, LIBXML_NOEMPTYTAG) . "\n";
            }
            // Drop the doctype and any <html>/<body> wrapper tags that are
            // still present in the serialised output.
            $result = preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $result);
        }
    } catch (Exception $e) {
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        throw $e;
    }
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $result;
}

/**
 * Recursively apply the whitelist to one DOM node and its descendants.
 *
 * @param DOMNode $node      Node to clean in place.
 * @param array   $whitelist Allowed node names. Plain values are names that
 *                           may carry no attributes; string keys map a name
 *                           to the list of attributes it may keep.
 * @return DOMNode|null The removed node when it was unwrapped, NULL otherwise.
 */
function _clean_html_node($node, array $whitelist) {
    // Children first, last to first: unwrapping a child splices its own
    // children into this node, which only shifts the indexes after it.
    if ($node->hasChildNodes()) {
        for ($i = $node->childNodes->length - 1; $i >= 0; --$i) {
            _clean_html_node($node->childNodes->item($i), $whitelist);
        }
    }

    $name = $node->nodeName;
    $is_allowed = isset($whitelist[$name]) || in_array($name, $whitelist, true);

    // Not whitelisted: replace the node by a fragment holding its children.
    if (!$is_allowed && is_object($node->childNodes)) {
        $fragment = $node->ownerDocument->createDocumentFragment();
        while ($node->childNodes->length > 0) {
            $fragment->appendChild($node->childNodes->item(0));
        }
        return $node->parentNode->replaceChild($fragment, $node);
    }

    if ($node->hasAttributes()) {
        if (isset($whitelist[$name]) && is_array($whitelist[$name])) {
            // Walk backwards because removing an attribute shrinks the map.
            for ($k = $node->attributes->length - 1; $k >= 0; --$k) {
                $attribute = $node->attributes->item($k);
                if (!in_array($attribute->nodeName, $whitelist[$name], true)) {
                    $node->removeAttributeNode($attribute);
                } else {
                    $attribute->value = xss_clean((string) $attribute->value);
                }
            }
        } else {
            // This element may not carry any attribute at all.
            while ($node->hasAttributes()) {
                $node->removeAttributeNode($node->attributes->item(0));
            }
        }
    }

    return NULL;
}
/**
 * XSS filter for untrusted strings.
 *
 * Removes or neutralises constructs commonly used for cross-site scripting:
 * invisible characters, javascript:/vbscript:/data: URIs, event-handler and
 * style attributes, dangerous tags and script-like function calls.
 *
 * @param string|array $str      String to clean, or an array whose values are
 *                               cleaned recursively (keys are preserved).
 * @param bool         $is_image When TRUE the input is treated as image data:
 *                               only the long PHP opening tag is encoded,
 *                               xmlns attributes are tolerated, and the
 *                               function returns a boolean instead of a string.
 * @return string|array|bool The cleaned string or array; for $is_image === TRUE,
 *                           TRUE when filtering changed nothing and FALSE
 *                           otherwise.
 */
function xss_clean($str, $is_image = FALSE) {
    // Random placeholder used to protect "&name=value" pairs while entities
    // are being normalised; generated once per request.
    static $_xss_hash = "";
    $charset = 'UTF-8';
    if ($_xss_hash === "") {
        $_xss_hash = function_exists('random_bytes')
            ? bin2hex(random_bytes(16))
            : md5(uniqid((string) mt_rand(), TRUE));
    }

    /*
     * Is the string an array?
     */
    if (is_array($str)) {
        foreach ($str as $key => $value) {
            $str[$key] = xss_clean($value);
        }
        return $str;
    }

    /*
     * Remove Invisible Characters
     *
     * Every control character except newline (dec 10), carriage return
     * (dec 13) and horizontal tab (dec 09), in raw and URL-encoded form.
     * Repeats until nothing is left, so nested encodings cannot survive.
     */
    $remove_invisible = function ($text) {
        $non_displayables = array(
            '/%0[0-8bcef]/', // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-f]/', // url encoded 16-31
            '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S' // 00-08, 11, 12, 14-31, 127
        );
        do {
            $text = preg_replace($non_displayables, "", $text, -1, $count);
        } while ($count);
        return $text;
    };
    $str = $remove_invisible($str);

    /*
     * Protect GET variables in URLs
     */
    $str = preg_replace('|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-]+)|i', $_xss_hash . "\\1=\\2", $str);

    /*
     * Validate standard character entities
     *
     * Add a semicolon if missing. We do this to enable
     * the conversion of entities to ASCII later.
     */
    $str = preg_replace('#(&\#?[0-9a-z]{2,})([\x00-\x20])*;?#i', "\\1;\\2", $str);

    /*
     * Validate UTF16 two byte encoding (x00)
     *
     * Just as above, adds a semicolon if missing.
     */
    $str = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $str);

    /*
     * Un-Protect GET variables in URLs
     */
    $str = str_replace($_xss_hash, '&', $str);

    /*
     * URL Decode
     *
     * Just in case stuff like this is submitted:
     *
     * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
     *
     * Note: Use rawurldecode() so it does not remove plus signs
     */
    $str = rawurldecode($str);

    /*
     * Convert character entities to ASCII
     *
     * This permits our tests below to work reliably.
     * We only convert entities that are within tags since
     * these are the ones that will pose security problems.
     */
    // Inside quoted attribute values, encode angle brackets and double up
    // backslashes so they cannot terminate the tag early.
    $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", function ($match) {
        return str_replace(
            array('>', '<', '\\'),
            array('&gt;', '&lt;', '\\\\'),
            $match[0]
        );
    }, $str);

    // Decode named and numeric entities inside tags.
    $str = preg_replace_callback('/<\w+.*?(?=>|<|$)/si', function ($match) use ($charset) {
        if (stristr($match[0], '&') === FALSE) {
            return $match[0];
        }
        $out = html_entity_decode($match[0], ENT_COMPAT, $charset);
        // Callbacks replace the /e modifier, which PHP 7 removed.
        $out = preg_replace_callback('~&#x(0*[0-9a-f]{2,5})~i', function ($m) {
            return chr(hexdec($m[1]));
        }, $out);
        return preg_replace_callback('~&#([0-9]{2,4})~', function ($m) {
            return chr((int) $m[1]);
        }, $out);
    }, $str);

    /*
     * Remove Invisible Characters Again!
     */
    $str = $remove_invisible($str);

    /*
     * Convert all tabs to spaces
     *
     * This prevents strings like this: ja	vascript
     * NOTE: we deal with spaces between characters later.
     * NOTE: preg_replace was found to be amazingly slow here on
     * large blocks of data, so we use str_replace.
     */
    if (strpos($str, "\t") !== FALSE) {
        $str = str_replace("\t", ' ', $str);
    }

    /*
     * Capture converted string for later comparison
     */
    $converted_string = $str;

    // Remove Strings that are never allowed
    $_never_allowed_str = array(
        'document.cookie' => '[removed]',
        'document.write' => '[removed]',
        '.parentNode' => '[removed]',
        '.innerHTML' => '[removed]',
        'window.location' => '[removed]',
        '-moz-binding' => '[removed]',
        '<!--' => '&lt;!--',
        '-->' => '--&gt;',
        '<![CDATA[' => '&lt;![CDATA[',
        '<comment>' => '&lt;comment&gt;'
    );
    $_never_allowed_regex = array(
        'javascript\s*:',
        'expression\s*(\(|&\#40;)', // CSS and IE
        'vbscript\s*:', // IE, surprise!
        'Redirect\s+302',
        "([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?"
    );
    $str = str_replace(array_keys($_never_allowed_str), $_never_allowed_str, $str);
    foreach ($_never_allowed_regex as $regex) {
        $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
    }

    /*
     * Makes PHP tags safe
     *
     * Note: XML tags are inadvertently replaced too:
     *
     * <?xml
     *
     * But it doesn't seem to pose a problem.
     */
    if ($is_image === TRUE) {
        // Images have a tendency to have the PHP short opening and
        // closing tags every so often so we skip those and only
        // do the long opening tags.
        $str = preg_replace('/<\?(php)/i', "&lt;?\\1", $str);
    } else {
        $str = str_replace(
            array('<?', '?' . '>'),
            array('&lt;?', '?&gt;'),
            $str
        );
    }

    /*
     * Compact any exploded words
     *
     * This corrects words like: j a v a s c r i p t
     * These words are compacted back to their correct state.
     */
    $words = array(
        'javascript',
        'expression',
        'vbscript',
        'script',
        'base64',
        'applet',
        'alert',
        'document',
        'write',
        'cookie',
        'window'
    );
    foreach ($words as $word) {
        $temp = "";
        for ($i = 0, $wordlen = strlen($word); $i < $wordlen; $i++) {
            $temp .= substr($word, $i, 1) . "\s*";
        }
        // We only want to do this when it is followed by a non-word character
        // That way valid stuff like "dealer to" does not become "dealerto"
        $str = preg_replace_callback('#(' . substr($temp, 0, -3) . ')(\W)#is', function ($matches) {
            return preg_replace('/\s+/s', "", $matches[1]) . $matches[2];
        }, $str);
    }

    /*
     * Remove disallowed Javascript in links or img tags
     */
    // Keeps only well-formed name="value" attributes and strips /* */
    // comments from them. Kept separate from the tag callbacks so its loop
    // variable cannot overwrite their $match parameter.
    $filter_attributes = function ($attributes) {
        $out = "";
        if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $attributes, $found)) {
            foreach ($found[0] as $attribute) {
                $out .= preg_replace("#/\*.*?\*/#s", "", $attribute);
            }
        }
        return $out;
    };
    do {
        $original = $str;
        if (preg_match("/<a/i", $str)) {
            $str = preg_replace_callback("#<a\s+([^>]*?)(>|$)#si", function ($match) use ($filter_attributes) {
                $subject = $filter_attributes(str_replace(array('<', '>'), "", $match[1]));
                $replace = preg_replace('#href=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si', "", $subject);
                return str_replace($match[1], $replace, $match[0]);
            }, $str);
        }
        if (preg_match("/<img/i", $str)) {
            $str = preg_replace_callback("#<img\s+([^>]*?)(\s?/?>|$)#si", function ($match) use ($filter_attributes) {
                $subject = $filter_attributes(str_replace(array('<', '>'), "", $match[1]));
                $replace = preg_replace('#src=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si', "", $subject);
                return str_replace($match[1], $replace, $match[0]);
            }, $str);
        }
        if (preg_match("/script/i", $str) OR preg_match("/xss/i", $str)) {
            $str = preg_replace("#<(/*)(script|xss)(.*?)\>#si", '[removed]', $str);
        }
    } while ($original !== $str);
    unset($original);

    // Remove evil attributes such as style, onclick and xmlns
    // All javascript event handlers (e.g. onload, onclick, onmouseover), style, and xmlns
    $evil_attributes = array(
        'on\w*',
        'style',
        'xmlns',
        'formaction'
    );
    if ($is_image === TRUE) {
        /*
         * Adobe Photoshop puts XML metadata into JFIF images,
         * including namespacing, so we have to allow this for images.
         */
        unset($evil_attributes[array_search('xmlns', $evil_attributes)]);
    }
    do {
        $count = 0;
        $attribs = array();
        // find occurrences of illegal attribute strings with quotes (042 and 047 are octal quotes)
        preg_match_all('/(' . implode('|', $evil_attributes) . ')\s*=\s*(\042|\047)([^\\2]*?)(\\2)/is', $str, $matches, PREG_SET_ORDER);
        foreach ($matches as $attr) {
            $attribs[] = preg_quote($attr[0], '/');
        }
        // find occurrences of illegal attribute strings without quotes
        preg_match_all('/(' . implode('|', $evil_attributes) . ')\s*=\s*([^\s>]*)/is', $str, $matches, PREG_SET_ORDER);
        foreach ($matches as $attr) {
            $attribs[] = preg_quote($attr[0], '/');
        }
        // replace illegal attribute strings that are inside an html tag
        if (count($attribs) > 0) {
            $str = preg_replace('/(<?)(\/?[^><]+?)([^A-Za-z<>\-])(.*?)(' . implode('|', $attribs) . ')(.*?)([\s><]?)([><]*)/i', '$1$2 $4$6$7$8', $str, -1, $count);
        }
    } while ($count);

    /*
     * Sanitize naughty HTML elements
     *
     * If a tag containing any of the words in the list
     * below is found, the tag gets converted to entities.
     *
     * So this: <blink>
     * Becomes: &lt;blink&gt;
     */
    $naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss';
    $str = preg_replace_callback('#<(/*\s*)(' . $naughty . ')([^><]*)([><]*)#is', function ($matches) {
        // encode opening brace
        $out = '&lt;' . $matches[1] . $matches[2] . $matches[3];
        // encode captured opening or closing brace to prevent recursive vectors
        $out .= str_replace(array('>', '<'), array('&gt;', '&lt;'), $matches[4]);
        return $out;
    }, $str);

    /*
     * Sanitize naughty scripting elements
     *
     * Similar to above, only instead of looking for
     * tags it looks for PHP and JavaScript commands
     * that are disallowed. Rather than removing the
     * code, it simply converts the parenthesis to entities
     * rendering the code un-executable.
     *
     * For example: eval('some code')
     * Becomes: eval&#40;'some code'&#41;
     */
    $str = preg_replace('#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si', "\\1\\2&#40;\\3&#41;", $str);

    // Final clean up
    // This adds a bit of extra precaution in case
    // something got through the above filters
    $str = str_replace(array_keys($_never_allowed_str), $_never_allowed_str, $str);
    foreach ($_never_allowed_regex as $regex) {
        $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
    }

    /*
     * Images are Handled in a Special Way
     * - Essentially, we want to know that after all of the character
     * conversion is done whether any unwanted, likely XSS, code was found.
     * If not, we return TRUE, as the image is clean.
     * However, if the string post-conversion does not match the
     * string post-removal of XSS, then it fails, as there was unwanted XSS
     * code found and removed/changed during processing.
     */
    if ($is_image === TRUE) {
        return $str === $converted_string;
    }
    return $str;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment