Created
February 8, 2013 13:03
-
-
Save amad/4738933 to your computer and use it in GitHub Desktop.
Purge is a standalone version of the well-known CodeIgniter xss_clean filter, copied from CI so that it works out of the box. Usage: Purge::xss_clean($data);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Purge - standalone port of CodeIgniter's XSS filter and input helpers.
 *
 * Every method is static, so the class is used without instantiation:
 *
 *     $clean = Purge::xss_clean($data);
 *
 * Calling the methods on an instance ($purge->xss_clean($data)) keeps working,
 * because PHP allows static methods to be invoked through an object.
 */
class Purge
{
    /**
     * Shared instance created lazily by init(); false until then.
     *
     * @var Purge|bool
     */
    public static $instance = false;

    /**
     * IP address of the current user.
     *
     * Declared for parity with CodeIgniter; no method of this class sets it.
     *
     * @var string|bool
     */
    public $ip_address = false;

    /**
     * User agent (web browser) of the current user, cached by user_agent().
     *
     * @var string|bool
     */
    public $user_agent = false;

    /**
     * List of all HTTP request headers.
     *
     * Declared for parity with CodeIgniter; no method of this class fills it.
     *
     * @var array
     */
    protected $headers = array();

    /**
     * Random hash used as a temporary stand-in for "&" while entities are
     * being validated. Generated once by xss_hash().
     *
     * @var string
     */
    public static $_xss_hash = '';

    /**
     * Strings that are never allowed, mapped to their replacement.
     *
     * The comment/CDATA openers are replaced by their entity-encoded form so
     * that they are displayed instead of interpreted.
     *
     * @var array
     */
    protected static $_never_allowed_str = array(
        'document.cookie' => '[removed]',
        'document.write'  => '[removed]',
        '.parentNode'     => '[removed]',
        '.innerHTML'      => '[removed]',
        'window.location' => '[removed]',
        '-moz-binding'    => '[removed]',
        '<!--'            => '&lt;!--',
        '-->'             => '--&gt;',
        '<![CDATA['       => '&lt;![CDATA[',
        '<comment>'       => '&lt;comment&gt;'
    );

    /**
     * Regex fragments that are never allowed; every match becomes "[removed]".
     *
     * @var array
     */
    protected static $_never_allowed_regex = array(
        'javascript\s*:',
        'expression\s*(\(|&\#40;)', // CSS and IE
        'vbscript\s*:', // IE, surprise!
        'Redirect\s+302',
        "([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?"
    );

    /**
     * Return the shared Purge instance, creating it on first use.
     *
     * @return Purge
     */
    public static function &init()
    {
        if (self::$instance === false) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Fetch one value from an array and run it through xss_clean().
     *
     * @param array  $array Source array (typically a superglobal)
     * @param string $index Key to look up
     * @return mixed The cleaned value, or FALSE when the key is not set
     */
    private static function _fetch_from_array(&$array, $index = '')
    {
        // A NULL index can never be a real key; bail out before using it as an offset.
        if ($index === null || ! isset($array[$index])) {
            return false;
        }

        return self::xss_clean($array[$index]);
    }

    /**
     * Shared implementation of get() and post().
     *
     * @param array       $array Source superglobal
     * @param string|null $index Key to fetch, or NULL for the whole array
     * @return mixed Cleaned array of all entries when $index is NULL and the
     *               source is not empty; otherwise the cleaned value or FALSE
     */
    private static function _fetch_all_or_one(&$array, $index)
    {
        if ($index === null && ! empty($array)) {
            $all = array();
            foreach (array_keys($array) as $key) {
                $all[$key] = self::_fetch_from_array($array, $key);
            }

            return $all;
        }

        return self::_fetch_from_array($array, $index);
    }

    /**
     * Fetch an item from the GET array.
     *
     * @param string|null $index Key to fetch; NULL returns every GET entry
     * @return mixed Cleaned value, cleaned array of all values, or FALSE
     */
    public static function get($index = null)
    {
        return self::_fetch_all_or_one($_GET, $index);
    }

    /**
     * Fetch an item from the POST array.
     *
     * @param string|null $index Key to fetch; NULL returns every POST entry
     * @return mixed Cleaned value, cleaned array of all values, or FALSE
     */
    public static function post($index = null)
    {
        return self::_fetch_all_or_one($_POST, $index);
    }

    /**
     * Fetch an item from POST when present there, otherwise from GET.
     *
     * @param string $index
     * @return mixed Cleaned value or FALSE
     */
    public static function get_post($index = '')
    {
        if (! isset($_POST[$index])) {
            return self::get($index);
        }

        return self::post($index);
    }

    /**
     * Fetch an item from the COOKIE array.
     *
     * @param string $index
     * @return mixed Cleaned value or FALSE
     */
    public static function cookie($index = '')
    {
        return self::_fetch_from_array($_COOKIE, $index);
    }

    /**
     * Fetch an item from the SERVER array.
     *
     * @param string $index
     * @return mixed Cleaned value or FALSE
     */
    public static function server($index = '')
    {
        return self::_fetch_from_array($_SERVER, $index);
    }

    /**
     * User agent of the current request.
     *
     * The value is cached on the shared instance and returned exactly as sent
     * by the client; it is NOT passed through xss_clean().
     *
     * @return string|bool The User-Agent header, or FALSE when it is absent
     */
    public static function user_agent()
    {
        // $user_agent is an instance property, so it lives on the singleton.
        $purge = self::init();

        if ($purge->user_agent === false && isset($_SERVER['HTTP_USER_AGENT'])) {
            $purge->user_agent = $_SERVER['HTTP_USER_AGENT'];
        }

        return $purge->user_agent;
    }

    /**
     * Sanitize globals.
     *
     * - validates every key of $_GET and $_POST (see _clean_input_keys())
     * - strips control characters from the values and standardizes their
     *   newlines to \n (see _clean_input_data())
     * - strips tags from $_SERVER['PHP_SELF']
     *
     * @return void
     * @throws InvalidArgumentException when a key contains disallowed characters
     */
    public static function _sanitize_globals()
    {
        if (is_array($_GET)) {
            foreach ($_GET as $key => $val) {
                $_GET[self::_clean_input_keys($key)] = self::_clean_input_data($val);
            }
        }

        if (is_array($_POST)) {
            foreach ($_POST as $key => $val) {
                $_POST[self::_clean_input_keys($key)] = self::_clean_input_data($val);
            }
        }

        // PHP_SELF is not guaranteed to exist under every SAPI.
        if (isset($_SERVER['PHP_SELF'])) {
            $_SERVER['PHP_SELF'] = strip_tags($_SERVER['PHP_SELF']);
        }
    }

    /**
     * Clean input data (ported from CodeIgniter's Input class).
     *
     * Recurses into arrays, removes invisible control characters and
     * converts \r\n and \r line endings to \n.
     *
     * @param string|array $str
     * @return string|array
     * @throws InvalidArgumentException when a nested key is not allowed
     */
    protected static function _clean_input_data($str)
    {
        if (is_array($str)) {
            $clean = array();
            foreach ($str as $key => $val) {
                $clean[self::_clean_input_keys($key)] = self::_clean_input_data($val);
            }

            return $clean;
        }

        $str = self::remove_invisible_characters($str);

        if (strpos($str, "\r") !== false) {
            $str = str_replace(array("\r\n", "\r"), "\n", $str);
        }

        return $str;
    }

    /**
     * Validate an input array key (ported from CodeIgniter's Input class).
     *
     * Only letters, digits, colon, underscore, slash and dash are accepted.
     *
     * @param string|int $str
     * @return string|int The key, unchanged
     * @throws InvalidArgumentException when the key has other characters
     */
    protected static function _clean_input_keys($str)
    {
        if (! preg_match('/^[a-z0-9:_\/-]+$/i', (string) $str)) {
            throw new InvalidArgumentException('Disallowed Key Characters.');
        }

        return $str;
    }

    /**
     * XSS Clean
     *
     * Sanitizes data so that Cross Site Scripting hacks can be prevented.
     * Nothing is ever 100% foolproof; treat this as one layer of defence.
     *
     * Note: this should only be used on data upon submission, not for
     * general runtime processing.
     *
     * Based in part on ideas from Bitflux
     * (http://channel.bitflux.ch/wiki/XSS_Prevention) and on the vector list
     * at http://ha.ckers.org/xss.html
     *
     * @param string|array $str      Data to clean; arrays are cleaned recursively
     * @param bool         $is_image TRUE to test image data instead of cleaning
     * @return mixed The cleaned string/array, or - when $is_image is TRUE -
     *               TRUE if nothing had to be altered and FALSE otherwise
     */
    public static function xss_clean($str, $is_image = false)
    {
        // Callbacks are static now, but keep populating the shared instance
        // for code that reads Purge::$instance after a call.
        self::init();

        // Arrays are cleaned element by element ($is_image is not forwarded).
        if (is_array($str)) {
            foreach (array_keys($str) as $key) {
                $str[$key] = self::xss_clean($str[$key]);
            }

            return $str;
        }

        $str = self::remove_invisible_characters($str);

        // Validate entities in URLs
        $str = self::_validate_entities($str);

        /*
         * URL decode, in case something like this is submitted:
         *
         * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
         *
         * rawurldecode() is used so plus signs are not removed.
         */
        $str = rawurldecode($str);

        /*
         * Convert character entities to ASCII so the tests below work
         * reliably. Only entities inside tags are converted, since those are
         * the ones that pose security problems.
         */
        $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", array(__CLASS__, '_convert_attribute'), $str);
        $str = preg_replace_callback("/<\w+.*?(?=>|<|$)/si", array(__CLASS__, '_decode_entity'), $str);

        // Decoding may have produced new invisible characters.
        $str = self::remove_invisible_characters($str);

        /*
         * Convert all tabs to spaces to prevent strings like "ja	vascript".
         * Spaces between characters are dealt with later. str_replace is used
         * because preg_replace was found to be very slow on large data.
         */
        if (strpos($str, "\t") !== false) {
            $str = str_replace("\t", ' ', $str);
        }

        // Capture the converted string for the image comparison at the end.
        $converted_string = $str;

        // Remove strings that are never allowed
        $str = self::_do_never_allowed($str);

        /*
         * Make PHP tags safe. XML declarations are inadvertently replaced
         * too, which does not seem to pose a problem.
         */
        if ($is_image === true) {
            // Images tend to contain the PHP short opening and closing tags
            // by chance, so only the long opening tag is encoded.
            $str = preg_replace('/<\?(php)/i', "&lt;?\\1", $str);
        } else {
            $str = str_replace(array('<?', '?'.'>'), array('&lt;?', '?&gt;'), $str);
        }

        /*
         * Compact exploded words such as "j a v a s c r i p t" back to their
         * normal form so the filters below can see them.
         */
        $words = array(
            'javascript', 'expression', 'vbscript', 'script', 'base64',
            'applet', 'alert', 'document', 'write', 'cookie', 'window'
        );

        foreach ($words as $word) {
            // "script" becomes s\s*c\s*r\s*i\s*p\s*t
            $spaced = implode('\s*', str_split($word));

            // Only when followed by a non-word character, so that valid text
            // like "dealer to" does not become "dealerto".
            $str = preg_replace_callback('#('.$spaced.')(\W)#is', array(__CLASS__, '_compact_exploded_words'), $str);
        }

        /*
         * Remove disallowed JavaScript in links and img tags, and remove
         * script/xss tags. Repeated until a pass changes nothing, so that
         * removing one vector cannot assemble another.
         */
        do {
            $original = $str;

            if (preg_match("/<a/i", $str)) {
                $str = preg_replace_callback("#<a\s+([^>]*?)(>|$)#si", array(__CLASS__, '_js_link_removal'), $str);
            }

            if (preg_match("/<img/i", $str)) {
                $str = preg_replace_callback("#<img\s+([^>]*?)(\s?/?>|$)#si", array(__CLASS__, '_js_img_removal'), $str);
            }

            if (preg_match("/script/i", $str) || preg_match("/xss/i", $str)) {
                $str = preg_replace("#<(/*)(script|xss)(.*?)\>#si", '[removed]', $str);
            }
        } while ($original !== $str);

        unset($original);

        // Remove evil attributes such as style, onclick and xmlns
        $str = self::_remove_evil_attributes($str, $is_image);

        /*
         * Sanitize naughty HTML elements: a tag whose name is in the list
         * below gets its brackets converted to entities.
         *
         * So this: <blink>
         * Becomes: &lt;blink&gt;
         */
        $naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss';
        $str = preg_replace_callback('#<(/*\s*)('.$naughty.')([^><]*)([><]*)#is', array(__CLASS__, '_sanitize_naughty_html'), $str);

        /*
         * Sanitize naughty scripting elements: calls to disallowed PHP and
         * JavaScript functions keep their text but have the parentheses
         * converted to entities, rendering the code un-executable.
         *
         * For example: eval('some code')
         * Becomes:     eval&#40;'some code'&#41;
         */
        $str = preg_replace('#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si', "\\1\\2&#40;\\3&#41;", $str);

        // Final clean up, in case something got through the above filters
        $str = self::_do_never_allowed($str);

        /*
         * Images are handled in a special way: the question is whether any
         * likely XSS code was found after character conversion. If the
         * filtered string differs from the converted one, something was
         * removed or changed and the image fails.
         */
        if ($is_image === true) {
            return $str === $converted_string;
        }

        return $str;
    }

    /**
     * Random hash for protecting URLs.
     *
     * The hash is generated on the first call and reused afterwards, so the
     * marker written by _validate_entities() can be found and removed again.
     *
     * @return string 32-character hexadecimal string
     */
    public static function xss_hash()
    {
        if (self::$_xss_hash === '') {
            self::$_xss_hash = md5((string) (time() + mt_rand(0, 1999999999)));
        }

        return self::$_xss_hash;
    }

    /**
     * Remove invisible characters.
     *
     * This prevents sandwiching null characters between ASCII characters,
     * like Java\0script. Newline (10), carriage return (13) and horizontal
     * tab (9) are kept.
     *
     * @param string $str
     * @param bool   $url_encoded Also strip the URL-encoded forms (%00 etc.)
     * @return string
     */
    public static function remove_invisible_characters($str, $url_encoded = true)
    {
        $non_displayables = array();

        if ($url_encoded) {
            $non_displayables[] = '/%0[0-8bcef]/'; // url encoded 00-08, 11, 12, 14, 15
            $non_displayables[] = '/%1[0-9a-f]/'; // url encoded 16-31
        }

        $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

        // Repeat until stable: a removal can join the halves of a new match.
        do {
            $str = preg_replace($non_displayables, '', $str, -1, $count);
        } while ($count);

        return $str;
    }

    /**
     * Validate URL entities. Called by xss_clean().
     *
     * @param string $str
     * @return string
     */
    protected static function _validate_entities($str)
    {
        $hash = self::xss_hash();

        // Protect GET variables in URLs by swapping their "&" for the hash,
        // e.g. 901119URL5918AMP18930PROTECT8198
        $str = preg_replace('|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-]+)|i', $hash."\\1=\\2", $str);

        // Validate standard character entities: add a semicolon if missing,
        // to enable the conversion of entities to ASCII later.
        $str = preg_replace('#(&\#?[0-9a-z]{2,})([\x00-\x20])*;?#i', "\\1;\\2", $str);

        // Validate UTF16 two byte encoding (x00): same, add a semicolon.
        $str = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $str);

        // Un-protect GET variables in URLs
        return str_replace($hash, '&', $str);
    }

    /**
     * Attribute conversion. Callback for xss_clean().
     *
     * Encodes angle brackets and doubles backslashes inside a quoted
     * attribute value.
     *
     * @param array $match
     * @return string
     */
    protected static function _convert_attribute($match)
    {
        return str_replace(array('>', '<', '\\'), array('&gt;', '&lt;', '\\\\'), $match[0]);
    }

    /**
     * HTML entity decode callback for xss_clean().
     *
     * @param array $match
     * @return string
     */
    protected static function _decode_entity($match)
    {
        return self::entity_decode($match[0], 'UTF-8');
    }

    /**
     * HTML entities decode.
     *
     * A replacement for html_entity_decode(), which does not convert
     * entities that lack the trailing semicolon although most browsers still
     * interpret them. Numeric entities without a semicolon are decoded here
     * as single bytes.
     *
     * @param string $str
     * @param string $charset
     * @return string
     */
    public static function entity_decode($str, $charset = 'UTF-8')
    {
        if (stristr($str, '&') === false) {
            return $str;
        }

        $str = html_entity_decode($str, ENT_COMPAT, $charset);

        // Callbacks replace the /e modifier, which no longer exists in PHP 7+.
        $str = preg_replace_callback('~&#x(0*[0-9a-f]{2,5})~i', array(__CLASS__, '_decode_hex_entity'), $str);

        return preg_replace_callback('~&#([0-9]{2,4})~', array(__CLASS__, '_decode_decimal_entity'), $str);
    }

    /**
     * Turn the hexadecimal digits of a numeric entity into one byte.
     *
     * @param array $match $match[1] holds the hex digits
     * @return string
     */
    protected static function _decode_hex_entity($match)
    {
        return chr((int) hexdec($match[1]));
    }

    /**
     * Turn the decimal digits of a numeric entity into one byte.
     *
     * The digits are always read as decimal, including zero-padded ones.
     *
     * @param array $match $match[1] holds the decimal digits
     * @return string
     */
    protected static function _decode_decimal_entity($match)
    {
        return chr((int) $match[1]);
    }

    /**
     * Do never allowed. A utility function for xss_clean().
     *
     * @param string $str
     * @return string
     */
    protected static function _do_never_allowed($str)
    {
        $str = str_replace(array_keys(self::$_never_allowed_str), self::$_never_allowed_str, $str);

        foreach (self::$_never_allowed_regex as $regex) {
            $str = preg_replace('#'.$regex.'#is', '[removed]', $str);
        }

        return $str;
    }

    /**
     * Compact exploded words.
     *
     * Callback for xss_clean() to remove whitespace from things like
     * j a v a s c r i p t
     *
     * @param array $matches [1] the spaced-out word, [2] the character after it
     * @return string
     */
    protected static function _compact_exploded_words($matches)
    {
        return preg_replace('/\s+/s', '', $matches[1]).$matches[2];
    }

    /**
     * Sanitize naughty HTML.
     *
     * Callback for xss_clean() that entity-encodes the brackets of a
     * disallowed element so it is displayed instead of rendered.
     *
     * @param array $matches
     * @return string
     */
    protected static function _sanitize_naughty_html($matches)
    {
        // encode opening brace
        $str = '&lt;'.$matches[1].$matches[2].$matches[3];

        // encode captured opening or closing brace to prevent recursive vectors
        $str .= str_replace(array('>', '<'), array('&gt;', '&lt;'), $matches[4]);

        return $str;
    }

    /**
     * JS link removal.
     *
     * Callback for xss_clean() to sanitize links. Working per tag limits
     * PCRE backtracking and avoids PREG_BACKTRACK_LIMIT_ERROR on link-heavy
     * strings.
     *
     * @param array $match
     * @return string
     */
    protected static function _js_link_removal($match)
    {
        $attributes = self::_filter_attributes(str_replace(array('<', '>'), '', $match[1]));

        return str_replace(
            $match[1],
            preg_replace(
                '#href=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si',
                '',
                $attributes
            ),
            $match[0]
        );
    }

    /**
     * JS image removal.
     *
     * Callback for xss_clean() to sanitize image tags. Working per tag
     * limits PCRE backtracking and avoids PREG_BACKTRACK_LIMIT_ERROR on
     * image-heavy strings.
     *
     * @param array $match
     * @return string
     */
    protected static function _js_img_removal($match)
    {
        $attributes = self::_filter_attributes(str_replace(array('<', '>'), '', $match[1]));

        return str_replace(
            $match[1],
            preg_replace(
                '#src=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si',
                '',
                $attributes
            ),
            $match[0]
        );
    }

    /**
     * Filter attributes.
     *
     * Keeps only quoted name="value" attributes and removes C-style
     * comments from them.
     *
     * @param string $str
     * @return string
     */
    protected static function _filter_attributes($str)
    {
        $out = '';

        if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $str, $matches)) {
            foreach ($matches[0] as $match) {
                $out .= preg_replace("#/\*.*?\*/#s", '', $match);
            }
        }

        return $out;
    }

    /**
     * Remove evil HTML attributes (like event handlers and style).
     *
     * It removes the evil attribute and either:
     *  - everything up until a space, e.g. everything between the pipes:
     *      <a |style=document.write('hello');alert('world');| class=link>
     *  - everything inside the quotes, e.g. everything between the pipes:
     *      <a |style="document.write('hello'); alert('world');"| class="link">
     *
     * @param string $str      The string to check
     * @param bool   $is_image TRUE if this is an image
     * @return string The string with the evil attributes removed
     */
    protected static function _remove_evil_attributes($str, $is_image)
    {
        // All javascript event handlers (e.g. onload, onclick, onmouseover), style, and xmlns
        $evil_attributes = array('on\w*', 'style', 'xmlns', 'formaction');

        if ($is_image === true) {
            // Adobe Photoshop puts XML metadata into JFIF images, including
            // namespacing, so xmlns has to be allowed for images.
            unset($evil_attributes[array_search('xmlns', $evil_attributes)]);
        }

        $evil = implode('|', $evil_attributes);

        do {
            $count = 0;
            $attribs = array();

            // find occurrences of illegal attribute strings without quotes
            preg_match_all('/('.$evil.')\s*=\s*([^\s>]*)/is', $str, $matches, PREG_SET_ORDER);
            foreach ($matches as $attr) {
                $attribs[] = preg_quote($attr[0], '/');
            }

            // find occurrences of illegal attribute strings with quotes (042 and 047 are octal quotes)
            preg_match_all("/(".$evil.")\s*=\s*(\042|\047)([^\\2]*?)(\\2)/is", $str, $matches, PREG_SET_ORDER);
            foreach ($matches as $attr) {
                $attribs[] = preg_quote($attr[0], '/');
            }

            // replace illegal attribute strings that are inside an html tag
            if (count($attribs) > 0) {
                $str = preg_replace("/<(\/?[^><]+?)([^A-Za-z<>\-])(.*?)(".implode('|', $attribs).")(.*?)([\s><])([><]*)/i", '<$1 $3$5$6$7', $str, -1, $count);
            }
        } while ($count);

        return $str;
    }

    /**
     * Is this an ajax request?
     *
     * Tests whether the request carries the header
     * X-Requested-With: XMLHttpRequest
     *
     * @return bool
     */
    public static function is_ajax_request()
    {
        return self::server('HTTP_X_REQUESTED_WITH') === 'XMLHttpRequest';
    }

    /**
     * Is this a cli request?
     *
     * Tests whether the request was made from the command line.
     *
     * @return bool
     */
    public static function is_cli_request()
    {
        return php_sapi_name() === 'cli' || defined('STDIN');
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Purge::xss_clean($data);