nikitastore · December 20, 2017 15:25
diff --git a/clean_html_xss.php b/clean_html_xss.php
 <?php

 /**
  * Sanitise HTML against a fixed whitelist of tags and attributes.
  *
  * The call mode is chosen by the argument type:
  *  - anything that is not an object is treated as markup: it is parsed with
  *    DOMDocument::loadHTML(), the tree is cleaned, and the serialised result
  *    is returned with the DOCTYPE and the html/body tags stripped;
  *  - an object is treated as a DOM node and cleaned in place. Children are
  *    cleaned first; a node whose name is not whitelisted is replaced by its
  *    (already cleaned) children; a whitelisted node loses every attribute
  *    that is not listed for its tag, and the values of the attributes that
  *    stay are passed through xss_clean().
  *
  * @param mixed $html Markup string, or a DOM node to clean in place. The
  *                    parameter is a reference, but the caller's variable is
  *                    left untouched.
  * @return mixed For markup: the cleaned string ('' for empty input, NULL when
  *               loadHTML() reports failure). For a node: the result of
  *               replaceChild() when the node was unwrapped, NULL otherwise.
  */
 function clean_html(&$html) {
    // Collect libxml errors internally; if that was already enabled, flush the buffer.
    libxml_use_internal_errors(true) AND libxml_clear_errors();

    // Plain string entries are tags allowed without attributes;
    // "tag" => array(...) entries list the attributes allowed on that tag.
    static $whitelist = array(
        "#text",
        "a" => array("target", "href", "name", "type", "rel", "download"),
        "img" => array("src", "alt", "width", "height"),
        "table" => array("width"),
        "td" => array("rowspan", "colspan"),
        "ul" => array("type"),
        "ol" => array("type", "reversed", "start"),
        "li" => array("type", "value"),
        "span", "dd", "dl", "dt", "caption", "thead", "tbody", "tfoot", "th", "tr", "p", "div", "br", "hr", "pre", "blockquote", "center", "h1", "h2", "h3", "h4", "h5", "h6", "abbr", "b", "strong", "big", "small", "cite", "code", "listing", "del", "s", "ins", "u", "em", "i", "kbd", "mark", "nobr", "q", "sub", "sup", "time");

    if (is_object($html)) {

        if ($html->hasChildNodes()) {
            // Walk backwards: unwrapping a child inserts its children at the same
            // position, which must not shift the indexes still to be visited.
            for ($i = $html->childNodes->length - 1; $i >= 0; --$i) {
                // A real variable is required because the parameter is a reference.
                $child = $html->childNodes->item($i);
                clean_html($child);
            }
        }

        $name = $html->nodeName;
        $listed = in_array($name, $whitelist, true) || isset($whitelist[$name]);

        if (!$listed && is_object($html->childNodes)) {
            // Unknown tag: keep its content, drop the tag itself.
            $fragment = $html->ownerDocument->createDocumentFragment();

            while ($html->childNodes->length > 0) {
                $fragment->appendChild($html->childNodes->item(0));
            }

            return $html->parentNode->replaceChild($fragment, $html);
        }

        if ($html->hasAttributes()) {
            if (isset($whitelist[$name]) && is_array($whitelist[$name])) {
                // Backwards again, because attributes are removed while iterating.
                for ($k = $html->attributes->length - 1; $k >= 0; --$k) {
                    $attribute = $html->attributes->item($k);
                    if (!in_array($attribute->nodeName, $whitelist[$name], true)) {
                        $html->removeAttributeNode($attribute);
                    } else {
                        $attribute->value = xss_clean((string) $attribute->value);
                    }
                }
            } else {
                // Tag allowed, but without any attributes.
                while ($html->hasAttributes()) {
                    $html->removeAttributeNode($html->attributes->item(0));
                }
            }
        }

        return null;
    }

    // Work on a copy so the caller's variable is not overwritten through the reference.
    $markup = (string) $html;
    if ($markup === '') {
        // loadHTML() rejects an empty string, and there is nothing to clean anyway.
        return '';
    }

    $implementation = new DOMImplementation();
    $doctype = $implementation->createDocumentType('html',
              '-//W3C//DTD XHTML 1.1//EN',
              'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd');
    $document = $implementation->createDocument('http://www.w3.org/1999/xhtml',
               'html',
               $doctype);
    $document->encoding = "UTF-8";
    $document->formatOutput = TRUE;
    $document->recover = TRUE;

    // Turn every non-ASCII character into a numeric entity so the parser does not
    // have to guess the input encoding (replaces the deprecated 'HTML-ENTITIES' target).
    $markup = mb_encode_numericentity($markup, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    if (!$document->loadHTML($markup)) {
        return null;
    }

    $root = $document->documentElement;
    if ($root !== null) {
        clean_html($root);
    }

    $result = "";
    foreach ($document->childNodes as $node) {
        $result .= $document->saveXML($node, LIBXML_NOEMPTYTAG) . "\n";
    }

    // Drop the DOCTYPE and any html/body tags left in the serialised output.
    return preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $result);
 }

 /**
  * Filter a string (or every element of an array) for common XSS vectors.
  *
  * Steps, in order: strip invisible control characters; normalise entities
  * and URL-decode; decode entities that sit inside tags; replace a fixed list
  * of forbidden strings and patterns with "[removed]"; neutralise PHP tags;
  * re-join keywords that were written with spaces between the letters; drop
  * dangerous href/src attribute values; remove event-handler, style, xmlns
  * and formaction attributes; entity-encode a blacklist of tags; and
  * entity-encode the parentheses of a list of function calls.
  *
  * @param string|array $str      Text to filter. Arrays are filtered element by
  *                               element (recursively, always as non-image data).
  * @param bool         $is_image When TRUE, nothing is returned but a verdict:
  *                               TRUE if the filters left the decoded text
  *                               unchanged, FALSE otherwise.
  * @return string|array|bool Filtered text (or array), or the verdict described
  *                           for $is_image.
  */
 function xss_clean($str, $is_image = FALSE) {

    // Placeholder that temporarily stands in for "&" in query strings; generated once.
    static $_xss_hash = "";
    $charset = 'UTF-8';

    if ($_xss_hash == "") {
        mt_srand();
        $_xss_hash = md5((string) (time() + mt_rand(0, 1999999999)));
    }

    /*
     * Is the string an array?
     */
    if (is_array($str)) {
        foreach (array_keys($str) as $key) {
            $str[$key] = xss_clean($str[$key]);
        }

        return $str;
    }

    // Removes every control character except newline (dec 10), carriage
    // return (dec 13) and horizontal tab (dec 09), repeating until stable.
    $strip_invisible = function ($text) {
        $non_displayables = array(
            '/%0[0-8bcef]/',                         // url encoded 00-08, 11, 12, 14, 15
            '/%1[0-9a-f]/',                          // url encoded 16-31
            '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S',  // 00-08, 11, 12, 14-31, 127
        );

        do {
            $text = preg_replace($non_displayables, "", $text, -1, $count);
        } while ($count);

        return $text;
    };

    // Keeps only the quoted name="value" pairs of a tag's attribute text and
    // strips C-style comments from them.
    $filter_attributes = function ($attributes) {
        $attributes = str_replace(array(
            '<',
            '>'
        ) , "", $attributes);

        $out = "";
        if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $attributes, $found)) {
            foreach ($found[0] as $attribute) {
                $out.= preg_replace("#/\*.*?\*/#s", "", $attribute);
            }
        }

        return $out;
    };

    /*
     * Remove Invisible Characters
     */
    $str = $strip_invisible($str);

    /*
     * Protect GET variables in URLs
     */
    $str = preg_replace('|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-]+)|i', $_xss_hash . "\\1=\\2", $str);

    /*
     * Validate standard character entities
     *
     * Add a semicolon if missing.  We do this to enable
     * the conversion of entities to ASCII later.
     */
    $str = preg_replace('#(&\#?[0-9a-z]{2,})([\x00-\x20])*;?#i', "\\1;\\2", $str);

    /*
     * Validate UTF16 two byte encoding (x00)
     *
     * Just as above, adds a semicolon if missing.
     */
    $str = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $str);

    /*
     * Un-Protect GET variables in URLs
     */
    $str = str_replace($_xss_hash, '&', $str);

    /*
     * URL Decode
     *
     * Just in case stuff like this is submitted:
     *
     * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
     *
     * Note: Use rawurldecode() so it does not remove plus signs
     */
    $str = rawurldecode($str);

    /*
     * Convert character entities to ASCII
     *
     * This permits our tests below to work reliably.
     * We only convert entities that are within tags since
     * these are the ones that will pose security problems.
     */
    $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", function ($match) {
        return str_replace(array(
            '>',
            '<',
            '\\'
        ) , array(
            '&gt;',
            '&lt;',
            '\\\\'
        ) , $match[0]);
    }
    , $str);

    $str = preg_replace_callback("/<\w+.*?(?=>|<|$)/si", function ($match) use ($charset) {
        if (stristr($match[0], '&') === FALSE) {
            return $match[0];
        }

        $out = html_entity_decode($match[0], ENT_COMPAT, $charset);

        // Numeric entities without a trailing semicolon are not handled by
        // html_entity_decode(); decode them to a single byte (value modulo 256).
        $out = preg_replace_callback('~&#x(0*[0-9a-f]{2,5})~i', function ($hex) {
            return chr(hexdec($hex[1]) & 0xFF);
        }
        , $out);

        return preg_replace_callback('~&#([0-9]{2,4})~', function ($dec) {
            return chr(((int) $dec[1]) & 0xFF);
        }
        , $out);
    }
    , $str);

    /*
     * Remove Invisible Characters Again!
     */
    $str = $strip_invisible($str);

    /*
     * Convert all tabs to spaces
     *
     * This prevents strings like this: ja  vascript
     * NOTE: we deal with spaces between characters later.
     * NOTE: preg_replace was found to be amazingly slow here on
     * large blocks of data, so we use str_replace.
     */
    if (strpos($str, "\t") !== FALSE) {
        $str = str_replace("\t", ' ', $str);
    }

    /*
     * Capture converted string for later comparison
     */
    $converted_string = $str;

    // Remove Strings that are never allowed
    $_never_allowed_str = array(
        'document.cookie' => '[removed]',
        'document.write' => '[removed]',
        '.parentNode' => '[removed]',
        '.innerHTML' => '[removed]',
        'window.location' => '[removed]',
        '-moz-binding' => '[removed]',
        '<!--' => '&lt;!--',
        '-->' => '--&gt;',
        '<![CDATA[' => '&lt;![CDATA[',
        '<comment>' => '&lt;comment&gt;'
    );

    $_never_allowed_regex = array(
        'javascript\s*:',
        'expression\s*(\(|&\#40;)',
        // CSS and IE
        'vbscript\s*:',
        // IE, surprise!
        'Redirect\s+302',
        "([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?"
    );

    $str = str_replace(array_keys($_never_allowed_str) , $_never_allowed_str, $str);

    foreach ($_never_allowed_regex as $regex) {
        $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
    }

    /*
     * Makes PHP tags safe
     *
     * Note: XML tags are inadvertently replaced too:
     *
     * <?xml
     *
     * But it doesn't seem to pose a problem.
     */
    if ($is_image === TRUE) {
        // Images have a tendency to have the PHP short opening and
        // closing tags every so often so we skip those and only
        // do the long opening tags.
        $str = preg_replace('/<\?(php)/i', "&lt;?\\1", $str);
    } else {
        $str = str_replace(array(
            '<?',
            '?' . '>'
        ) , array(
            '&lt;?',
            '?&gt;'
        ) , $str);
    }

    /*
     * Compact any exploded words
     *
     * This corrects words like:  j a v a s c r i p t
     * These words are compacted back to their correct state.
     */
    $words = array(
        'javascript',
        'expression',
        'vbscript',
        'script',
        'base64',
        'applet',
        'alert',
        'document',
        'write',
        'cookie',
        'window'
    );

    foreach ($words as $word) {
        $temp = "";

        for ($i = 0, $wordlen = strlen($word); $i < $wordlen; $i++) {
            $temp.= substr($word, $i, 1) . "\s*";
        }

        // We only want to do this when it is followed by a non-word character
        // That way valid stuff like "dealer to" does not become "dealerto"
        $str = preg_replace_callback('#(' . substr($temp, 0, -3) . ')(\W)#is', function ($matches) {
            return preg_replace('/\s+/s', "", $matches[1]) . $matches[2];
        }
        , $str);
    }

    /*
     * Remove disallowed Javascript in links or img tags
     *
     * Repeated until a pass changes nothing, so that a removal cannot
     * expose a new match.
     */
    do {
        $original = $str;

        if (preg_match("/<a/i", $str)) {
            $str = preg_replace_callback("#<a\s+([^>]*?)(>|$)#si", function ($match) use ($filter_attributes) {
                $replace = preg_replace('#href=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si', "", $filter_attributes($match[1]));
                return str_replace($match[1], $replace, $match[0]);
            }
            , $str);
        }

        if (preg_match("/<img/i", $str)) {
            $str = preg_replace_callback("#<img\s+([^>]*?)(\s?/?>|$)#si", function ($match) use ($filter_attributes) {
                $replace = preg_replace('#src=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si', "", $filter_attributes($match[1]));
                return str_replace($match[1], $replace, $match[0]);
            }
            , $str);
        }

        if (preg_match("/script/i", $str) OR preg_match("/xss/i", $str)) {
            $str = preg_replace("#<(/*)(script|xss)(.*?)\>#si", '[removed]', $str);
        }
    } while ($original !== $str);

    unset($original);

    // Remove evil attributes such as style, onclick and xmlns:
    // all javascript event handlers (e.g. onload, onclick, onmouseover), style, and xmlns
    $evil_attributes = array(
        'on\w*',
        'style',
        'xmlns',
        'formaction'
    );

    if ($is_image === TRUE) {
        /*
         * Adobe Photoshop puts XML metadata into JFIF images,
         * including namespacing, so we have to allow this for images.
         */
        unset($evil_attributes[array_search('xmlns', $evil_attributes) ]);
    }

    do {
        $count = 0;
        $attribs = array();

        // find occurrences of illegal attribute strings with quotes (042 and 047 are octal quotes)
        preg_match_all('/(' . implode('|', $evil_attributes) . ')\s*=\s*(\042|\047)([^\\2]*?)(\\2)/is', $str, $matches, PREG_SET_ORDER);

        foreach ($matches as $attr) {
            $attribs[] = preg_quote($attr[0], '/');
        }

        // find occurrences of illegal attribute strings without quotes
        preg_match_all('/(' . implode('|', $evil_attributes) . ')\s*=\s*([^\s>]*)/is', $str, $matches, PREG_SET_ORDER);

        foreach ($matches as $attr) {
            $attribs[] = preg_quote($attr[0], '/');
        }

        // replace illegal attribute strings that are inside an html tag
        if (count($attribs) > 0) {
            $str = preg_replace('/(<?)(\/?[^><]+?)([^A-Za-z<>\-])(.*?)(' . implode('|', $attribs) . ')(.*?)([\s><]?)([><]*)/i', '$1$2 $4$6$7$8', $str, -1, $count);
        }
    } while ($count);

    /*
     * Sanitize naughty HTML elements
     *
     * If a tag containing any of the words in the list
     * below is found, the tag gets converted to entities.
     *
     * So this: <blink>
     * Becomes: &lt;blink&gt;
     */
    $naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss';
    $str = preg_replace_callback('#<(/*\s*)(' . $naughty . ')([^><]*)([><]*)#is', function ($matches) {
        // encode opening brace
        $out = '&lt;' . $matches[1] . $matches[2] . $matches[3];

        // encode captured opening or closing brace to prevent recursive vectors
        $out.= str_replace(array('>', '<'), array('&gt;', '&lt;'), $matches[4]);

        return $out;
    }
    , $str);

    /*
     * Sanitize naughty scripting elements
     *
     * Similar to above, only instead of looking for
     * tags it looks for PHP and JavaScript commands
     * that are disallowed.  Rather than removing the
     * code, it simply converts the parenthesis to entities
     * rendering the code un-executable.
     *
     * For example: eval('some code')
     * Becomes:     eval&#40;'some code'&#41;
     */
    $str = preg_replace('#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si', "\\1\\2&#40;\\3&#41;", $str);

    // Final clean up
    // This adds a bit of extra precaution in case
    // something got through the above filters
    $str = str_replace(array_keys($_never_allowed_str) , $_never_allowed_str, $str);

    foreach ($_never_allowed_regex as $regex) {
        $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
    }

    /*
     * Images are Handled in a Special Way
     * - Essentially, we want to know that after all of the character
     * conversion is done whether any unwanted, likely XSS, code was found.
     * If not, we return TRUE, as the image is clean.
     * However, if the string post-conversion does not match the
     * string post-removal of XSS, then it fails, as there was unwanted XSS
     * code found and removed/changed during processing.
     */
    if ($is_image === TRUE) {
        return ($str == $converted_string) ? TRUE : FALSE;
    }

    return $str;
 }
	<?php

	/**
	 * Sanitise HTML against a fixed whitelist of tags and attributes.
	 *
	 * The call mode is chosen by the argument type:
	 *  - anything that is not an object is treated as markup: it is parsed with
	 *    DOMDocument::loadHTML(), the tree is cleaned, and the serialised result
	 *    is returned with the DOCTYPE and the html/body tags stripped;
	 *  - an object is treated as a DOM node and cleaned in place. Children are
	 *    cleaned first; a node whose name is not whitelisted is replaced by its
	 *    (already cleaned) children; a whitelisted node loses every attribute
	 *    that is not listed for its tag, and the values of the attributes that
	 *    stay are passed through xss_clean().
	 *
	 * @param mixed $html Markup string, or a DOM node to clean in place. The
	 *                    parameter is a reference, but the caller's variable is
	 *                    left untouched.
	 * @return mixed For markup: the cleaned string ('' for empty input, NULL when
	 *               loadHTML() reports failure). For a node: the result of
	 *               replaceChild() when the node was unwrapped, NULL otherwise.
	 */
	function clean_html(&$html) {
	    // Collect libxml errors internally; if that was already enabled, flush the buffer.
	    libxml_use_internal_errors(true) AND libxml_clear_errors();

	    // Plain string entries are tags allowed without attributes;
	    // "tag" => array(...) entries list the attributes allowed on that tag.
	    static $whitelist = array(
	        "#text",
	        "a" => array("target", "href", "name", "type", "rel", "download"),
	        "img" => array("src", "alt", "width", "height"),
	        "table" => array("width"),
	        "td" => array("rowspan", "colspan"),
	        "ul" => array("type"),
	        "ol" => array("type", "reversed", "start"),
	        "li" => array("type", "value"),
	        "span", "dd", "dl", "dt", "caption", "thead", "tbody", "tfoot", "th", "tr", "p", "div", "br", "hr", "pre", "blockquote", "center", "h1", "h2", "h3", "h4", "h5", "h6", "abbr", "b", "strong", "big", "small", "cite", "code", "listing", "del", "s", "ins", "u", "em", "i", "kbd", "mark", "nobr", "q", "sub", "sup", "time");

	    if (is_object($html)) {

	        if ($html->hasChildNodes()) {
	            // Walk backwards: unwrapping a child inserts its children at the same
	            // position, which must not shift the indexes still to be visited.
	            for ($i = $html->childNodes->length - 1; $i >= 0; --$i) {
	                // A real variable is required because the parameter is a reference.
	                $child = $html->childNodes->item($i);
	                clean_html($child);
	            }
	        }

	        $name = $html->nodeName;
	        $listed = in_array($name, $whitelist, true) || isset($whitelist[$name]);

	        if (!$listed && is_object($html->childNodes)) {
	            // Unknown tag: keep its content, drop the tag itself.
	            $fragment = $html->ownerDocument->createDocumentFragment();

	            while ($html->childNodes->length > 0) {
	                $fragment->appendChild($html->childNodes->item(0));
	            }

	            return $html->parentNode->replaceChild($fragment, $html);
	        }

	        if ($html->hasAttributes()) {
	            if (isset($whitelist[$name]) && is_array($whitelist[$name])) {
	                // Backwards again, because attributes are removed while iterating.
	                for ($k = $html->attributes->length - 1; $k >= 0; --$k) {
	                    $attribute = $html->attributes->item($k);
	                    if (!in_array($attribute->nodeName, $whitelist[$name], true)) {
	                        $html->removeAttributeNode($attribute);
	                    } else {
	                        $attribute->value = xss_clean((string) $attribute->value);
	                    }
	                }
	            } else {
	                // Tag allowed, but without any attributes.
	                while ($html->hasAttributes()) {
	                    $html->removeAttributeNode($html->attributes->item(0));
	                }
	            }
	        }

	        return null;
	    }

	    // Work on a copy so the caller's variable is not overwritten through the reference.
	    $markup = (string) $html;
	    if ($markup === '') {
	        // loadHTML() rejects an empty string, and there is nothing to clean anyway.
	        return '';
	    }

	    $implementation = new DOMImplementation();
	    $doctype = $implementation->createDocumentType('html',
	              '-//W3C//DTD XHTML 1.1//EN',
	              'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd');
	    $document = $implementation->createDocument('http://www.w3.org/1999/xhtml',
	               'html',
	               $doctype);
	    $document->encoding = "UTF-8";
	    $document->formatOutput = TRUE;
	    $document->recover = TRUE;

	    // Turn every non-ASCII character into a numeric entity so the parser does not
	    // have to guess the input encoding (replaces the deprecated 'HTML-ENTITIES' target).
	    $markup = mb_encode_numericentity($markup, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

	    if (!$document->loadHTML($markup)) {
	        return null;
	    }

	    $root = $document->documentElement;
	    if ($root !== null) {
	        clean_html($root);
	    }

	    $result = "";
	    foreach ($document->childNodes as $node) {
	        $result .= $document->saveXML($node, LIBXML_NOEMPTYTAG) . "\n";
	    }

	    // Drop the DOCTYPE and any html/body tags left in the serialised output.
	    return preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $result);
	}

	/**
	 * Filter a string (or every element of an array) for common XSS vectors.
	 *
	 * Steps, in order: strip invisible control characters; normalise entities
	 * and URL-decode; decode entities that sit inside tags; replace a fixed list
	 * of forbidden strings and patterns with "[removed]"; neutralise PHP tags;
	 * re-join keywords that were written with spaces between the letters; drop
	 * dangerous href/src attribute values; remove event-handler, style, xmlns
	 * and formaction attributes; entity-encode a blacklist of tags; and
	 * entity-encode the parentheses of a list of function calls.
	 *
	 * @param string|array $str      Text to filter. Arrays are filtered element by
	 *                               element (recursively, always as non-image data).
	 * @param bool         $is_image When TRUE, nothing is returned but a verdict:
	 *                               TRUE if the filters left the decoded text
	 *                               unchanged, FALSE otherwise.
	 * @return string|array|bool Filtered text (or array), or the verdict described
	 *                           for $is_image.
	 */
	function xss_clean($str, $is_image = FALSE) {

	    // Placeholder that temporarily stands in for "&" in query strings; generated once.
	    static $_xss_hash = "";
	    $charset = 'UTF-8';

	    if ($_xss_hash == "") {
	        mt_srand();
	        $_xss_hash = md5((string) (time() + mt_rand(0, 1999999999)));
	    }

	    /*
	     * Is the string an array?
	     */
	    if (is_array($str)) {
	        foreach (array_keys($str) as $key) {
	            $str[$key] = xss_clean($str[$key]);
	        }

	        return $str;
	    }

	    // Removes every control character except newline (dec 10), carriage
	    // return (dec 13) and horizontal tab (dec 09), repeating until stable.
	    $strip_invisible = function ($text) {
	        $non_displayables = array(
	            '/%0[0-8bcef]/',                         // url encoded 00-08, 11, 12, 14, 15
	            '/%1[0-9a-f]/',                          // url encoded 16-31
	            '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S',  // 00-08, 11, 12, 14-31, 127
	        );

	        do {
	            $text = preg_replace($non_displayables, "", $text, -1, $count);
	        } while ($count);

	        return $text;
	    };

	    // Keeps only the quoted name="value" pairs of a tag's attribute text and
	    // strips C-style comments from them.
	    $filter_attributes = function ($attributes) {
	        $attributes = str_replace(array(
	            '<',
	            '>'
	        ) , "", $attributes);

	        $out = "";
	        if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $attributes, $found)) {
	            foreach ($found[0] as $attribute) {
	                $out.= preg_replace("#/\*.*?\*/#s", "", $attribute);
	            }
	        }

	        return $out;
	    };

	    /*
	     * Remove Invisible Characters
	     */
	    $str = $strip_invisible($str);

	    /*
	     * Protect GET variables in URLs
	     */
	    $str = preg_replace('|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-]+)|i', $_xss_hash . "\\1=\\2", $str);

	    /*
	     * Validate standard character entities
	     *
	     * Add a semicolon if missing. We do this to enable
	     * the conversion of entities to ASCII later.
	     */
	    $str = preg_replace('#(&\#?[0-9a-z]{2,})([\x00-\x20])*;?#i', "\\1;\\2", $str);

	    /*
	     * Validate UTF16 two byte encoding (x00)
	     *
	     * Just as above, adds a semicolon if missing.
	     */
	    $str = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $str);

	    /*
	     * Un-Protect GET variables in URLs
	     */
	    $str = str_replace($_xss_hash, '&', $str);

	    /*
	     * URL Decode
	     *
	     * Just in case stuff like this is submitted:
	     *
	     * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
	     *
	     * Note: Use rawurldecode() so it does not remove plus signs
	     */
	    $str = rawurldecode($str);

	    /*
	     * Convert character entities to ASCII
	     *
	     * This permits our tests below to work reliably.
	     * We only convert entities that are within tags since
	     * these are the ones that will pose security problems.
	     */
	    $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", function ($match) {
	        return str_replace(array(
	            '>',
	            '<',
	            '\\'
	        ) , array(
	            '&gt;',
	            '&lt;',
	            '\\\\'
	        ) , $match[0]);
	    }
	    , $str);

	    $str = preg_replace_callback("/<\w+.*?(?=>|<|$)/si", function ($match) use ($charset) {
	        if (stristr($match[0], '&') === FALSE) {
	            return $match[0];
	        }

	        $out = html_entity_decode($match[0], ENT_COMPAT, $charset);

	        // Numeric entities without a trailing semicolon are not handled by
	        // html_entity_decode(); decode them to a single byte (value modulo 256).
	        $out = preg_replace_callback('~&#x(0*[0-9a-f]{2,5})~i', function ($hex) {
	            return chr(hexdec($hex[1]) & 0xFF);
	        }
	        , $out);

	        return preg_replace_callback('~&#([0-9]{2,4})~', function ($dec) {
	            return chr(((int) $dec[1]) & 0xFF);
	        }
	        , $out);
	    }
	    , $str);

	    /*
	     * Remove Invisible Characters Again!
	     */
	    $str = $strip_invisible($str);

	    /*
	     * Convert all tabs to spaces
	     *
	     * This prevents strings like this: ja vascript
	     * NOTE: we deal with spaces between characters later.
	     * NOTE: preg_replace was found to be amazingly slow here on
	     * large blocks of data, so we use str_replace.
	     */
	    if (strpos($str, "\t") !== FALSE) {
	        $str = str_replace("\t", ' ', $str);
	    }

	    /*
	     * Capture converted string for later comparison
	     */
	    $converted_string = $str;

	    // Remove Strings that are never allowed
	    $_never_allowed_str = array(
	        'document.cookie' => '[removed]',
	        'document.write' => '[removed]',
	        '.parentNode' => '[removed]',
	        '.innerHTML' => '[removed]',
	        'window.location' => '[removed]',
	        '-moz-binding' => '[removed]',
	        '<!--' => '&lt;!--',
	        '-->' => '--&gt;',
	        '<![CDATA[' => '&lt;![CDATA[',
	        '<comment>' => '&lt;comment&gt;'
	    );

	    $_never_allowed_regex = array(
	        'javascript\s*:',
	        'expression\s*(\(|&\#40;)',
	        // CSS and IE
	        'vbscript\s*:',
	        // IE, surprise!
	        'Redirect\s+302',
	        "([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?"
	    );

	    $str = str_replace(array_keys($_never_allowed_str) , $_never_allowed_str, $str);

	    foreach ($_never_allowed_regex as $regex) {
	        $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
	    }

	    /*
	     * Makes PHP tags safe
	     *
	     * Note: XML tags are inadvertently replaced too:
	     *
	     * <?xml
	     *
	     * But it doesn't seem to pose a problem.
	     */
	    if ($is_image === TRUE) {
	        // Images have a tendency to have the PHP short opening and
	        // closing tags every so often so we skip those and only
	        // do the long opening tags.
	        $str = preg_replace('/<\?(php)/i', "&lt;?\\1", $str);
	    } else {
	        $str = str_replace(array(
	            '<?',
	            '?' . '>'
	        ) , array(
	            '&lt;?',
	            '?&gt;'
	        ) , $str);
	    }

	    /*
	     * Compact any exploded words
	     *
	     * This corrects words like: j a v a s c r i p t
	     * These words are compacted back to their correct state.
	     */
	    $words = array(
	        'javascript',
	        'expression',
	        'vbscript',
	        'script',
	        'base64',
	        'applet',
	        'alert',
	        'document',
	        'write',
	        'cookie',
	        'window'
	    );

	    foreach ($words as $word) {
	        $temp = "";

	        for ($i = 0, $wordlen = strlen($word); $i < $wordlen; $i++) {
	            $temp.= substr($word, $i, 1) . "\s*";
	        }

	        // We only want to do this when it is followed by a non-word character
	        // That way valid stuff like "dealer to" does not become "dealerto"
	        $str = preg_replace_callback('#(' . substr($temp, 0, -3) . ')(\W)#is', function ($matches) {
	            return preg_replace('/\s+/s', "", $matches[1]) . $matches[2];
	        }
	        , $str);
	    }

	    /*
	     * Remove disallowed Javascript in links or img tags
	     *
	     * Repeated until a pass changes nothing, so that a removal cannot
	     * expose a new match.
	     */
	    do {
	        $original = $str;

	        if (preg_match("/<a/i", $str)) {
	            $str = preg_replace_callback("#<a\s+([^>]*?)(>|$)#si", function ($match) use ($filter_attributes) {
	                $replace = preg_replace('#href=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si', "", $filter_attributes($match[1]));
	                return str_replace($match[1], $replace, $match[0]);
	            }
	            , $str);
	        }

	        if (preg_match("/<img/i", $str)) {
	            $str = preg_replace_callback("#<img\s+([^>]*?)(\s?/?>|$)#si", function ($match) use ($filter_attributes) {
	                $replace = preg_replace('#src=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si', "", $filter_attributes($match[1]));
	                return str_replace($match[1], $replace, $match[0]);
	            }
	            , $str);
	        }

	        if (preg_match("/script/i", $str) OR preg_match("/xss/i", $str)) {
	            $str = preg_replace("#<(/*)(script|xss)(.*?)\>#si", '[removed]', $str);
	        }
	    } while ($original !== $str);

	    unset($original);

	    // Remove evil attributes such as style, onclick and xmlns:
	    // all javascript event handlers (e.g. onload, onclick, onmouseover), style, and xmlns
	    $evil_attributes = array(
	        'on\w*',
	        'style',
	        'xmlns',
	        'formaction'
	    );

	    if ($is_image === TRUE) {
	        /*
	         * Adobe Photoshop puts XML metadata into JFIF images,
	         * including namespacing, so we have to allow this for images.
	         */
	        unset($evil_attributes[array_search('xmlns', $evil_attributes) ]);
	    }

	    do {
	        $count = 0;
	        $attribs = array();

	        // find occurrences of illegal attribute strings with quotes (042 and 047 are octal quotes)
	        preg_match_all('/(' . implode('|', $evil_attributes) . ')\s*=\s*(\042|\047)([^\\2]*?)(\\2)/is', $str, $matches, PREG_SET_ORDER);

	        foreach ($matches as $attr) {
	            $attribs[] = preg_quote($attr[0], '/');
	        }

	        // find occurrences of illegal attribute strings without quotes
	        preg_match_all('/(' . implode('|', $evil_attributes) . ')\s*=\s*([^\s>]*)/is', $str, $matches, PREG_SET_ORDER);

	        foreach ($matches as $attr) {
	            $attribs[] = preg_quote($attr[0], '/');
	        }

	        // replace illegal attribute strings that are inside an html tag
	        if (count($attribs) > 0) {
	            $str = preg_replace('/(<?)(\/?[^><]+?)([^A-Za-z<>\-])(.*?)(' . implode('|', $attribs) . ')(.*?)([\s><]?)([><]*)/i', '$1$2 $4$6$7$8', $str, -1, $count);
	        }
	    } while ($count);

	    /*
	     * Sanitize naughty HTML elements
	     *
	     * If a tag containing any of the words in the list
	     * below is found, the tag gets converted to entities.
	     *
	     * So this: <blink>
	     * Becomes: &lt;blink&gt;
	     */
	    $naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss';
	    $str = preg_replace_callback('#<(/*\s*)(' . $naughty . ')([^><]*)([><]*)#is', function ($matches) {
	        // encode opening brace
	        $out = '&lt;' . $matches[1] . $matches[2] . $matches[3];

	        // encode captured opening or closing brace to prevent recursive vectors
	        $out.= str_replace(array('>', '<'), array('&gt;', '&lt;'), $matches[4]);

	        return $out;
	    }
	    , $str);

	    /*
	     * Sanitize naughty scripting elements
	     *
	     * Similar to above, only instead of looking for
	     * tags it looks for PHP and JavaScript commands
	     * that are disallowed. Rather than removing the
	     * code, it simply converts the parenthesis to entities
	     * rendering the code un-executable.
	     *
	     * For example: eval('some code')
	     * Becomes:     eval&#40;'some code'&#41;
	     */
	    $str = preg_replace('#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si', "\\1\\2&#40;\\3&#41;", $str);

	    // Final clean up
	    // This adds a bit of extra precaution in case
	    // something got through the above filters
	    $str = str_replace(array_keys($_never_allowed_str) , $_never_allowed_str, $str);

	    foreach ($_never_allowed_regex as $regex) {
	        $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
	    }

	    /*
	     * Images are Handled in a Special Way
	     * - Essentially, we want to know that after all of the character
	     * conversion is done whether any unwanted, likely XSS, code was found.
	     * If not, we return TRUE, as the image is clean.
	     * However, if the string post-conversion does not match the
	     * string post-removal of XSS, then it fails, as there was unwanted XSS
	     * code found and removed/changed during processing.
	     */
	    if ($is_image === TRUE) {
	        return ($str == $converted_string) ? TRUE : FALSE;
	    }

	    return $str;
	}