Created
January 31, 2012 03:14
-
-
Save ifree/1708506 to your computer and use it in GitHub Desktop.
#php sanitize html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
////////////////////////////////////////////
// sanitize function
// thanks to: http://www.programmingtalk.com/showthread.php?t=47560&page=2&highlight=injection
/**
 * Blacklist-style scrubber for untrusted text, followed by an optional
 * length limit and a per-type character whitelist.
 *
 * NOTE(review): this is defence in depth only. It does not replace
 * context-aware output escaping (htmlspecialchars) or parameterised SQL.
 *
 * Whitelists selected by $dtype (any other value skips the whitelist):
 *   1: digits, space, period and '-'
 *   2: ASCII letters, space, '_' and '~'
 *   3: ASCII letters, digits, space, period, colon, '_', '-' and '~'
 *   4: ASCII letters and digits plus the punctuation listed in the pattern,
 *      space and newline
 *   5: characters expected in an e-mail address
 *
 * @param mixed $dtype Whitelist selector, compared loosely so 1 and '1' both work.
 * @param mixed $dlen  Maximum length in bytes; 0 / '0' / '' means no limit.
 * @param mixed $data  Untrusted input; it is cast to string.
 *
 * @return string The scrubbed, truncated, filtered and trimmed value. Input
 *                that is not valid UTF-8 yields an empty string.
 */
function sanitize($dtype, $dlen, $data){
    $data = (string) $data;

    // Strip the SUB (0x1A) and NUL (0x00) control bytes. The bytes themselves
    // are removed, not the literal text "x1a" / "x00".
    $data = str_replace(array("\x1a", "\x00"), '', $data);
    $data = str_replace('../', '', $data);            // stop directory traversal
    $data = str_replace('--', ' - ', $data);          // stop MySQL comments
    $data = preg_replace('/%3A%2F%2F/i', '', $data);  // stop URL-encoded '://', either case

    // Remove the literal two-character sequence backslash-zero, so input such
    // as Java\0script cannot be used to split a keyword.
    $data = preg_replace('/(\\\\0)+/', '', $data);

    // Normalise character entities so they can be decoded further down:
    //   1. drop whitespace/control characters sitting before the semicolon;
    //   2. make sure numeric entities end in exactly one semicolon.
    $data = preg_replace(
        array('#(&\#*\w+)[\x00-\x20]+;#u', '#(&\#x*)([0-9A-F]+);*#iu'),
        array('\\1;', '\\1\\2;'),
        $data
    );
    if ($data === null) {
        // The /u patterns fail on input that is not valid UTF-8; treat that
        // input as empty instead of passing null to the calls below.
        $data = '';
    }

    // Turn URL-encoded bytes (%XX and %u0XXX) into hex entities. urldecode()
    // is avoided because it would also convert plus signs. Only real hex
    // digits are accepted so ordinary text such as "%of" is left alone.
    $data = preg_replace('/%u0([0-9a-f]{3})/i', '&#x\\1;', $data);
    $data = preg_replace('/%([0-9a-f]{2})/i', '&#x\\1;', $data);

    // Decode entities found inside <...> so the tag checks below see the
    // real characters.
    if (preg_match_all('/<(.+?)>/si', $data, $matches)) {
        foreach ($matches[1] as $tagBody) {
            $data = str_replace($tagBody, html_entity_decode($tagBody, ENT_COMPAT, 'UTF-8'), $data);
        }
    }

    // Collapse tabs to a single space (defeats "ja<TAB>vascript").
    $data = preg_replace("#\t+#", ' ', $data);

    // Neutralise PHP open/close tags by escaping the angle brackets.
    // XML declarations (<?xml) are escaped as a side effect.
    $data = str_replace(
        array('<?php', '<?PHP', '<?', '?>'),
        array('&lt;?php', '&lt;?PHP', '&lt;?', '?&gt;'),
        $data
    );

    // Re-join keywords that were spread out with whitespace
    // (j a v a s c r i p t), in lower-case and capitalised form.
    $words = array('javascript', 'vbscript', 'script', 'applet', 'alert', 'document', 'write', 'cookie', 'window');
    foreach ($words as $word) {
        $spaced = implode('\s*', str_split($word));
        $data = preg_replace('#'.$spaced.'#s', $word, $data);
        $data = preg_replace('#'.ucfirst($spaced).'#s', ucfirst($word), $data);
    }

    // Drop links and images whose URL carries script, then any <script>/<xss> tag.
    $data = preg_replace("#<a.+?href=.*?(alert\(|alert&\#40;|javascript\:|window\.|document\.|\.cookie|<script|<xss).*?\>.*?</a>#si", "", $data);
    $data = preg_replace("#<img.+?src=.*?(alert\(|alert&\#40;|javascript\:|window\.|document\.|\.cookie|<script|<xss).*?\>#si", "", $data);
    $data = preg_replace("#<(script|xss).*?\>#si", "", $data);

    // Remove JavaScript event handlers. Deliberately blunt: everything from
    // the handler name up to the closing '>' is discarded.
    $data = preg_replace('#(<[^>]+.*?)(onblur|onchange|onclick|onfocus|onload|onmouseover|onmouseup|onmousedown|onselect|onsubmit|onunload|onkeypress|onkeydown|onkeyup|onresize)[^>]*>#iU', "\\1>", $data);

    // Escape the angle brackets of disallowed elements:
    // <blink> becomes &lt;blink&gt;
    $data = preg_replace('#<(/*\s*)(alert|applet|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|layer|link|meta|object|plaintext|style|script|textarea|title|xml|xss)([^>]*)>#is', "&lt;\\1\\2\\3&gt;", $data);

    // Escape the parentheses of dangerous-looking calls so they cannot run:
    // eval('some code') becomes eval&#40;'some code'&#41;
    $data = preg_replace('#(alert|cmd|passthru|eval|exec|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si', "\\1\\2&#40;\\3&#41;", $data);

    // Final sweep in case something got through the filters above.
    // Keys are regular expressions, matched case-insensitively.
    $bad = array(
        'document.cookie' => '',
        'document.write'  => '',
        'window.location' => '',
        "javascript\s*:"  => '',
        "Redirect\s+302"  => '',
        '<!--'            => '&lt;!--',
        '-->'             => '--&gt;'
    );
    foreach ($bad as $pattern => $replacement) {
        $data = preg_replace('#'.$pattern.'#i', $replacement, $data);
    }

    // Apply the length limit before the whitelist so the returned value can
    // never exceed it. The limit counts bytes, not characters.
    $limit = (int) $dlen;
    if ($limit !== 0) {
        $data = substr($data, 0, $limit);
    }

    // Per-type whitelists: every character NOT in the class is removed.
    $filters = array(
        '1' => "/[^0-9\-\ \.]/",
        '2' => "/[^a-zA-Z~\ \_]/",
        '3' => "/[^0-9a-zA-Z~\-\ \.\:\_]/",
        '4' => "|[^0-9a-zA-Z~@#$%=:;_, \\n\\\!\^&\*\(\)\-\+\.\?\/\'\"]|",
        // NOTE(review): the published copy of this class was destroyed by
        // e-mail obfuscation; this set is a reconstruction. Confirm before
        // relying on it for addresses that use other characters such as '+'.
        '5' => "|[^0-9a-zA-Z_@\-\.]|"
    );
    foreach ($filters as $type => $pattern) {
        // Loose comparison on purpose: callers pass both 1 and '1'.
        if ($dtype == (string) $type) {
            $data = preg_replace($pattern, '', $data);
        }
    }

    return trim($data);
}
// end sanitize
////////////////////////////////////////////
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment