Created
March 16, 2015 13:27
-
-
Save kevinruscoe/3400629cebcf68ba2194 to your computer and use it in GitHub Desktop.
Clean String
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Reduce an HTML (or plain-text) string to one line of clean UTF-8 text.
 *
 * Steps, in order:
 *  1. Drop every byte that is not part of a well-formed UTF-8 sequence.
 *  2. Remove elements whose content is never displayed (head, style,
 *     script, ...) and insert a line break in front of block-level tags so
 *     neighbouring blocks do not run together once the tags are gone.
 *  3. Strip all remaining tags.
 *  4. Replace control / formatting characters with a space.
 *  5. Collapse every whitespace run to a single space and trim both ends.
 *
 * HTML entities are not decoded.
 *
 * @param string $input Raw text, possibly containing HTML and invalid UTF-8.
 * @return string The cleaned, single-line text.
 */
function clean_string( $input ){
    // Invalid UTF-8 has to be removed first: every pattern below that uses
    // the /u modifier makes preg_replace() return NULL for a malformed
    // subject, which would silently turn the whole input into ''.
    // This is done with a byte-wise pattern instead of iconv('//IGNORE'),
    // whose treatment of illegal sequences depends on the platform's iconv
    // library. Each match is either one well-formed sequence (kept through
    // $1) or a single stray byte (dropped because $1 is then empty).
    $scrubbed = preg_replace(
        '/('
        . '[\x00-\x7F]++'
        . '|[\xC2-\xDF][\x80-\xBF]'
        . '|\xE0[\xA0-\xBF][\x80-\xBF]'
        . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
        . '|\xED[\x80-\x9F][\x80-\xBF]'
        . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
        . '|[\xF1-\xF3][\x80-\xBF]{3}'
        . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'
        . ')|./s',
        '$1',
        $input
    );
    if ($scrubbed !== null) {
        $input = $scrubbed;
    }

    // Clean HTML (From: http://nadeausoftware.com/articles/2007/09/php_tip_how_strip_html_tags_web_page)
    // Elements whose content must not end up in the text at all.
    $invisible_elements = array(
        '@<head[^>]*?>.*?</head>@siu',
        '@<style[^>]*?>.*?</style>@siu',
        '@<script[^>]*?.*?</script>@siu',
        '@<object[^>]*?.*?</object>@siu',
        '@<embed[^>]*?.*?</embed>@siu',
        '@<applet[^>]*?.*?</applet>@siu',
        '@<noframes[^>]*?.*?</noframes>@siu',
        '@<noscript[^>]*?.*?</noscript>@siu',
        '@<noembed[^>]*?.*?</noembed>@siu',
    );
    // Opening/closing block-level tags; a newline is put in front of each.
    $block_tags = array(
        '@</?((address)|(blockquote)|(center)|(del))@iu',
        '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
        '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
        '@</?((table)|(th)|(td)|(caption))@iu',
        '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
        '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
        '@</?((frameset)|(frame)|(iframe))@iu',
    );

    // preg_replace() yields NULL when PCRE gives up (e.g. backtrack limit on
    // a very large document). Keep the text from before the failed step
    // rather than losing it; strip_tags() below still removes the tags.
    $without_invisible = preg_replace($invisible_elements, ' ', $input);
    if ($without_invisible !== null) {
        $input = $without_invisible;
    }
    $with_breaks = preg_replace($block_tags, "\n\$0", $input);
    if ($with_breaks !== null) {
        $input = $with_breaks;
    }

    // From: http://stackoverflow.com/a/20766625/164230
    $input = strip_tags( $input );

    // Byte sequences reported to make MySQL complain (control and formatting
    // characters, per the original author); each one becomes a space.
    // NOTE(review): the \xE2[\x80-\x8F]{2} alternative also matches printable
    // characters such as the subscript digits U+2080-U+2089 - confirm that
    // this is intended before relying on it.
    $input = preg_replace('/(?>[\x00-\x1F]|\xC2[\x80-\x9F]|\xE2[\x80-\x8F]{2}|\xE2\x80[\xA4-\xA8]|\xE2\x81[\x9F-\xAF])/', ' ', $input);

    // Reduce all multiple whitespace to a single space.
    $input = preg_replace('/\s+/', ' ', $input);

    // Trim last: the two replacements above can leave a space at either end.
    return trim($input);
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment