Skip to content

Instantly share code, notes, and snippets.

@kevinruscoe
Created March 16, 2015 13:27
Show Gist options
  • Save kevinruscoe/3400629cebcf68ba2194 to your computer and use it in GitHub Desktop.
Save kevinruscoe/3400629cebcf68ba2194 to your computer and use it in GitHub Desktop.
Clean String
<?php
/**
 * Reduce an HTML fragment (or arbitrary text) to one line of plain UTF-8 text.
 *
 * Steps, in order:
 *  1. Cast to string and drop every byte that is not part of a valid UTF-8
 *     sequence.
 *  2. Remove elements whose content is never meant to be read (head, style,
 *     script, ...), including everything between their tags.
 *  3. Put a line break in front of block-level tags so that text from adjacent
 *     blocks does not run together once the tags are stripped.
 *  4. Strip all remaining tags.
 *  5. Replace control and invisible formatting characters with a space.
 *  6. Collapse whitespace runs to a single space and trim both ends.
 *
 * HTML entities are not decoded.
 *
 * @param mixed $input Text or HTML; it is cast to string (null becomes '').
 *
 * @return string Single-line text with no leading or trailing whitespace.
 */
function clean_string( $input ){
    $input = (string) $input;

    // The /u patterns below return null for a subject that is not valid UTF-8,
    // which would wipe the whole text, so the encoding is repaired first.
    // preg_match('//u', ...) only succeeds on well-formed UTF-8.
    if (preg_match('//u', $input) !== 1) {
        // Byte-level pattern (no /u): every well-formed sequence is skipped via
        // (*SKIP)(*FAIL); any byte that is left over matches "." and is removed.
        $input = preg_replace(
            '/(?:
                  [\x00-\x7F]
                | [\xC2-\xDF][\x80-\xBF]
                | \xE0[\xA0-\xBF][\x80-\xBF]
                | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
                | \xED[\x80-\x9F][\x80-\xBF]
                | \xF0[\x90-\xBF][\x80-\xBF]{2}
                | [\xF1-\xF3][\x80-\xBF]{3}
                | \xF4[\x80-\x8F][\x80-\xBF]{2}
              )(*SKIP)(*FAIL)
              | .
            /xs',
            '',
            $input
        ) ?? '';
    }

    // Clean HTML (based on: http://nadeausoftware.com/articles/2007/09/php_tip_how_strip_html_tags_web_page)
    // Remove invisible content. \b keeps <head from matching <header, and the
    // back-reference \1 pairs each opening tag with its own closing tag.
    $invisible = 'head|style|script|object|embed|applet|noframes|noscript|noembed';
    $input = preg_replace(
        '@<(' . $invisible . ')\b[^>]*>.*?</\1\s*>@siu',
        ' ',
        $input
    ) ?? $input; // on a PCRE failure keep the text; strip_tags() still removes the tags

    // Add a line break before block-level tags (opening and closing). The break
    // becomes a single space at the end, separating the text of adjacent blocks.
    $blocks = 'address|blockquote|center|del'
        . '|div|h[1-9]|ins|isindex|p|pre'
        . '|dir|dl|dt|dd|li|menu|ol|ul'
        . '|table|th|td|caption'
        . '|form|button|fieldset|legend|input'
        . '|label|select|optgroup|option|textarea'
        . '|frameset|frame|iframe';
    $input = preg_replace('@</?(?:' . $blocks . ')@iu', "\n\$0", $input) ?? $input;

    $input = strip_tags($input);

    // Based on: http://stackoverflow.com/a/20766625/164230
    // Replace control and formatting characters with a space:
    //   U+0000-U+001F  C0 controls (includes tab, CR, LF)
    //   U+0080-U+009F  C1 controls
    //   U+2000-U+200F  typographic spaces, zero-width and directional marks
    //   U+2024-U+2028  dot leaders, ellipsis, hyphenation point, line separator
    //   U+205F-U+206F  medium mathematical space and invisible format characters
    // Code-point ranges are used so that only these characters are hit; a
    // byte-level E2 [80-8F] [80-8F] range would also blank visible symbols
    // such as U+2200 or U+2103.
    $input = preg_replace(
        '/[\x{00}-\x{1F}\x{80}-\x{9F}\x{2000}-\x{200F}\x{2024}-\x{2028}\x{205F}-\x{206F}]/u',
        ' ',
        $input
    ) ?? $input;

    // Reduce all multiple whitespace to a single space.
    $input = preg_replace('/\s+/', ' ', $input) ?? $input;

    // Trim last: the replacements above can leave a space at either end.
    return trim($input);
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment