Created
May 28, 2009 19:25
-
-
Save jaywilliams/119517 to your computer and use it in GitHub Desktop.
This simple function will remove any non-ASCII character. Feel free to fork and extend!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Reduce a string to plain ASCII.
 *
 * A handful of common typographic characters (matched by their UTF-8 byte
 * sequences) are first swapped for ASCII stand-ins. Afterwards every byte
 * outside the range 0x01-0x7F is dropped, which also removes NUL bytes.
 *
 * @param string $string Text to convert.
 * @return string The converted text.
 * @author Jay Williams <myd3.com>
 * @license MIT License
 * @link http://gist.github.com/119517
 */
function convert_ascii($string)
{
    // UTF-8 byte sequence => ASCII stand-in, applied in this order.
    $asciiFor = array(
        "\xE2\x80\x98" => "'",   // left single curly quote
        "\xE2\x80\x99" => "'",   // right single curly quote
        "\xE2\x80\x9C" => '"',   // left double curly quote
        "\xE2\x80\x9D" => '"',   // right double curly quote
        "\xE2\x80\x93" => '--',  // en dash
        "\xE2\x80\x94" => '---', // em dash
        "\xE2\x80\xA2" => '*',   // bullet
        "\xC2\xB7"     => '*',   // middle dot
        "\xE2\x80\xA6" => '...', // horizontal ellipsis
    );

    $converted = str_replace(array_keys($asciiFor), array_values($asciiFor), $string);

    // The pattern has no "u" modifier, so it works byte-by-byte: whatever is
    // left of any multi-byte character is stripped here.
    return preg_replace("/[^\x01-\x7F]/", "", $converted);
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@paulintrognon, that's correct; however, if you have some invalid UTF-8 input, it will error out, so that only works with valid Unicode.