Forked from salipro4ever/PHP Clean String of UTF8 Chars – Convert to similar ASCII char
Created
October 2, 2018 12:11
-
-
Save markeriz/69c43fc97f15e24a98184278e1ad6a40 to your computer and use it in GitHub Desktop.
PHP Clean String of UTF8 Chars – Convert to similar ASCII char
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Returns a string cleaned of non-ASCII characters, converting accented letters
 * and typographic punctuation to a similar ASCII character first.
 * www.unexpectedit.com
 *
 * Steps, in order:
 *   1) accented vowels, c-cedilla, n-tilde, curly/angle quotes, the en dash and
 *      the non-breaking space are replaced by an ASCII look-alike;
 *   2) HTML entities listed by get_html_translation_table() are turned back into
 *      characters, and the euro sign becomes the word "euro";
 *   3) HTML/PHP tags are stripped;
 *   4) any remaining entities are decoded;
 *   5) line breaks become a single space and tabs are dropped;
 *   6) every byte outside \x20-\x7F is removed.
 *
 * @param string $text Text to clean; it must be valid UTF-8.
 * @return string ASCII-only text, or '' when $text is not valid UTF-8.
 */
function cleanString($text) {
    $text = (string) $text;

    // The /u patterns below yield null for malformed UTF-8; that null used to be
    // passed on to every later call and came out as ''. Return '' explicitly so
    // the result is the same without handing null to string functions.
    if (preg_match('//u', $text) !== 1) {
        return '';
    }

    // 1) convert á ô => a o
    $lookalikes = array(
        "/[áàâãªä]/u" => "a",
        "/[ÁÀÂÃÄ]/u" => "A",
        "/[ÍÌÎÏ]/u" => "I",
        "/[íìîï]/u" => "i",
        "/[éèêë]/u" => "e",
        "/[ÉÈÊË]/u" => "E",
        "/[óòôõºö]/u" => "o",
        "/[ÓÒÔÕÖ]/u" => "O",
        "/[úùûü]/u" => "u",
        "/[ÚÙÛÜ]/u" => "U",
        "/[’‘‹›‚]/u" => "'",
        "/[“”«»„]/u" => '"',
    );
    $text = preg_replace(array_keys($lookalikes), array_values($lookalikes), $text);

    // "\xC2\xA0" is the UTF-8 non-breaking space, spelled out as bytes because
    // the literal character is indistinguishable from a normal space in source.
    $text = str_replace(
        array("–", "\xC2\xA0", "ç", "Ç", "ñ", "Ñ"),
        array("-", " ", "c", "C", "n", "N"),
        $text
    );

    // 2) Translation CP1252: start from PHP's character => entity table and add
    // the Windows-1252 bytes 130-159 plus a textual replacement for the euro sign.
    // NOTE(review): the values below are literal characters in this copy of the
    // code; if upstream used named entities (e.g. "&sbquo;") here, confirm before
    // changing them.
    $trans = get_html_translation_table(HTML_ENTITIES);
    $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark
    $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook
    $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark
    $trans[chr(133)] = '…'; // Horizontal Ellipsis
    $trans[chr(134)] = '†'; // Dagger
    $trans[chr(135)] = '‡'; // Double Dagger
    $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent
    $trans[chr(137)] = '‰'; // Per Mille Sign
    $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron
    $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark
    $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE
    $trans[chr(145)] = '‘'; // Left Single Quotation Mark
    $trans[chr(146)] = '’'; // Right Single Quotation Mark
    $trans[chr(147)] = '“'; // Left Double Quotation Mark
    $trans[chr(148)] = '”'; // Right Double Quotation Mark
    $trans[chr(149)] = '•'; // Bullet
    $trans[chr(150)] = '–'; // En Dash
    $trans[chr(151)] = '—'; // Em Dash
    $trans[chr(152)] = '˜'; // Small Tilde
    $trans[chr(153)] = '™'; // Trade Mark Sign
    $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron
    $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark
    $trans[chr(156)] = 'œ'; // Latin Small Ligature OE
    $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis
    $trans['euro'] = '€'; // euro currency symbol
    ksort($trans);

    // Replace each table value with its key, one pair after another in key order.
    $text = str_replace(array_values($trans), array_keys($trans), $text);

    // 3) remove <p>, <br/> ...
    $text = strip_tags($text);

    // 4) decode whatever entities are still present
    $text = html_entity_decode($text);

    // 5) line breaks => space, tabs => nothing. This has to run before step 6,
    // which deletes control characters, and it needs double-quoted strings:
    // '\n' in single quotes is a backslash followed by "n", not a newline.
    $text = str_replace(
        array("\r\n", "\n", "\r", "\t"),
        array(" ", " ", " ", ""),
        $text
    );

    // 6) remove everything outside \x20-\x7F (Windows-1252 symbols such as
    // "TradeMark", leftover multi-byte characters, control characters)
    $text = preg_replace('/[^\x20-\x7F]+/', '', $text);

    //XML compatible
    /*
    $text = str_replace("&", "and", $text);
    $text = str_replace("<", ".", $text);
    $text = str_replace(">", ".", $text);
    $text = str_replace("\\", "-", $text);
    $text = str_replace("/", "-", $text);
    */
    return $text;
}
Usage: | |
$val = "Arômes ... óòôõº ... áéíóú ... Barça ... “Windows quotes” ... this is not a normal space ( ) ... this is not a normal dash (–) ... Esdrújula ... Wünderlist ...   ... & ... & ... ’ ... – ... £ ... € ... ... ...";
echo cleanString($val);
//result: Aromes ... ooooo ... aeiou ... Barca ... "Windows quotes" ... this is not a normal space ( ) ... this is not a normal dash (-) ... Esdrujula ... Wunderlist ... ... & ... & ... ... ... ... euro ... ... ...
//Note: If you get an empty string, make sure you pass a UTF-8 string to the function.
// utf8_encode() is deprecated as of PHP 8.2; mb_convert_encoding() from
// ISO-8859-1 performs the same conversion.
echo cleanString(mb_convert_encoding($val, 'UTF-8', 'ISO-8859-1'));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment