Created
April 24, 2014 03:21
-
-
Save stormsweeper/11240363 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * A utility class to clean up common problems with UTF-8 strings.
 *
 * All methods are static, operate on raw UTF-8 byte strings, and return a new
 * string; the input is never modified.
 */
class UnicodeUtils {
    /**
     * Maps double-encoded UTF-8 byte sequences back to single encoded UTF-8. The keys are byte sequences where a valid
     * UTF-8 character has been interpreted as multiple characters in CP1252, and then re-converted
     * to UTF-8 characters. The values are the UTF-8 character byte sequence that was double encoded.
     *
     * Every key begins with 0xC3 followed by one of 0x82, 0x83, 0x85, 0x86, 0x8B or 0xA2;
     * dedupUTF8() relies on that to skip strings that cannot contain a match.
     */
    private static $duped_utf8_mapping = [
        "\xC3\x82\xC2\xA0" => "\xC2\xA0",
        "\xC3\x82\xC2\xA1" => "\xC2\xA1",
        "\xC3\x82\xC2\xA2" => "\xC2\xA2",
        "\xC3\x82\xC2\xA3" => "\xC2\xA3",
        "\xC3\x82\xC2\xA4" => "\xC2\xA4",
        "\xC3\x82\xC2\xA5" => "\xC2\xA5",
        "\xC3\x82\xC2\xA6" => "\xC2\xA6",
        "\xC3\x82\xC2\xA7" => "\xC2\xA7",
        "\xC3\x82\xC2\xA8" => "\xC2\xA8",
        "\xC3\x82\xC2\xA9" => "\xC2\xA9",
        "\xC3\x82\xC2\xAA" => "\xC2\xAA",
        "\xC3\x82\xC2\xAB" => "\xC2\xAB",
        "\xC3\x82\xC2\xAC" => "\xC2\xAC",
        "\xC3\x82\xC2\xAD" => "\xC2\xAD",
        "\xC3\x82\xC2\xAE" => "\xC2\xAE",
        "\xC3\x82\xC2\xAF" => "\xC2\xAF",
        "\xC3\x82\xC2\xB0" => "\xC2\xB0",
        "\xC3\x82\xC2\xB1" => "\xC2\xB1",
        "\xC3\x82\xC2\xB2" => "\xC2\xB2",
        "\xC3\x82\xC2\xB3" => "\xC2\xB3",
        "\xC3\x82\xC2\xB4" => "\xC2\xB4",
        "\xC3\x82\xC2\xB5" => "\xC2\xB5",
        "\xC3\x82\xC2\xB6" => "\xC2\xB6",
        "\xC3\x82\xC2\xB7" => "\xC2\xB7",
        "\xC3\x82\xC2\xB8" => "\xC2\xB8",
        "\xC3\x82\xC2\xB9" => "\xC2\xB9",
        "\xC3\x82\xC2\xBA" => "\xC2\xBA",
        "\xC3\x82\xC2\xBB" => "\xC2\xBB",
        "\xC3\x82\xC2\xBC" => "\xC2\xBC",
        "\xC3\x82\xC2\xBD" => "\xC2\xBD",
        "\xC3\x82\xC2\xBE" => "\xC2\xBE",
        "\xC3\x82\xC2\xBF" => "\xC2\xBF",
        "\xC3\x83\xC2\x81" => "\xC3\x81",
        "\xC3\x83\xC2\x8D" => "\xC3\x8D",
        "\xC3\x83\xC2\x8F" => "\xC3\x8F",
        "\xC3\x83\xC2\x90" => "\xC3\x90",
        "\xC3\x83\xC2\x9D" => "\xC3\x9D",
        "\xC3\x83\xC2\xA0" => "\xC3\xA0",
        "\xC3\x83\xC2\xA1" => "\xC3\xA1",
        "\xC3\x83\xC2\xA2" => "\xC3\xA2",
        "\xC3\x83\xC2\xA3" => "\xC3\xA3",
        "\xC3\x83\xC2\xA4" => "\xC3\xA4",
        "\xC3\x83\xC2\xA5" => "\xC3\xA5",
        "\xC3\x83\xC2\xA6" => "\xC3\xA6",
        "\xC3\x83\xC2\xA7" => "\xC3\xA7",
        "\xC3\x83\xC2\xA8" => "\xC3\xA8",
        "\xC3\x83\xC2\xA9" => "\xC3\xA9",
        "\xC3\x83\xC2\xAA" => "\xC3\xAA",
        "\xC3\x83\xC2\xAB" => "\xC3\xAB",
        "\xC3\x83\xC2\xAC" => "\xC3\xAC",
        "\xC3\x83\xC2\xAD" => "\xC3\xAD",
        "\xC3\x83\xC2\xAE" => "\xC3\xAE",
        "\xC3\x83\xC2\xAF" => "\xC3\xAF",
        "\xC3\x83\xC2\xB0" => "\xC3\xB0",
        "\xC3\x83\xC2\xB1" => "\xC3\xB1",
        "\xC3\x83\xC2\xB2" => "\xC3\xB2",
        "\xC3\x83\xC2\xB3" => "\xC3\xB3",
        "\xC3\x83\xC2\xB4" => "\xC3\xB4",
        "\xC3\x83\xC2\xB5" => "\xC3\xB5",
        "\xC3\x83\xC2\xB6" => "\xC3\xB6",
        "\xC3\x83\xC2\xB7" => "\xC3\xB7",
        "\xC3\x83\xC2\xB8" => "\xC3\xB8",
        "\xC3\x83\xC2\xB9" => "\xC3\xB9",
        "\xC3\x83\xC2\xBA" => "\xC3\xBA",
        "\xC3\x83\xC2\xBB" => "\xC3\xBB",
        "\xC3\x83\xC2\xBC" => "\xC3\xBC",
        "\xC3\x83\xC2\xBD" => "\xC3\xBD",
        "\xC3\x83\xC2\xBE" => "\xC3\xBE",
        "\xC3\x83\xC2\xBF" => "\xC3\xBF",
        "\xC3\x83\xC5\x92" => "\xC3\x8C",
        "\xC3\x83\xC5\x93" => "\xC3\x9C",
        "\xC3\x83\xC5\xA0" => "\xC3\x8A",
        "\xC3\x83\xC5\xA1" => "\xC3\x9A",
        "\xC3\x83\xC5\xB8" => "\xC3\x9F",
        "\xC3\x83\xC5\xBD" => "\xC3\x8E",
        "\xC3\x83\xC5\xBE" => "\xC3\x9E",
        "\xC3\x83\xC6\x92" => "\xC3\x83",
        "\xC3\x83\xCB\x86" => "\xC3\x88",
        "\xC3\x83\xCB\x9C" => "\xC3\x98",
        "\xC3\x83\xE2\x80\x93" => "\xC3\x96",
        "\xC3\x83\xE2\x80\x94" => "\xC3\x97",
        "\xC3\x83\xE2\x80\x98" => "\xC3\x91",
        "\xC3\x83\xE2\x80\x99" => "\xC3\x92",
        "\xC3\x83\xE2\x80\x9A" => "\xC3\x82",
        "\xC3\x83\xE2\x80\x9C" => "\xC3\x93",
        "\xC3\x83\xE2\x80\x9D" => "\xC3\x94",
        "\xC3\x83\xE2\x80\x9E" => "\xC3\x84",
        "\xC3\x83\xE2\x80\xA0" => "\xC3\x86",
        "\xC3\x83\xE2\x80\xA1" => "\xC3\x87",
        "\xC3\x83\xE2\x80\xA2" => "\xC3\x95",
        "\xC3\x83\xE2\x80\xA6" => "\xC3\x85",
        "\xC3\x83\xE2\x80\xB0" => "\xC3\x89",
        "\xC3\x83\xE2\x80\xB9" => "\xC3\x8B",
        "\xC3\x83\xE2\x80\xBA" => "\xC3\x9B",
        "\xC3\x83\xE2\x82\xAC" => "\xC3\x80",
        "\xC3\x83\xE2\x84\xA2" => "\xC3\x99",
        "\xC3\x85\xC2\xA0" => "\xC5\xA0",
        "\xC3\x85\xC2\xA1" => "\xC5\xA1",
        "\xC3\x85\xC2\xB8" => "\xC5\xB8",
        "\xC3\x85\xC2\xBD" => "\xC5\xBD",
        "\xC3\x85\xC2\xBE" => "\xC5\xBE",
        "\xC3\x85\xE2\x80\x99" => "\xC5\x92",
        "\xC3\x85\xE2\x80\x9C" => "\xC5\x93",
        "\xC3\x86\xE2\x80\x99" => "\xC6\x92",
        "\xC3\x8B\xC5\x93" => "\xCB\x9C",
        "\xC3\x8B\xE2\x80\xA0" => "\xCB\x86",
        "\xC3\xA2\xE2\x80\x9A\xC2\xAC" => "\xE2\x82\xAC",
        "\xC3\xA2\xE2\x80\x9E\xC2\xA2" => "\xE2\x84\xA2",
        "\xC3\xA2\xE2\x82\xAC\xC2\x9D" => "\xE2\x80\x9D",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA0" => "\xE2\x80\xA0",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA1" => "\xE2\x80\xA1",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA2" => "\xE2\x80\xA2",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA6" => "\xE2\x80\xA6",
        "\xC3\xA2\xE2\x82\xAC\xC2\xB0" => "\xE2\x80\xB0",
        "\xC3\xA2\xE2\x82\xAC\xC2\xB9" => "\xE2\x80\xB9",
        "\xC3\xA2\xE2\x82\xAC\xC2\xBA" => "\xE2\x80\xBA",
        "\xC3\xA2\xE2\x82\xAC\xC5\x93" => "\xE2\x80\x9C",
        "\xC3\xA2\xE2\x82\xAC\xC5\xA1" => "\xE2\x80\x9A",
        "\xC3\xA2\xE2\x82\xAC\xC5\xBE" => "\xE2\x80\x9E",
        "\xC3\xA2\xE2\x82\xAC\xCB\x9C" => "\xE2\x80\x98",
        "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C" => "\xE2\x80\x93",
        "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D" => "\xE2\x80\x94",
        "\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2" => "\xE2\x80\x99",
    ];

    /**
     * Cleans up instances where a UTF-8 string has been incorrectly re-encoded to UTF-8 from ISO-8859-1/CP1252.
     *
     * Only one level of double encoding is undone per call: the replacement is a
     * single strtr() pass over the mapping table, and strtr() does not rescan
     * text it has already replaced.
     *
     * @see http://www.i18nqa.com/debug/utf8-debug.html#dbg
     * @param string $str
     * @return string The input with every mapped sequence replaced, or the input unchanged when none can be present.
     */
    public static function dedupUTF8($str) {
        // Cheapest possible rejection: every mapping key starts with 0xC3.
        if (strpos($str, "\xC3") === false) {
            return $str;
        }

        // Two-byte prefixes shared by all mapping keys; if none occurs, the
        // (much more expensive) strtr() over the full table cannot match.
        $prefixes = ["\xC3\x83", "\xC3\x82", "\xC3\xA2", "\xC3\x85", "\xC3\x8B", "\xC3\x86"];
        foreach ($prefixes as $prefix) {
            if (strpos($str, $prefix) !== false) {
                return strtr($str, self::$duped_utf8_mapping);
            }
        }

        return $str;
    }

    /**
     * Converts various "extended" Unicode characters to ASCII equivalents.
     *
     * Spaces are normalised first, then smart characters, then curly quotes.
     *
     * @see self::convertCurlyQuotes
     * @see self::convertSmartChars
     * @see self::convertSpaces
     * @param string $str
     * @return string
     */
    public static function convertToSimpleChars($str) {
        return self::convertCurlyQuotes(
            self::convertSmartChars(
                self::convertSpaces($str)
            )
        );
    }

    /**
     * Converts curly (aka "smart" or "typographic") quotes to straight quotes as found in ASCII.
     *
     * @param string $str
     * @return string
     */
    public static function convertCurlyQuotes($str) {
        return strtr(
            $str,
            [
                "\xC2\xB4" => '\'', // acute accent, common on European keyboards
                "\xE2\x80\x98" => '\'', // left single quote
                "\xE2\x80\x99" => '\'', // right single quote
                "\xE2\x80\x9C" => '"', // left double quote
                "\xE2\x80\x9D" => '"', // right double quote
            ]
        );
    }

    /**
     * Converts various characters from "smart" versions in Unicode back to ASCII lookalikes.
     *
     * @param string $str
     * @return string
     */
    public static function convertSmartChars($str) {
        return strtr(
            $str,
            [
                "\xE2\x80\xA6" => '...', // horizontal ellipsis
                "\xE2\x80\x90" => '-', // hyphen
                "\xE2\x80\x91" => '-', // non-breaking hyphen
                "\xE2\x80\x92" => '-', // figure dash
                "\xE2\x80\x93" => '-', // en dash
                "\xE2\x80\x94" => '--', // em dash
                "\xE2\x80\x95" => '--', // horizontal bar
                "\xC2\xB7" => '*', // mid dot
                "\xE2\x80\xA2" => '*', // bullet
                "\xC2\xB0" => 'o', // degree symbol
            ]
        );
    }

    /**
     * Converts all "space" characters in Unicode to a standard space.
     *
     * The pattern uses the /u modifier, so PCRE rejects subjects that are not
     * valid UTF-8 and preg_replace() returns null. In that case the input is
     * returned unchanged rather than being lost.
     *
     * @see http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Zs:]
     * @param string $str
     * @return string
     */
    public static function convertSpaces($str) {
        $converted = preg_replace('/\p{Zs}/u', ' ', $str);

        // null signals a PCRE failure (e.g. malformed UTF-8); keep the original text.
        return $converted === null ? $str : $converted;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment