Created
August 30, 2010 10:10
-
-
Save laurynas/557259 to your computer and use it in GitHub Desktop.
Convert escaped html to utf8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Encodes a single Unicode code point as a UTF-8 byte string.
 *
 * Adapted from a php.net user note.
 *
 * @param int|float|string $num Code point as a number or numeric string
 *                              (floats occur when hexdec() overflows an int).
 * @return string The 1- to 4-byte UTF-8 sequence, or '' when $num is not a
 *                Unicode scalar value: non-numeric, negative, a UTF-16
 *                surrogate (0xD800-0xDFFF), or greater than 0x10FFFF.
 */
function code2utf($num)
{
    // Range-check before casting: an out-of-range float must never reach (int).
    if (!is_numeric($num) || $num < 0 || $num > 0x10FFFF) {
        return '';
    }

    $num = (int) $num;

    // Surrogate halves are not characters and have no valid UTF-8 encoding.
    if ($num >= 0xD800 && $num <= 0xDFFF) {
        return '';
    }

    if ($num < 0x80) {
        // 0xxxxxxx
        return chr($num);
    }

    if ($num < 0x800) {
        // 110xxxxx 10xxxxxx
        return chr(0xC0 | ($num >> 6))
            . chr(0x80 | ($num & 0x3F));
    }

    if ($num < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        return chr(0xE0 | ($num >> 12))
            . chr(0x80 | (($num >> 6) & 0x3F))
            . chr(0x80 | ($num & 0x3F));
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr(0xF0 | ($num >> 18))
        . chr(0x80 | (($num >> 12) & 0x3F))
        . chr(0x80 | (($num >> 6) & 0x3F))
        . chr(0x80 | ($num & 0x3F));
}
/**
 * Converts HTML character references in a string to UTF-8.
 *
 * Handles hexadecimal references (&#x41;), decimal references (&#65;) and
 * named entities (&eacute;). All three forms are resolved in a single pass,
 * so already-decoded output is never decoded a second time (for example
 * "&#38;lt;" becomes "&lt;", not "<").
 *
 * The earlier implementation relied on the preg_replace() "e" modifier, which
 * was removed in PHP 7, and on utf8_encode(), which is deprecated as of
 * PHP 8.2. Both are replaced here.
 *
 * @param string $string Text that may contain HTML character references.
 * @return string The text with recognised references replaced by UTF-8.
 *                Unknown named entities are left untouched. Numeric
 *                references are replaced by whatever code2utf() returns for
 *                that code point. If the regex engine reports an error, the
 *                input is returned unchanged.
 */
function html_entity_decode_utf8($string)
{
    static $trans_tbl;

    // Build the "&name;" => UTF-8 lookup table once and reuse it afterwards.
    if (!isset($trans_tbl)) {
        // With an explicit UTF-8 charset the HTML 4.01 table is returned
        // already encoded as UTF-8 (Latin-1, Greek, math, arrows, punctuation).
        $trans_tbl = array_flip(
            get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
        );

        // &apos; is not an HTML 4.01 entity but is common in real-world markup.
        // NOTE(review): the first hand-written table key was unreadable in the
        // reviewed copy; it mapped to code 39 (apostrophe) - confirm it was &apos;.
        $trans_tbl['&apos;'] = "'";
    }

    $named = $trans_tbl;

    $decoded = preg_replace_callback(
        // 1: hex digits, 2: decimal digits, 3: entity name. Leading zeros are
        // skipped. The "i" flag accepts "&#X41;"; names are looked up
        // case-sensitively because &Alpha; and &alpha; are different entities.
        '~&(?:#x0*([0-9a-f]+)|#0*([0-9]+)|([a-z][a-z0-9]*));~i',
        function ($m) use ($named) {
            if ($m[1] !== '') {
                return code2utf(hexdec($m[1]));
            }

            if (isset($m[2]) && $m[2] !== '') {
                return code2utf((int) $m[2]);
            }

            // Named entity: leave it as-is when it is not in the table.
            return isset($named[$m[0]]) ? $named[$m[0]] : $m[0];
        },
        $string
    );

    // preg_replace_callback() yields null on a PCRE error; fall back to the input.
    return $decoded === null ? $string : $decoded;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment