Last active
December 30, 2015 16:29
-
-
Save bg5sbk/7854753 to your computer and use it in GitHub Desktop.
Test whether the text is UTF-8 encoding by PHP.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
//
// Test whether the text is UTF-8 encoded.
//
// Result:
// 1 - Has a BOM header
// 2 - Pure UTF-8 content
// 3 - More likely to be UTF-8 content
// 4 - Less likely to be UTF-8 content
//
/**
 * Heuristically classify a byte string as UTF-8 or not.
 *
 * The text is scanned byte by byte. Every well-formed character (a single
 * ASCII byte, or a lead byte followed by the right number of 10xxxxxx
 * continuation bytes) is counted as "good"; every malformed or truncated
 * sequence and every byte that cannot start a character is counted as "bad".
 *
 * The legacy 5- and 6-byte forms (lead bytes 0xF8-0xFD) are accepted, matching
 * the format table below; overlong encodings and surrogates are not rejected.
 *
 * @param string $text Raw bytes to inspect.
 *
 * @return int 1 - the text starts with the UTF-8 BOM (the rest is not scanned)
 *             2 - no bad sequence was found (an empty string also yields 2)
 *             3 - more good characters than bad sequences
 *             4 - otherwise
 */
function utf8_check($text)
{
    $utf8_bom = "\xEF\xBB\xBF";

    // Check for the BOM at the very start of the text. strncmp() is used
    // because strstr() returns a string or false and can never equal 0.
    if (strncmp($text, $utf8_bom, 3) === 0) {
        return 1;
    }

    $text_len = strlen($text);

    // UTF-8 format (lead byte range | byte layout):
    // < 0x80 | 0xxxxxxx
    // < 0xC0 | 10xxxxxx (continuation byte, invalid as the first byte)
    // < 0xE0 | 110xxxxx 10xxxxxx
    // < 0xF0 | 1110xxxx 10xxxxxx 10xxxxxx
    // < 0xF8 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // < 0xFC | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // < 0xFE | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    $bad = 0;
    $good = 0;
    $need_check = 0; // continuation bytes required by the current lead byte
    $have_check = 0; // continuation bytes examined so far for that lead byte

    for ($i = 0; $i < $text_len; $i++) {
        $c = ord($text[$i]);

        if ($need_check > 0) {
            $have_check++;

            if (($c & 0xC0) !== 0x80) {
                // Not a 10xxxxxx byte: the sequence is broken. Rewind so that,
                // after the loop's increment, scanning resumes at the byte
                // right after the lead byte.
                $i -= $have_check;
                $need_check = 0;
                $have_check = 0;
                $bad++;
            } elseif ($need_check === $have_check) {
                // All expected continuation bytes were present.
                $need_check = 0;
                $have_check = 0;
                $good++;
            }

            continue;
        }

        if ($c < 0x80) {          // 0xxxxxxx
            $good++;
        } elseif ($c < 0xC0) {    // 10xxxxxx: stray continuation byte
            $bad++;
        } elseif ($c < 0xE0) {    // 110xxxxx
            $need_check = 1;
        } elseif ($c < 0xF0) {    // 1110xxxx
            $need_check = 2;
        } elseif ($c < 0xF8) {    // 11110xxx
            $need_check = 3;
        } elseif ($c < 0xFC) {    // 111110xx
            $need_check = 4;
        } elseif ($c < 0xFE) {    // 1111110x
            $need_check = 5;
        } else {                  // 0xFE / 0xFF never appear in UTF-8
            $bad++;
        }
    }

    // The text ended in the middle of a multi-byte sequence.
    if ($need_check > 0) {
        $bad++;
    }

    if ($bad === 0) {
        return 2;
    }

    if ($good > $bad) {
        return 3;
    }

    return 4;
}
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment