Skip to content

Instantly share code, notes, and snippets.

@bg5sbk
Last active December 30, 2015 16:29
Show Gist options
  • Save bg5sbk/7854753 to your computer and use it in GitHub Desktop.
Save bg5sbk/7854753 to your computer and use it in GitHub Desktop.
Test whether the text is UTF-8 encoded, using PHP.
<?php
//
// Test whether the text is UTF-8 encoded
//
// The text is scanned byte by byte. Every well-formed character (ASCII byte
// or lead byte followed by the right number of 10xxxxxx continuation bytes)
// counts as "good"; every malformed sequence counts as "bad". Overlong
// encodings and surrogate code points are not rejected, and the legacy
// 5- and 6-byte forms are still accepted.
//
// Result:
// 1 - Has BOM head (text starts with EF BB BF; the rest is not inspected)
// 2 - Pure UTF-8 content (no malformed sequence found; also for empty text)
// 3 - More likely to be UTF-8 content (good characters outnumber bad ones)
// 4 - Less likely to be UTF-8 content
//
function utf8_check($text)
{
    $utf8_bom = chr(0xEF) . chr(0xBB) . chr(0xBF);

    // check BOM head
    // Compare the leading bytes directly: strstr() returns a string or false,
    // never 0, so it cannot be used to test for a prefix.
    if (strncmp($text, $utf8_bom, 3) === 0) {
        return 1;
    }

    $text_len = strlen($text);

    // UTF-8 format:
    // < 0x80 | 0xxxxxxx
    // < 0xC0 | 10xxxxxx (continuation byte, never valid as a lead byte)
    // < 0xE0 | 110xxxxx 10xxxxxx
    // < 0xF0 | 1110xxxx 10xxxxxx 10xxxxxx
    // < 0xF8 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // < 0xFC | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // < 0xFE | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    $bad = 0;
    $good = 0;
    $need_check = 0; // continuation bytes required by the current lead byte
    $have_check = 0; // continuation bytes examined so far for that lead byte

    for ($i = 0; $i < $text_len; $i++) {
        $c = ord($text[$i]);

        if ($need_check > 0) {
            $have_check++;

            // 10000000 ~ 10111111
            if (($c & 0xC0) !== 0x80) {
                // Malformed sequence: rewind so that, after the loop's own
                // increment, scanning resumes at the byte right after the
                // lead byte and every following byte is re-examined.
                $i -= $have_check;
                $need_check = 0;
                $have_check = 0;
                $bad++;
            } elseif ($need_check === $have_check) {
                // All expected continuation bytes were present.
                $need_check = 0;
                $have_check = 0;
                $good++;
            }
            continue;
        }

        if ($c < 0x80) {        // 0xxxxxxx
            $good++;
        } elseif ($c < 0xC0) {  // 10xxxxxx - continuation byte without a lead byte
            $bad++;
        } elseif ($c < 0xE0) {  // 110xxxxx
            $need_check = 1;
        } elseif ($c < 0xF0) {  // 1110xxxx
            $need_check = 2;
        } elseif ($c < 0xF8) {  // 11110xxx
            $need_check = 3;
        } elseif ($c < 0xFC) {  // 111110xx
            $need_check = 4;
        } elseif ($c < 0xFE) {  // 1111110x
            $need_check = 5;
        } else {                // 0xFE / 0xFF never appear in UTF-8
            $bad++;
        }
    }

    // The text ended in the middle of a multi-byte sequence.
    if ($need_check > 0) {
        $bad++;
    }

    if ($bad === 0) {
        return 2;
    }
    if ($good > $bad) {
        return 3;
    }
    return 4;
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment