Skip to content

Instantly share code, notes, and snippets.

@bg5sbk
Created December 9, 2013 06:34
Show Gist options
  • Save bg5sbk/7868237 to your computer and use it in GitHub Desktop.
Save bg5sbk/7868237 to your computer and use it in GitHub Desktop.
Get UTF-8 length of the text.
<?php
//
// Get the UTF-8 length (number of encoded characters) of the text.
//
// The text is scanned byte by byte: each lead byte announces how many
// continuation bytes (10xxxxxx) must follow, and a character is counted
// once all of its continuation bytes have been seen.
//
// Returns the number of characters, or false when the text is not UTF-8
// encoded: a continuation byte in lead position, a lead byte followed by
// something other than a continuation byte, a sequence cut off by the end
// of the text, or a byte of 0xFE / 0xFF.
//
// Note: the original 1-to-6-byte UTF-8 layout is accepted for backward
// compatibility. Overlong encodings, surrogates and values above U+10FFFF
// are NOT rejected; use a stricter validator if that matters.
//
function utf8_length($text)
{
    $text_len = strlen($text);
    $utf8_len = 0;

    // UTF-8 format (lead byte range | bit layout):
    //   < 0x80 | 0xxxxxxx
    //   < 0xC0 | 10xxxxxx (continuation byte, never a valid lead byte)
    //   < 0xE0 | 110xxxxx 10xxxxxx
    //   < 0xF0 | 1110xxxx 10xxxxxx 10xxxxxx
    //   < 0xF8 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    //   < 0xFC | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    //   < 0xFE | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

    // Number of continuation bytes still expected for the current character.
    $pending = 0;

    for ($i = 0; $i < $text_len; $i++) {
        $c = ord($text[$i]);

        if ($pending > 0) {
            // Inside a multi-byte character: the top two bits must be "10".
            if (($c & 0xC0) !== 0x80) {
                return false;
            }
            // The character is complete once its last continuation byte is consumed.
            if (--$pending === 0) {
                $utf8_len++;
            }
            continue;
        }

        if ($c < 0x80) {          // 0xxxxxxx
            $utf8_len++;
        } else if ($c < 0xC0) {   // 10xxxxxx: stray continuation byte
            return false;
        } else if ($c < 0xE0) {   // 110xxxxx
            $pending = 1;
        } else if ($c < 0xF0) {   // 1110xxxx
            $pending = 2;
        } else if ($c < 0xF8) {   // 11110xxx
            $pending = 3;
        } else if ($c < 0xFC) {   // 111110xx
            $pending = 4;
        } else if ($c < 0xFE) {   // 1111110x
            $pending = 5;
        } else {                  // 0xFE / 0xFF never appear in UTF-8
            return false;
        }
    }

    // A lead byte whose continuation bytes never arrived means the text was
    // truncated in the middle of a character.
    if ($pending > 0) {
        return false;
    }

    return $utf8_len;
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment