Created
November 28, 2018 23:16
-
-
Save theraot/0d92d4f6c6e29e5cfe5572dbb5cbe9f2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * CC-BY 3.0 Alfonso J. Ramos (theraot)
 * UTF8
 */
/**
 * Static helpers for working with UTF-8 encoded byte strings without the
 * mbstring extension.
 *
 * All offsets named "index" are BYTE offsets into the string; all values named
 * "position", "start" or "length" are counted in CHARACTERS (code points).
 * Failure is reported through false/null return values, never by exceptions.
 */
final class UTF8
{
    //------------------------------------------------------------
    // Private (Class)
    //------------------------------------------------------------

    /**
     * Returns how many bytes a UTF-8 sequence occupies, judging only by its
     * lead byte.
     *
     * Only the bit pattern of the lead byte is inspected (0xxxxxxx, 110xxxxx,
     * 1110xxxx, 11110xxx). Lead bytes that can only start overlong or
     * out-of-range sequences (0xC0, 0xC1, 0xF5-0xF7) are therefore still
     * accepted here.
     *
     * @param int $ord Byte value as returned by ord().
     * @return int|false 1 to 4, or false for a continuation byte (10xxxxxx)
     *                   or any other byte that cannot start a sequence.
     */
    private static function CodePointLength($ord)
    {
        if (($ord >> 7) === 0)
        {
            return 1;
        }
        if (($ord >> 5) === 6)
        {
            return 2;
        }
        if (($ord >> 4) === 14)
        {
            return 3;
        }
        if (($ord >> 3) === 30)
        {
            return 4;
        }
        return false;
    }

    /**
     * Translates a character position into a byte index.
     *
     * For $position >= 0 the string is walked forward from byte index $after
     * and the byte index of the character $position characters later is
     * returned. For $position < 0 the string is walked backward from its end
     * ($after is ignored) and the byte index of the lead byte of the
     * abs($position)-th character from the end is returned.
     *
     * Continuation bytes are not validated here; only lead bytes are looked at.
     *
     * @param string $string   UTF-8 byte string.
     * @param int    $position Character offset (negative counts from the end).
     * @param int    $after    Byte index to start counting from (forward only).
     * @return int|false|null Byte index; strlen($string) when the position is
     *                        exactly one past the last character; false when
     *                        an invalid lead byte is met or the string is too
     *                        short (forward); null when the string is too
     *                        short (backward).
     */
    private static function CharacterIndex($string, $position, $after)
    {
        $strlen = strlen($string);
        if ($position < 0)
        {
            for ($index = $strlen - 1; $index >= 0; $index--)
            {
                $ord = ord($string[$index]);
                // Anything that is not a continuation byte (10xxxxxx) starts a character.
                if (($ord >> 6) !== 2)
                {
                    $position++;
                }
                if ($position === 0)
                {
                    return $index;
                }
            }
            return null;
        }
        $count = 0;
        for ($index = $after; $index < $strlen; $count++)
        {
            if ($count === $position)
            {
                return $index;
            }
            $add = self::CodePointLength(ord($string[$index]));
            if ($add === false)
            {
                return false;
            }
            $index += $add;
        }
        if ($count < $position)
        {
            return false;
        }
        // The requested position is exactly the end of the string.
        return $strlen;
    }

    //------------------------------------------------------------
    // Public (Class)
    //------------------------------------------------------------

    /**
     * UTF-8 aware replacement of chr.
     *
     * Encodes a code point as 1 to 4 bytes. Range boundaries are inclusive so
     * that U+007F, U+07FF, U+FFFF and U+10FFFF get their shortest (and only
     * valid) encoding. Surrogate code points (U+D800-U+DFFF) are not rejected.
     *
     * @param int $codepoint Code point to encode (converted with intval()).
     * @return string|null The encoded bytes, or null when $codepoint is
     *                     negative or greater than 0x10FFFF.
     */
    public static function Character(/*int*/ $codepoint)
    {
        $codepoint = intval($codepoint);
        if ($codepoint < 0)
        {
            return null;
        }
        if ($codepoint <= 0x7F)
        {
            return chr($codepoint);
        }
        if ($codepoint <= 0x7FF)
        {
            return chr(0xC0 | (($codepoint >> 6) & 0x1F))
                .chr(0x80 | ($codepoint & 0x3F));
        }
        if ($codepoint <= 0xFFFF)
        {
            return chr(0xE0 | (($codepoint >> 12) & 0x0F))
                .chr(0x80 | (($codepoint >> 6) & 0x3F))
                .chr(0x80 | ($codepoint & 0x3F));
        }
        if ($codepoint <= 0x10FFFF)
        {
            return chr(0xF0 | (($codepoint >> 18) & 0x07))
                .chr(0x80 | (($codepoint >> 12) & 0x3F))
                .chr(0x80 | (($codepoint >> 6) & 0x3F))
                .chr(0x80 | ($codepoint & 0x3F));
        }
        return null;
    }

    /**
     * Returns the single character that starts at byte index $index.
     *
     * @param string $string UTF-8 byte string.
     * @param int    $index  Byte index of the character's lead byte.
     * @return string The character's bytes, or '' when there is no character
     *                at $index or its lead byte is invalid.
     */
    public static function CharacterAt($string, $index)
    {
        $nextIndex = self::CharacterIndex($string, 1, $index);
        if ($nextIndex === false)
        {
            return '';
        }
        return substr($string, $index, $nextIndex - $index);
    }

    /**
     * Returns up to $length characters starting at byte index $index.
     *
     * @param string $string UTF-8 byte string.
     * @param int    $index  Byte index to start from.
     * @param int    $length Number of characters wanted.
     * @return string The requested characters; when fewer than $length
     *                characters remain, or an invalid lead byte is met, the
     *                whole remainder of the string from $index is returned.
     */
    public static function CharactersAt($string, $index, $length)
    {
        $nextIndex = self::CharacterIndex($string, $length, $index);
        if ($nextIndex === false)
        {
            return substr($string, $index);
        }
        return substr($string, $index, $nextIndex - $index);
    }

    /**
     * UTF-8 aware replacement of ord.
     *
     * Decodes the first character of $character. Continuation bytes are not
     * validated, only required to be present.
     *
     * @param string $character String whose first character is decoded.
     * @return int|false The code point, or false when the string is empty,
     *                   starts with an invalid lead byte, or is shorter than
     *                   the sequence its lead byte announces.
     */
    public static function CodePoint(/*string*/ $character)
    {
        $available = strlen($character);
        if ($available === 0)
        {
            return false;
        }
        $ord0 = ord($character[0]);
        $length = self::CodePointLength($ord0);
        // Refuse truncated sequences instead of reading missing offsets.
        if ($length === false || $length > $available)
        {
            return false;
        }
        switch ($length)
        {
            case 1:
                return $ord0;
            case 2:
                return ($ord0 - 192) * 64
                    + (ord($character[1]) - 128);
            case 3:
                return ($ord0 - 224) * 4096
                    + (ord($character[1]) - 128) * 64
                    + (ord($character[2]) - 128);
            case 4:
                return ($ord0 - 240) * 262144
                    + (ord($character[1]) - 128) * 4096
                    + (ord($character[2]) - 128) * 64
                    + (ord($character[3]) - 128);
            default:
                return false;
        }
    }

    /**
     * Lazily yields the characters of $string one at a time.
     *
     * Iteration stops silently at the first byte that is not a valid lead byte.
     *
     * @param string $string UTF-8 byte string.
     * @return Generator Yields each character as a string.
     */
    public static function Enumerate($string)
    {
        $strlen = strlen($string);
        for ($index = 0; $index < $strlen; )
        {
            $chr = self::CharacterAt($string, $index);
            if ($chr === '')
            {
                return;
            }
            $index += strlen($chr);
            yield $chr;
        }
    }

    /**
     * Tells whether $string consists solely of 7-bit ASCII bytes.
     *
     * Note the return value is a count, not a boolean: an empty string yields
     * 0, which is falsy. Compare against false with !== to treat the empty
     * string as ASCII.
     *
     * @param string $string Byte string to inspect.
     * @return int|false Number of characters when every byte is below 0x80,
     *                   false otherwise.
     */
    public static function IsASCII($string)
    {
        $strlen = strlen($string);
        for ($index = 0; $index < $strlen; $index++)
        {
            if (ord($string[$index]) > 0x7F)
            {
                return false;
            }
        }
        // Every byte is one character, so the byte count is the character count.
        return $strlen;
    }

    /**
     * Tells whether $string is structurally valid UTF-8 as judged by Length().
     *
     * @param string $string Byte string to inspect.
     * @return bool
     */
    public static function IsUTF8($string)
    {
        return self::Length($string) !== false;
    }

    /**
     * UTF-8 aware replacement of strlen.
     *
     * Checks that every lead byte is followed by the announced number of
     * continuation bytes (0x80-0xBF). Overlong forms and surrogates are not
     * detected.
     *
     * @param string $string UTF-8 byte string.
     * @return int|false Number of characters, or false when the string has an
     *                   invalid lead byte, a bad continuation byte, or a
     *                   sequence truncated by the end of the string.
     */
    public static function Length($string)
    {
        $strlen = strlen($string);
        $count = 0;
        for ($index = 0; $index < $strlen; $count++)
        {
            $add = self::CodePointLength(ord($string[$index]));
            // A sequence running past the end of the string is invalid; bail
            // out before touching offsets that do not exist.
            if ($add === false || $index + $add > $strlen)
            {
                return false;
            }
            for ($check = $index + 1; $check < $index + $add; $check++)
            {
                $ord = ord($string[$check]);
                if ($ord < 0x80 || $ord > 0xbf)
                {
                    return false;
                }
            }
            $index += $add;
        }
        return $count;
    }

    /**
     * UTF-8 aware replacement of str_split.
     *
     * @param string $string UTF-8 byte string.
     * @param int    $length Characters per segment; must be an integer >= 1.
     * @return array|false List of segments (the last one may be shorter), or
     *                     false after raising E_USER_WARNING when $length is
     *                     not a positive integer.
     */
    public static function Split(/*string*/ $string, /*int*/ $length = 1)
    {
        if (intval($length) !== $length || $length < 1)
        {
            trigger_error('The length of each segment must be greater than zero', E_USER_WARNING);
            return false;
        }
        $strlen = strlen($string);
        $result = [];
        $index = 0;
        while ($index < $strlen)
        {
            $block = self::CharactersAt($string, $index, $length);
            $result[] = $block;
            $index += strlen($block);
        }
        return $result;
    }

    /**
     * UTF-8 aware replacement of substr.
     *
     * @param string   $string UTF-8 byte string.
     * @param int      $start  Character offset; negative counts from the end
     *                         (clamped to the beginning when the string is
     *                         shorter).
     * @param int|null $length Number of characters; negative counts from the
     *                         end of the string; null means "to the end".
     * @return string|false The extracted part; false when $start lies beyond
     *                      the end of the string, an invalid lead byte is met
     *                      while locating $start, or a negative $length
     *                      cannot be resolved.
     */
    public static function Substr($string, $start, $length = null)
    {
        $startIndex = self::CharacterIndex($string, $start, 0);
        if ($startIndex === null)
        {
            // Negative start reaching before the beginning: clamp to 0.
            $startIndex = 0;
        }
        if ($startIndex === false)
        {
            return false;
        }
        if ($length === null)
        {
            return substr($string, $startIndex);
        }
        $endIndex = self::CharacterIndex($string, $length, $startIndex);
        if ($endIndex === null)
        {
            return false;
        }
        if ($endIndex === false)
        {
            // Fewer than $length characters remain: take everything left.
            return substr($string, $startIndex);
        }
        if ($endIndex < $startIndex)
        {
            return $start < 0 ? '' : false;
        }
        return substr($string, $startIndex, $endIndex - $startIndex);
    }

    //------------------------------------------------------------
    // Public (Constructor)
    //------------------------------------------------------------

    /**
     * Creating instances of this class is not allowed.
     *
     * The constructor only raises a user-level error through trigger_error();
     * it does not by itself stop the object from being created.
     */
    public function __construct()
    {
        trigger_error('Creating instances of '.__CLASS__.' is forbidden');
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment