-
-
Save jfcherng/02c013631a3be12164f113dada0cb252 to your computer and use it in GitHub Desktop.
中文筆劃排序(只適用繁體中文) for PHP 7.2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Author: [email protected] | |
* Desc: 為了解決中文筆劃排序的問題(只適用繁體中文) | |
* php 可以直接執行 cht_strokesort. | |
*/ | |
/**
 * Stroke-count ordering for Traditional Chinese text.
 *
 * Each character is converted from UTF-8 to Big5 and looked up in a table of
 * Big5 code ranges that share a stroke count. Characters found in the table
 * are keyed as STROKE_BASE + strokes; every other character is keyed by the
 * value of its first byte, so such characters sort ahead of the Chinese
 * characters found in the table.
 *
 * Requires the mbstring extension.
 */
final class ChtStroke
{
    public const BIG5_HB_MIN = 0x81;  // lowest valid Big5 lead (high) byte
    public const BIG5_HB_MAX = 0xfe;  // highest valid Big5 lead (high) byte
    public const BIG5_LB1_MIN = 0x40; // lowest trail (low) byte, first range
    public const BIG5_LB1_MAX = 0x7e; // highest trail (low) byte, first range
    public const BIG5_LB2_MIN = 0xa1; // lowest trail (low) byte, second range
    public const BIG5_LB2_MAX = 0xfe; // highest trail (low) byte, second range

    /**
     * Offset added to a stroke count to build its sort key, so that characters
     * with stroke information sort after characters keyed by their byte value.
     */
    private const STROKE_BASE = 50000;

    /**
     * Parsed stroke table, built on first use: a list of
     * [strokes, first Big5 code, last Big5 code] in declaration order.
     *
     * @var array<int, int[]>|null
     */
    private static $strokeTable;

    /**
     * Analyse (and, unless told otherwise, sort) a list of strings by stroke count.
     *
     * Every input value is passed through urldecode() first; the decoded
     * value is what gets analysed and returned.
     *
     * @param array $str_arr  The strings to analyse; keys are reported back as 'index_old'
     * @param bool  $dontSort When true the entries are returned in input order
     *
     * @return array[] One entry per input string with the keys
     *                 'string' (decoded value), 'index_old' (original key),
     *                 'strokes' (stroke count per character, 0 when unknown)
     *                 and 'ords' (sort key per character)
     */
    public static function chtStrokesort(array $str_arr, bool $dontSort = false): array
    {
        $ord_arr = [];

        foreach ($str_arr as $key => $value) {
            $value = \urldecode($value);
            [$strokes, $ords] = self::analyse($value);

            $ord_arr[] = [
                'string' => $value,
                'index_old' => $key,
                'strokes' => $strokes,
                'ords' => $ords,
            ];
        }

        if (!$dontSort) {
            \usort($ord_arr, self::uDictArrayCompare('ords'));
        }

        return $ord_arr;
    }

    /**
     * Sort the array by the stroke information of one of its elements' columns.
     *
     * The array is sorted in place and re-indexed by usort(). A temporary
     * "__stroke__<column>" entry is added to every row while sorting and is
     * removed again before returning.
     *
     * @param array[] $array  The array
     * @param string  $column The column name to be used by stroke sorting
     */
    public static function chtStrokeSortByColumn(array &$array, string $column): void
    {
        $tmpColumn = "__stroke__{$column}";

        // insert stroke information into the source array
        foreach ($array as &$item) {
            $item[$tmpColumn] = self::chtOrds($item[$column]);
        }
        unset($item);

        // sort the array by the inserted stroke information
        \usort($array, self::uDictArrayCompare($tmpColumn));

        // remove the inserted stroke information; this has to go through a
        // reference, unsetting on a by-value loop copy leaves the rows untouched
        foreach ($array as &$item) {
            unset($item[$tmpColumn]);
        }
        unset($item);
    }

    /**
     * Build the list of per-character sort keys for a UTF-8 string.
     *
     * @param string $str The UTF-8 string
     *
     * @return int[] STROKE_BASE + strokes for characters found in the stroke
     *               table, otherwise ord() of the character (its first byte)
     */
    public static function chtOrds(string $str): array
    {
        return self::analyse($str)[1];
    }

    /**
     * Create a usort() callback that orders rows by the list stored under $key.
     *
     * Lists are compared element by element; when one list is a prefix of the
     * other, the shorter one sorts first.
     *
     * @param string $key The row key holding the list of sort keys
     */
    public static function uDictArrayCompare(string $key): Closure
    {
        return static function (array $a, array $b) use ($key): int {
            $left = $a[$key];
            $right = $b[$key];

            if ($left === $right) {
                return 0;
            }

            $leftCount = \count($left);
            for ($i = 0; $i < $leftCount; ++$i) {
                // the right-hand list ran out first, so it is the shorter one
                if (!isset($right[$i])) {
                    return 1;
                }

                if ($left[$i] > $right[$i]) {
                    return 1;
                }

                if ($left[$i] < $right[$i]) {
                    return -1;
                }
            }

            // every element of the left-hand list matched: it sorts first
            return -1;
        };
    }

    /**
     * Compute the stroke counts and the sort keys of every character in a string.
     *
     * @param string $str The UTF-8 string
     *
     * @return array[] A pair [stroke counts, sort keys], each with one entry per character
     */
    private static function analyse(string $str): array
    {
        $strokes = $ords = [];
        $length = \mb_strlen($str, 'UTF-8');

        for ($i = 0; $i < $length; ++$i) {
            $char = \mb_substr($str, $i, 1, 'UTF-8');
            $stroke = self::getStringStroke($char);

            $strokes[] = $stroke;
            // characters without stroke information are ordered by ord();
            // note that ord() only looks at the first byte of the character
            $ords[] = $stroke > 0 ? self::STROKE_BASE + $stroke : \ord($char);
        }

        return [$strokes, $ords];
    }

    /**
     * Look up the stroke count of the Big5 character at the start of $str.
     *
     * @param string $str A Big5 encoded string; only its first two bytes are used
     *
     * @return int|null The stroke count, or null when the first two bytes are
     *                  not a Big5 lead/trail pair covered by the table
     */
    private static function big5Stroke(string $str): ?int
    {
        if (\strlen($str) < 2) {
            return null;
        }

        $lead = $str[0];
        $trail = $str[1];

        if (!self::big5IsHb($lead) || !self::big5IsLb($trail)) {
            return null;
        }

        $code = (\ord($lead) << 8) | \ord($trail);

        // rows are checked in declaration order, the first matching range wins
        foreach (self::strokeTable() as [$strokes, $first, $last]) {
            if ($first <= $code && $code <= $last) {
                return $strokes;
            }
        }

        return null;
    }

    /**
     * Get the stroke count of the first character of a UTF-8 string.
     *
     * @param string $str The UTF-8 string
     *
     * @return int The stroke count, 0 when it is unknown
     */
    private static function getStringStroke(string $str): int
    {
        return (int) self::big5Stroke(self::utf8ToBig5($str));
    }

    /**
     * Tell whether the first byte of $c is within the Big5 lead byte range.
     */
    private static function big5IsHb(string $c): bool
    {
        $byte = \ord($c);

        return self::BIG5_HB_MIN <= $byte && $byte <= self::BIG5_HB_MAX;
    }

    /**
     * Tell whether the first byte of $c is within one of the two Big5 trail byte ranges.
     */
    private static function big5IsLb(string $c): bool
    {
        $byte = \ord($c);

        if (self::BIG5_LB1_MIN <= $byte && $byte <= self::BIG5_LB1_MAX) {
            return true;
        }

        return self::BIG5_LB2_MIN <= $byte && $byte <= self::BIG5_LB2_MAX;
    }

    /**
     * Convert a UTF-8 string to Big5, one character at a time.
     *
     * ASCII bytes are copied as they are. Bytes that cannot start a UTF-8
     * sequence (0x80-0xBF and 0xF8 and above) are dropped. When the Big5
     * conversion of a character yields nothing, its HTML entity is used instead.
     *
     * @param string $utf8_str The UTF-8 string
     */
    private static function utf8ToBig5(string $utf8_str): string
    {
        $len = \strlen($utf8_str);
        $big5_str = '';
        $i = 0;

        while ($i < $len) {
            $byte = $utf8_str[$i];
            $lead = \ord($byte);

            if ($lead < 0x80) {
                $big5_str .= $byte;
                ++$i;

                continue;
            }

            if ($lead >= 0xc0 && $lead < 0xe0) {
                $width = 2;
            } elseif ($lead >= 0xe0 && $lead < 0xf0) {
                $width = 3;
            } elseif ($lead >= 0xf0 && $lead < 0xf8) {
                $width = 4;
            } else {
                // not a valid lead byte, skip it
                ++$i;

                continue;
            }

            // the fallback works on the very same bytes as the Big5 attempt,
            // whatever the width of the sequence is
            $sequence = \substr($utf8_str, $i, $width);
            $converted = \mb_convert_encoding($sequence, 'Big5', 'UTF-8');

            if ($converted === '' || $converted === false) {
                // NOTE(review): 'HTML-ENTITIES' is reported as deprecated by
                // newer PHP releases - confirm before upgrading
                $converted = \mb_convert_encoding($sequence, 'HTML-ENTITIES', 'UTF-8');
            }

            $big5_str .= $converted;
            $i += $width;
        }

        return $big5_str;
    }

    /**
     * Get the parsed stroke table, parsing the raw data on first use only.
     *
     * @return array<int, int[]> List of [strokes, first Big5 code, last Big5 code]
     */
    private static function strokeTable(): array
    {
        if (self::$strokeTable === null) {
            $table = [];

            foreach (self::getStrokeData() as $line) {
                $fields = \explode(' ', \trim($line));

                if (\count($fields) < 3) {
                    continue;
                }

                $table[] = [(int) $fields[0], \hexdec($fields[1]), \hexdec($fields[2])];
            }

            self::$strokeTable = $table;
        }

        return self::$strokeTable;
    }

    /**
     * Get the raw stroke table.
     *
     * Each line reads "<strokes> <first Big5 code> <last Big5 code>" with the
     * codes in hexadecimal. Lines are checked from top to bottom, so a range
     * that is fully covered by earlier lines has no effect.
     *
     * @return string[] The lines of the table
     */
    private static function getStrokeData(): array
    {
        return \explode(
            "\n",
            <<<'EOT'
1 a440 a441
2 a442 a453
3 a454 a47e
4 a4a1 a4fd
5 a4fe a5df
6 a5e0 a6e9
7 a6ea a8c2
8 a8c3 ab44
9 ab45 adbb
10 adbc b0ad
11 b0ae b3c2
12 b3c3 b6c3
13 b6c4 b9ab
14 b9ac bbf4
15 bbf5 bea6
16 bea7 c074
17 c075 c24e
18 c24f c35e
19 c35f c454
20 c455 c4d6
21 c4d7 c56a
22 c56b c5c7
23 c5c8 c5f0
24 c5f1 c654
25 c655 c664
26 c665 c66b
27 c66c c675
28 c676 c67a
29 c67b c67e
2 c940 c944
3 c945 c94c
4 c94d c95c
5 c95d c9aa
6 c9ab ca59
7 ca5a cbb0
8 cbb1 cddc
9 cddd d0c7
10 d0c8 d44a
11 d44b d850
12 d851 dcb0
13 dcb1 e0ef
14 e0f0 e4e5
15 e4e6 e8f3
16 e8f4 ecb8
17 ecb9 efb6
18 efb7 f1ea
19 f1eb f3fc
20 f3fd f5bf
21 f5c0 f6d5
22 f6d6 f7cf
23 f7d0 f8a4
24 f8a5 f8ed
25 f8e9 f96a
26 f96b f9a1
27 f9a2 f9b9
28 f9ba f9c5
29 f9c6 f9dc
9 f9da f9da
12 f9db f9db
13 f9d6 f9d8
15 f9dc f9dc
16 f9d9 f9d9
30 c67b c67d
30 f9cc f9cf
31 f9c6 f9c6
31 f9d0 f9d0
32 f9d1 f9d1
33 c67e c67e
33 f9d2 f9d2
34 f9d3 f9d3
36 f9d4 f9d5
EOT
        );
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment