Created
February 23, 2025 03:15
-
-
Save adsr/834dfc3135bcb6a1d29b498eb94ba101 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
declare(strict_types=1); | |
// Print data structures necessary for finding grapheme cluster breaks | |
// https://unicode.org/reports/tr29/ | |
/**
 * Print a C enum and a sorted range table for one Unicode property file.
 *
 * Reads the data at $url, keeps lines shaped like
 * `START[..END] ; PROPERTY # comment`, and echoes:
 *   - `enum uni_prop_<prefix>` listing every distinct property name found,
 *   - `#define UNI_TABLE_<PREFIX>_COUNT` holding the number of ranges,
 *   - `static struct uni_table uni_table_<prefix>[]` ordered by range start.
 *
 * @param string      $prefix     Name fragment used in the generated identifiers.
 * @param string      $url        Location handed to file_get_contents().
 * @param string|null $filter     Optional PCRE; lines that do not match it are skipped.
 * @param string|null $prop_strip Optional PCRE; matches are blanked out of the
 *                                property field before the enum name is built.
 *
 * @throws RuntimeException If the data cannot be read, or no line yields a range.
 */
function print_prop_table_for_url($prefix, $url, $filter, $prop_strip) {
    // file_get_contents() returns false on failure; under strict_types that
    // would only surface as a TypeError from trim(), so report it explicitly.
    $raw = @file_get_contents($url);
    if ($raw === false) {
        $err = error_get_last();
        throw new RuntimeException(sprintf(
            'Unable to read %s: %s',
            $url,
            $err['message'] ?? 'unknown error'
        ));
    }

    $lines = explode("\n", trim($raw));
    $ranges = [];
    foreach ($lines as $line) {
        $m = [];
        // $m[1] = range start, $m[2] = optional range end, $m[3] = property field.
        if (!preg_match('/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+([^#]+)/', $line, $m)) continue;
        if ($filter && !preg_match($filter, $line)) continue;

        $prop = $m[3];
        if ($prop_strip) $prop = preg_replace($prop_strip, ' ', $prop);
        // Turn the remaining text into an UPPER_SNAKE_CASE identifier.
        $prop = preg_replace('/\W+/', ' ', $prop);
        $prop = trim(sprintf('UNI_PROP_%s %s', $prefix, $prop));
        $prop = preg_replace('/ +/', '_', strtoupper($prop));

        // A single code point is stored as a range whose end equals its start.
        $ranges[] = [ hexdec($m[1]), hexdec($m[2] ?: $m[1]), $prop ];
    }

    // An empty enum and an empty array initializer are not valid C, so stop
    // before emitting anything when nothing matched.
    if (!$ranges) {
        throw new RuntimeException(sprintf('No property ranges found in %s', $url));
    }

    $props = array_unique(array_map(fn ($e) => $e[2], $ranges));
    sort($props);
    echo "enum uni_prop_{$prefix} {\n";
    foreach ($props as $prop) {
        echo " $prop,\n";
    }
    echo "};\n\n";

    printf("#define UNI_TABLE_%s_COUNT %d\n", strtoupper($prefix), count($ranges));

    // The table is ordered by range start so it can be binary-searched.
    usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]);
    echo "static struct uni_table uni_table_{$prefix}[] = {\n";
    foreach ($ranges as $r) {
        printf(" { 0x%05x, 0x%05x, %s },\n", $r[0], $r[1], $r[2]);
    }
    echo "};\n\n";
}
// Tables to generate, in output order. Each entry is the argument list for
// print_prop_table_for_url(): prefix, source URL, line filter, property strip.
$uni_sources = [
    [
        'grapheme_break',
        'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt',
        null,
        null,
    ],
    [
        'incb',
        'http://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt',
        '/\bInCB\b/',
        '/InCB/',
    ],
    [
        'ext_picto',
        'http://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt',
        '/\bExtended_Pictographic\b/',
        '/Extended_Pictographic/',
    ],
];
foreach ($uni_sources as $uni_source) {
    print_prop_table_for_url(...$uni_source);
}
/* | |
// One contiguous run of code points that share a property value.
struct uni_table {
    uint32_t range_start; // first code point of the run (inclusive)
    uint32_t range_end;   // last code point of the run (inclusive)
    int prop;             // value reported for every code point in the run
};

// Binary-search `table` for the run that contains code point `c`.
//
// table:  array of runs, ordered by range_start and non-overlapping
// ntable: number of entries in `table`
// c:      code point to look up
//
// Returns the matching entry's `prop`, or -1 when `c` falls in no run
// (also when `table` is null or `ntable` is not positive).
int uni_prop_lookup(const struct uni_table *table, int ntable, uint32_t c) {
    if (!table) {
        return -1;
    }
    int lo = 0;
    int hi = ntable - 1;
    while (lo <= hi) {
        // Written as lo + (hi - lo) / 2 so the sum cannot overflow int.
        int mid = lo + (hi - lo) / 2;
        if (c < table[mid].range_start) {
            hi = mid - 1;
        } else if (c > table[mid].range_end) {
            lo = mid + 1;
        } else {
            return table[mid].prop;
        }
    }
    return -1;
}
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment