Skip to content

Instantly share code, notes, and snippets.

@adsr
Created February 23, 2025 03:15
Show Gist options
  • Save adsr/834dfc3135bcb6a1d29b498eb94ba101 to your computer and use it in GitHub Desktop.
Save adsr/834dfc3135bcb6a1d29b498eb94ba101 to your computer and use it in GitHub Desktop.
<?php
declare(strict_types=1);
// Print data structures necessary for finding grapheme cluster breaks
// https://unicode.org/reports/tr29/
/**
 * Read a Unicode Character Database property file and print C source for it.
 *
 * Emits, in order, to stdout:
 *   - `enum uni_prop_<prefix>` listing every distinct property name found,
 *   - `#define UNI_TABLE_<PREFIX>_COUNT` with the number of ranges,
 *   - `static struct uni_table uni_table_<prefix>[]` with one
 *     `{ start, end, property }` row per range, sorted by range start.
 *
 * @param string      $prefix     Table name; used in the enum/array names and
 *                                (upper-cased) in every enumerator name.
 * @param string      $url        URL or path readable by file_get_contents().
 * @param string|null $filter     Optional regex; only data lines matching it are kept.
 * @param string|null $prop_strip Optional regex; matches are blanked out of the
 *                                property field before it is turned into an identifier.
 * @return void
 * @throws RuntimeException If the source cannot be read or yields no ranges.
 */
function print_prop_table_for_url($prefix, $url, $filter, $prop_strip) {
    // Check the read explicitly: under strict_types a `false` result would
    // otherwise surface as an unrelated-looking TypeError from trim().
    $data = @file_get_contents($url);
    if ($data === false) {
        $err = error_get_last();
        throw new RuntimeException(sprintf(
            'Failed to read %s: %s',
            $url,
            $err['message'] ?? 'unknown error'
        ));
    }

    $ranges = [];
    foreach (explode("\n", trim($data)) as $line) {
        // Data lines look like "0600..0605 ; Prepend # comment" or "000D ; CR # comment".
        $m = [];
        if (!preg_match('/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+([^#]+)/', $line, $m)) {
            continue;
        }
        if ($filter && !preg_match($filter, $line)) {
            continue;
        }

        // Turn the property field into an upper-case C identifier.
        $prop = $m[3];
        if ($prop_strip) {
            $prop = preg_replace($prop_strip, ' ', $prop);
        }
        $prop = preg_replace('/\W+/', ' ', $prop);
        $prop = trim(sprintf('UNI_PROP_%s %s', $prefix, $prop));
        $prop = preg_replace('/ +/', '_', strtoupper($prop));

        // A single code point has no "..end" part; treat it as a one-element range.
        $range_end = $m[2] !== '' ? $m[2] : $m[1];
        $ranges[] = [ hexdec($m[1]), hexdec($range_end), $prop ];
    }

    // An empty enum / empty array initializer is not valid C, so refuse to
    // emit anything when the input produced no usable rows.
    if (!$ranges) {
        throw new RuntimeException(sprintf('No property ranges found in %s', $url));
    }

    $props = array_unique(array_map(fn ($e) => $e[2], $ranges));
    sort($props);
    echo "enum uni_prop_{$prefix} {\n";
    foreach ($props as $prop) {
        echo " $prop,\n";
    }
    echo "};\n\n";

    printf("#define UNI_TABLE_%s_COUNT %d\n", strtoupper($prefix), count($ranges));

    // Rows are ordered by range start so the table can be binary-searched.
    usort($ranges, fn ($a, $b) => $a[0] <=> $b[0]);
    echo "static struct uni_table uni_table_{$prefix}[] = {\n";
    foreach ($ranges as $r) {
        printf(" { 0x%05x, 0x%05x, %s },\n", $r[0], $r[1], $r[2]);
    }
    echo "};\n\n";
}
// One row per generated table:
// [ table prefix, UCD source URL, line filter regex or null, property-strip regex or null ]
$uni_prop_sources = [
    [
        'grapheme_break',
        'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt',
        null,
        null,
    ],
    [
        'incb',
        'http://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt',
        '/\bInCB\b/',
        '/InCB/',
    ],
    [
        'ext_picto',
        'http://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt',
        '/\bExtended_Pictographic\b/',
        '/Extended_Pictographic/',
    ],
];
// Emit the tables in the order listed above.
foreach ($uni_prop_sources as $uni_prop_source) {
    print_prop_table_for_url(...$uni_prop_source);
}
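/*
Reference C code for consuming the generated tables: the struct each table row
is emitted as, and a binary-search lookup over the rows (sorted by range start).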
struct uni_table {
uint32_t range_start;
uint32_t range_end;
int prop;
};
int uni_prop_lookup(struct uni_table *table, int ntable, uint32_t c) {
int lo = 0;
int hi = ntable - 1;
while (lo <= hi) {
int i = (lo + hi) / 2;
if (c < table[i].range_start) {
hi = i - 1;
} else if (c > table[i].range_end) {
lo = i + 1;
} else {
return table[i].prop;
}
}
return -1;
}
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment