Skip to content

Instantly share code, notes, and snippets.

@mwgamera
Created July 27, 2025 03:46
Show Gist options
  • Select an option

  • Save mwgamera/97cc4bc29307edbc30234b20f5351b37 to your computer and use it in GitHub Desktop.

Select an option

Save mwgamera/97cc4bc29307edbc30234b20f5351b37 to your computer and use it in GitHub Desktop.
๐‘–๐‘ฑ๐‘๐‘พ๐‘ฏ ๐‘•๐‘น๐‘‘๐‘ฆ๐‘™, ๐‘‘๐‘ฑ๐‘’ n+1. ยท๐‘๐‘ป๐‘คโ€™๐‘Ÿ UCA ๐‘ฅ๐‘ช๐‘ก๐‘ต๐‘ค ๐‘ฅ๐‘ฑ๐‘’๐‘• ๐‘ฆ๐‘‘ ๐‘ฅ๐‘น ๐‘ฉ๐‘ฏ๐‘ถ๐‘ฆ๐‘™ ๐‘ž๐‘จ๐‘ฏ ICU ๐‘น glibc ๐‘ค๐‘ด๐‘’๐‘ญ๐‘ค
#!/usr/bin/env perl
# ๐‘•๐‘น๐‘‘๐‘ฆ๐‘™ ๐‘–๐‘ฑ๐‘๐‘พ๐‘ฏ ๐‘•๐‘‘๐‘ฎ๐‘ฆ๐‘™๐‘Ÿ ๐‘ฟ๐‘Ÿ๐‘ฆ๐‘™ ๐‘ž ยท๐‘ฟ๐‘ฏ๐‘ฆ๐‘’๐‘ด๐‘› ๐‘’๐‘ฉ๐‘ค๐‘ฑ๐‘–๐‘ฉ๐‘ฏ ๐‘จ๐‘ค๐‘œ๐‘ผ๐‘ฆ๐‘ž๐‘ฉ๐‘ฅ ๐‘ฅ๐‘ช๐‘ก๐‘ต๐‘ค
# ๐‘•๐‘ด ๐‘ž๐‘‘ ๐‘ฆ๐‘‘ ๐‘’๐‘จ๐‘ฏ ๐‘๐‘ค๐‘ฑ ๐‘ฏ๐‘ฒ๐‘•๐‘ค๐‘ฆ ๐‘ข๐‘ฆ๐‘ž ๐‘ฅ๐‘ฆ๐‘’๐‘•๐‘‘ ๐‘•๐‘’๐‘ฎ๐‘ฆ๐‘๐‘‘๐‘• ๐‘ฏ ๐‘ณ๐‘ž๐‘ผ ๐‘‘๐‘ฑ๐‘ค๐‘ผ๐‘ฆ๐‘™
# klg, Jul 2025
package Unicode::Collate::Shavian;
use strict;
use base 'Unicode::Collate';
use Carp;
our $version = Unicode::Collate->new->version;
our $entry = do {
my $start = { # ๐‘ฆ๐‘ฏ๐‘ฆ๐‘–๐‘ฉ๐‘ค ๐‘ข๐‘ฑ๐‘‘
'13.0.0' => 0x4B8F,
'15.1.0' => 0x4D7A,
}->{$version};
warn "Unknown DUCET version" unless $start;
unless (defined $start) {
for (@INC) {
next unless -f (my $fn = "$_/Unicode/Collate/allkeys.txt");
open my $f, '<:raw', $fn or die $!;
while (<$f>) {
next if /^[%#]/;
if (/^\@version (.*)$/) {
die "DUCET version mismatch: found $1, expected $version"
unless $version eq $1;
}
if (/^10450\b/) {
chomp, s/[%#].*\z//s, s/\s+//g;
die "Syntax error" unless
m!^[0-9A-F]+;\[(.)([0-9A-F]+)\.[^]]+\](.?)!i;
die "Unexpected sequence of collation elements" if $3 eq '[';
die "Unexpected variable collation element" if $1 eq '*';
die "Syntax error" unless $1 eq '.';
$start = hex $2;
last;
}
}
close $f or die $!;
last;
}
}
die "Unknown DUCET version" unless $start;
my @a = (0, map sprintf("%04X",$start+$_), 0..42);
<< "# EOF";
10450 ; [.$a[ 1].0020.0002] # SHAVIAN LETTER PEE
1045A ; [.$a[ 2].0020.0002] # SHAVIAN LETTER BAY
10451 ; [.$a[ 3].0020.0002] # SHAVIAN LETTER TEE
1045B ; [.$a[ 4].0020.0002] # SHAVIAN LETTER DAY
10452 ; [.$a[ 5].0020.0002] # SHAVIAN LETTER KEY
1045C ; [.$a[ 6].0020.0002] # SHAVIAN LETTER GAY
10453 ; [.$a[ 7].0020.0002] # SHAVIAN LETTER FEE
1045D ; [.$a[ 8].0020.0002] # SHAVIAN LETTER VIE
10454 ; [.$a[ 9].0020.0002] # SHAVIAN LETTER THAW
1045E ; [.$a[10].0020.0002] # SHAVIAN LETTER THEY
10455 ; [.$a[11].0020.0002] # SHAVIAN LETTER SEE
1045F ; [.$a[12].0020.0002] # SHAVIAN LETTER ZOO
10456 ; [.$a[13].0020.0002] # SHAVIAN LETTER SHE
10460 ; [.$a[14].0020.0002] # SHAVIAN LETTER ZHAY
10457 ; [.$a[15].0020.0002] # SHAVIAN LETTER CHEER
10461 ; [.$a[16].0020.0002] # SHAVIAN LETTER JAY
10458 ; [.$a[17].0020.0002] # SHAVIAN LETTER YE
10462 ; [.$a[18].0020.0002] # SHAVIAN LETTER WAY
10459 ; [.$a[19].0020.0002] # SHAVIAN LETTER ING
10463 ; [.$a[20].0020.0002] # SHAVIAN LETTER HAY
10464 ; [.$a[21].0020.0002] # SHAVIAN LETTER LEE
1046E ; [.$a[22].0020.0002] # SHAVIAN LETTER RAY
10465 ; [.$a[23].0020.0002] # SHAVIAN LETTER ME
1046F ; [.$a[24].0020.0002] # SHAVIAN LETTER NAY
10466 ; [.$a[25].0020.0002] # SHAVIAN LETTER IT
10470 ; [.$a[26].0020.0002] # SHAVIAN LETTER EASE
10467 ; [.$a[27].0020.0002] # SHAVIAN LETTER EGG
10471 ; [.$a[28].0020.0002] # SHAVIAN LETTER ACE
10468 ; [.$a[29].0020.0002] # SHAVIAN LETTER AT
10472 ; [.$a[30].0020.0002] # SHAVIAN LETTER ICE
10469 ; [.$a[31].0020.0002] # SHAVIAN LETTER AGO
10473 ; [.$a[32].0020.0002] # SHAVIAN LETTER US
1046A ; [.$a[33].0020.0002] # SHAVIAN LETTER OT
10474 ; [.$a[34].0020.0002] # SHAVIAN LETTER OWE
1046B ; [.$a[35].0020.0002] # SHAVIAN LETTER UUT
10475 ; [.$a[36].0020.0002] # SHAVIAN LETTER OOZE
1046C ; [.$a[37].0020.0002] # SHAVIAN LETTER OUT
10476 ; [.$a[38].0020.0002] # SHAVIAN LETTER OY
1046D ; [.$a[39].0020.0002] # SHAVIAN LETTER ARE*
10477 ; [.$a[40].0020.0002] # SHAVIAN LETTER AWE
# SHAVIAN LETTER AIR*
# SHAVIAN LETTER URGE
1047E ; [.$a[43].0020.0002] # SHAVIAN LETTER EAR*
10478 ; [.$a[39].0020.0003][.$a[22].0020.0002] # SHAVIAN LIGATURE ARE
10479 ; [.$a[40].0020.0003][.$a[22].0020.0002] # SHAVIAN LIGATURE OR
1047A ; [.$a[41].0020.0003][.$a[22].0020.0002] # SHAVIAN LIGATURE AIR
1047B ; [.$a[42].0020.0003][.$a[22].0020.0002] # SHAVIAN LIGATURE ERR
1047C ; [.$a[31].0020.0003][.$a[22].0020.0002] # SHAVIAN LIGATURE ARRAY
1047D ; [.$a[43].0020.0003][.$a[22].0020.0002] # SHAVIAN LIGATURE EAR
1047F ; [.$a[17].0020.0003][.$a[36].0020.0002] # SHAVIAN LIGATURE YEW
1047A FE00 ; [.$a[41].0020.0002] # SHAVIAN LETTER AIR* (Inter Alia hack)
1047B FE00 ; [.$a[42].0020.0002] # SHAVIAN LETTER URGE (Inter Alia hack)
E72A ; [.$a[41].0020.0002] # SHAVIAN LETTER AIR* (PUA as I use it, NOT CSUR)
E72B ; [.$a[42].0020.0002] # SHAVIAN LETTER URGE (PUA as I use it, NOT CSUR)
# EOF
};
sub new {
my ($cls, %opt) = @_;
$opt{entry} = ($opt{entry}//"")."\n".$entry;
$cls->SUPER::new(%opt);
}
return 1 unless $0 eq __FILE__;
package main;
use strict;
use open qw/:std :locale/;
my $collator = Unicode::Collate::Shavian->new();
my @lines = <>;
@lines = $collator->sort(@lines);
print for @lines;
exit;
#!/usr/bin/perl
use strict;
use utf8;
use open qw/:std :utf8/;
use List::Util 'shuffle';
use lib '.';
use sort_uca; # Unicode::Collate::Shavian but yolo
my (@test, %name);
open my $test, '<:utf8', './test.txt' or die $!;
while (<$test>) {
next if /^#/; chomp;
if (/^%%\s*(.*)$/) {
push @test, [];
$name{$test[-1]} = $1;
next;
}
push @{$test[-1]}, $_;
}
close $test or die $!;
@test = grep scalar @$_, @test;
printf "1..%u\n", scalar @test;
my $collator = Unicode::Collate::Shavian->new;
for my $i (0 .. $#test) {
my @x = @{$test[$i]};
my @y = shuffle(@x);
@y = $collator->sort(@y);
my $err = 0;
$err += $x[$_] ne $y[$_] for 0..$#y;
my $n = $name{$test[$i]} // ''; $n = " - $n" if $n;
$n .= ' # TODO' if $n and $n =~ /\b๐‘ช๐‘๐‘–๐‘ฉ๐‘ฏ๐‘ฉ๐‘ค\b/;
if (!$err) {
printf "ok %u%s\n", $i+1, $n;
} else {
printf "not ok %u%s\n", $i+1, $n;
eval {
require Algorithm::Diff;
my $dif = Algorithm::Diff::sdiff(\@x, \@y);
my $len = 0; for (@x) { $len = length if $len < length }
for (@$dif) {
my ($dir, $x, $y) = @$_;
printf("# %-*s %s %s\n", $len,
$x, $dir =~ y/uc+-/ :></r, $y);
}
};
}
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment