Skip to content

Instantly share code, notes, and snippets.

@mwgamera
Created December 7, 2024 07:58
Show Gist options
  • Save mwgamera/63b0db71e4820c42bec2f52a599d5fa8 to your computer and use it in GitHub Desktop.
Save mwgamera/63b0db71e4820c42bec2f52a599d5fa8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
# klg, Mar 2021
use strict;
use utf8;
use open qw/:std :utf8/;
use constant PATH => <'~/lib/dict/readlex/kingsleyreadlexicon.tsv'>;
# With no search terms on the command line, print a short usage summary and
# the lexicon path to STDERR, then stop.
unless (@ARGV) {
    # $0 arrives as raw bytes; decode it so the :utf8 layer that
    # `use open qw/:std :utf8/` puts on STDERR does not encode it twice.
    utf8::decode($0);
    # The message is Shavian script for "usage: <program> word[s] ...".
    # The letters are written as \x{...} escapes so the text survives
    # transports that mangle non-ASCII source (the literal had been
    # corrupted into mis-decoded bytes).
    printf STDERR
        "\x{1047F}\x{10455}\x{10466}\x{10461}: %s \x{10462}\x{1047B}\x{1045B}[\x{1045F}] ...\n",
        $0 =~ s!.*/!!gr;    # program name without its directory part
    printf STDERR "<%s>\n", PATH;
    exit;
}
# Build the two patterns used to search the lexicon from the command line:
#   $qr0 - a cheap pre-filter that merely finds the query somewhere in a line;
#   $qr  - a ranking pattern whose alternatives each end in a (?{ N }) code
#          block, so that after a successful match $^R holds the rank N of
#          the alternative that matched.
# $ps is the optional part-of-speech filter; it stays empty unless the query
# ends in a POS suffix.
my $ps = '';
my $qs = "@ARGV"; # all arguments joined with spaces; still raw bytes
# A trailing "_TAG" or "/TAG" (a letter followed by letters or digits, any
# case) is cut off the query and remembered in $ps.
$ps = $1 if $qs =~ s!\s*[_/]\s*([A-Z][A-Z0-9]*)\s*\z!!i;
# Start with the query taken literally, case-insensitively.
my $qr0 = qr/\Q$qs\E/i;
# If the query contains any of [ ] ( ) . ^ $ also accept it as a regular
# expression. The eval leaves the literal-only pattern in place when the
# query does not compile as a regex.
eval { $qr0 = qr/$qs|$qr0/i } if $qs =~ /[\[\]().^\$]/;
# Columns: Latn Shaw POS IPA frequency
# Under /x the unescaped spaces and the "#" lines inside the pattern below are
# ignored by the regex engine. Ranks 1-3 require the third column to begin
# with $ps; ranks 4-6 repeat the same three shapes without looking at POS;
# rank 9 accepts the query anywhere in the line.
my $qr = qr{
# full match of either headword or spelling
(^$qr0\t[^\t]*\t\Q$ps\E
|^[^\t]*\t$qr0\t\Q$ps\E) (?{ 1 }) |
# initial match
(^$qr0 [^\t]+\t[^\t]*\t\Q$ps\E
|^[^\t]*\t$qr0 [^\t]*\t\Q$ps\E) (?{ 2 }) |
# any partial match
(^[^\t]+$qr0 [^\t]+\t[^\t]*\t\Q$ps\E
|^[^\t]*\t[^\t]+$qr0 [^\t]*\t\Q$ps\E) (?{ 3 }) |
# the same but without POS
(^$qr0\t
|^[^\t]*\t$qr0\t) (?{ 4 }) |
(^$qr0
|^[^\t]*\t$qr0) (?{ 5 }) |
(^[^\t]+$qr0
|^[^\t]*\t[^\t]+$qr0) (?{ 6 }) |
# everything else
$qr0(?{ 9 })
}ix;
# Non-regex prefixes & suffixes
# A query written with leading and/or trailing "-" (and containing none of
# [ ] ( ) ^ $) has the dashes stripped from $qs. A rank-1 alternative for the
# first column is put in front of $qr, and the dash-less text is added to the
# pre-filter $qr0 so that such lines are not rejected early.
if ($qs =~ s/^-([^\[\]()^\$]+)-$/$1/) {
# "-text-": text anywhere inside the first column
$qr = qr/^[^\t]*\Q$qs\E(?{ 1 })|$qr/i;
$qr0 = qr/\Q$qs\E|$qr0/i;
} elsif ($qs =~ s/^-([^\[\]()^\$]+)(?<!-)$/$1/) {
# "-text": first column ends with text
$qr = qr/^[^\t]*\Q$qs\E\t(?{ 1 })|$qr/i;
$qr0 = qr/\Q$qs\E|$qr0/i;
} elsif ($qs =~ s/^(?!-)([^\[\]()^\$]+)-$/$1/) {
# "text-": first column starts with text
$qr = qr/^\Q$qs\E(?{ 1 })|$qr/i;
$qr0 = qr/\Q$qs\E|$qr0/i;
}
# Scan the lexicon once and collect, in @match, the raw lines with the best
# (numerically lowest) rank. The rank of a line is the value that the
# ranking pattern $qr leaves in $^R when it matches.
open my $dict, '<', PATH
    or die sprintf("Cannot open %s: %s", PATH, $!);    # name the file in the error
binmode($dict); # sic, for speed: drop the I/O layers and match raw bytes
my $best_rank = 0 + 'inf';    # no line seen yet, so any rank is better
my @match;
while (my $line = <$dict>) {
    next unless $line =~ /$qr0/;    # cheap pre-filter
    next unless $line =~ /$qr/;     # ranking match; sets $^R
    my $rank = $^R;
    next unless $rank <= $best_rank;
    if ($rank < $best_rank) {
        # Strictly better than everything so far: discard the old matches.
        $best_rank = $rank;
        @match = ();
    }
    push @match, $line;
}
close $dict;
# Columns: [0]Latn [1]Shaw [2]POS [3]IPA [4]frequency
# Split every matched line into its tab-separated fields and order the rows
# so that rows agreeing in Latn, Shaw and IPA end up next to each other
# (most frequent first within such a group).
@match = sort {
    $$a[0] cmp $$b[0] ||    # Latn
    $$a[1] cmp $$b[1] ||    # Shaw
    $$a[3] cmp $$b[3] ||    # IPA
    $$b[4] <=> $$a[4]       # frequency
} map { [split /\t/] } @match;

# Merge neighbouring rows that differ only in POS into a single row whose POS
# field is the comma-joined list and whose frequency is the sum.
# The previous row is consulted only once one exists; no dummy first row is
# needed (the old `[-1 x 5]` sentinel was string repetition, yielding the
# single field "-1-1-1-1-1" rather than five fields).
my @m;
for (@match) {
    if (@m and
        $m[-1][0] eq $$_[0] and
        $m[-1][1] eq $$_[1] and
        $m[-1][3] eq $$_[3]
    ) {
        $m[-1] = [
            @{$m[-1]}[0,1],
            "$m[-1][2],$$_[2]",    # POS
            $m[-1][3],
            $m[-1][4] + $$_[4]
        ];
    } else {
        push @m, $_;
    }
}
@match = ();
exit unless @m;    # nothing matched: print nothing
# Print the merged rows as aligned columns, most frequent first, with the
# query (and the POS filter, if any) highlighted.

# Widths of the Latn, Shaw, POS and IPA columns in characters; the two
# spelling columns are at least 8 wide.
my @len = (8, 8, 0, 0);
for my $m (@m) {
    for my $i (0 .. 3) {
        utf8::decode($$m[$i]);    # fields were read as bytes; decode in place
        my $width = length $$m[$i];
        $len[$i] = $width if $len[$i] < $width;
    }
}
# Narrow every column wider than 16, one step at a time, until the four
# widths together are below 74.
while ($len[0] + $len[1] + $len[2] + $len[3] >= 74) {
    for (@len) { $_-- if $_ > 16 }
}
# Character-level version of the query pattern, used for highlighting.
utf8::decode($qs);
my $qru = qr/\Q$qs\E/i;
eval { $qru = qr/$qru|$qs/i } if $qs =~ /[\[\]().^\$]/;
for (sort { $$b[4] <=> $$a[4] || $$a[0] cmp $$b[0] } @m) {
    my ($Latn, $Shaw, $POS, $IPA) = @$_;
    # Skip rows whose IPA contains U+00C6 LATIN CAPITAL LETTER AE
    # (original note: trap-bath split).
    next if $IPA =~ /\x{C6}/;
    # Capitalise the first letter when the POS is exactly NP0.
    $Latn =~ s/(.)/\u$1/ if $POS eq 'NP0';
    # Upper-case the whole word when the Shavian form starts with
    # U+2E30 RING POINT (original note: shavian.info's orthographic
    # convention).
    $Latn =~ s/(.)/\U$1/g if $Shaw =~ /^\x{2E30}/;
    # An IPA field holding the two characters \N is printed as nothing.
    if ($IPA eq '\N') { $IPA = '' } else { $IPA = " $IPA" }
    # Highlight the requested POS tag; skipped for an empty filter, which
    # would otherwise match the empty string and emit bare escape codes.
    $POS =~ s!(\A|,)(\Q$ps\E)!$1.highlight($2)!gie if length $ps;
    $Latn = sprintf '%-*s', $len[0], $Latn;
    $Shaw = sprintf '%-*s', $len[1], $Shaw;    # pad to the Shaw width, not Latn's
    # One extra space when the form contains U+200D ZERO WIDTH JOINER;
    # presumably compensates for its zero display width (TODO confirm).
    $Shaw .= ' ' if $Shaw =~ /\x{200D}/;
    # Highlight the query; skipped for an empty query for the same reason
    # as the POS highlight above.
    if (length $qs) {
        s!($qru)!highlight($1)!ge for ($Latn, $Shaw, $IPA);
    }
    # \e[K erases to the end of the line after the row.
    print "$Latn $Shaw ($POS)$IPA\e[K\n";
}
# Wrap a piece of text in terminal escape sequences that make it stand out.
#   Argument: the text to emphasise.
#   Returns:  the text surrounded by SGR sequences.
# The colour is taken, in order of preference, from the "ms=" or "mt=" entry
# of $ENV{GREP_COLORS}, then from $ENV{GREP_COLOR}; if neither provides one,
# the text is rendered bold.
sub highlight {
    my ($text) = @_;

    my $grep_colors = $ENV{GREP_COLORS};
    if ($grep_colors
        and $grep_colors =~ /(?:^|:)m[st]=([^:]++)(?::|$)/)
    {
        return "\e[${1}m$text\e[0m";
    }

    if ($ENV{GREP_COLOR}) {
        return "\e[$ENV{GREP_COLOR}m$text\e[0m";
    }

    # Fallback: bold on, text, bold off.
    return "\e[1m$text\e[22m";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment