Created
December 7, 2024 07:58
-
-
Save mwgamera/63b0db71e4820c42bec2f52a599d5fa8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# klg, Mar 2021 | |
use strict; | |
use utf8; | |
use open qw/:std :utf8/; | |
use constant PATH => <'~/lib/dict/readlex/kingsleyreadlexicon.tsv'>; | |
# Called without a query: print a usage line and the dictionary path to
# STDERR, then stop.
if (!@ARGV) {
    # Decode the program name so the :utf8 layer on STDERR prints it as text.
    utf8::decode($0);
    my $prog = $0 =~ s!.*/!!gr;    # drop any leading directory part
    printf STDERR "๐ฟ๐๐ฆ๐ก: %s ๐ข๐ป๐[๐] ...\n", $prog;
    printf STDERR "<%s>\n", PATH;
    exit;
}
# Split the command line into the search term ($qs) and an optional
# part-of-speech filter ($ps) written as a trailing "_TAG" or "/TAG".
my $qs = "@ARGV"; # bytes
my $ps = '';
if ($qs =~ s!\s*[_/]\s*([A-Z][A-Z0-9]*)\s*\z!!i) {
    $ps = $1;
}

# $qr0 is the plain "does this line mention the term at all" pattern: the
# term as a literal, widened to the term-as-regex when it contains regex
# punctuation and actually compiles (the eval swallows compile errors).
my $qr0 = qr/\Q$qs\E/i;
if ($qs =~ /[\[\]().^\$]/) {
    eval { $qr0 = qr/$qs|$qr0/i };
}

# Columns: Latn Shaw POS IPA frequency
# $qr tries the alternatives below in order; the (?{ N }) at the end of the
# one that matches leaves its number N in $^R.
my $qr = qr{
    # full match of either headword or spelling
    (^$qr0\t[^\t]*\t\Q$ps\E
    |^[^\t]*\t$qr0\t\Q$ps\E) (?{ 1 }) |
    # initial match
    (^$qr0 [^\t]+\t[^\t]*\t\Q$ps\E
    |^[^\t]*\t$qr0 [^\t]*\t\Q$ps\E) (?{ 2 }) |
    # any partial match
    (^[^\t]+$qr0 [^\t]+\t[^\t]*\t\Q$ps\E
    |^[^\t]*\t[^\t]+$qr0 [^\t]*\t\Q$ps\E) (?{ 3 }) |
    # the same but without POS
    (^$qr0\t
    |^[^\t]*\t$qr0\t) (?{ 4 }) |
    (^$qr0
    |^[^\t]*\t$qr0) (?{ 5 }) |
    (^[^\t]+$qr0
    |^[^\t]*\t[^\t]+$qr0) (?{ 6 }) |
    # everything else
    $qr0(?{ 9 })
}ix;

# Non-regex prefixes & suffixes
# A term written with hyphens is anchored inside the first column and put
# in front of $qr as an extra alternative with number 1. The hyphens are
# stripped from $qs by the substitutions in the conditions themselves.
my $affixed = 1;
if ($qs =~ s/^-([^\[\]()^\$]+)-$/$1/) {
    # "-term-": anywhere in the first column
    $qr = qr/^[^\t]*\Q$qs\E(?{ 1 })|$qr/i;
}
elsif ($qs =~ s/^-([^\[\]()^\$]+)(?<!-)$/$1/) {
    # "-term": at the end of the first column
    $qr = qr/^[^\t]*\Q$qs\E\t(?{ 1 })|$qr/i;
}
elsif ($qs =~ s/^(?!-)([^\[\]()^\$]+)-$/$1/) {
    # "term-": at the start of the first column
    $qr = qr/^\Q$qs\E(?{ 1 })|$qr/i;
}
else {
    $affixed = 0;
}
# With the hyphens gone, the plain pattern must accept the bare term as well.
$qr0 = qr/\Q$qs\E|$qr0/i if $affixed;
# Read the dictionary once and collect the lines with the lowest match
# number seen so far; a strictly lower number discards what was collected.
open my $dict, '<', PATH or die $!;
binmode($dict); # sic, for speed
my $minR = 0+'inf';
my @match;
while (my $line = <$dict>) {
    # Plain pattern first, then the numbered one, which sets $^R.
    next unless $line =~ /$qr0/ and $line =~ /$qr/;
    my $rank = $^R;
    next unless $rank <= $minR;
    if ($rank < $minR) {
        @match = ();
        $minR  = $rank;
    }
    push @match, $line;
}
close $dict;
# Columns: [0]Latn [1]Shaw [2]POS [3]IPA [4]frequency
# Turn each matching line into an array of fields, then sort so that rows
# sharing Latn, Shaw and IPA sit next to each other.
@match = map { [ split /\t/ ] } @match;
@match = sort {
       $a->[0] cmp $b->[0]      # Latn
    || $a->[1] cmp $b->[1]      # Shaw
    || $a->[3] cmp $b->[3]      # IPA
    || $b->[4] <=> $a->[4]      # frequency
} @match;

# Collapse neighbouring rows that agree on Latn, Shaw and IPA into a single
# row: POS tags are joined with commas and frequencies are added up.
my @m = [-1 x 5];    # placeholder so that $m[-1] always exists; shifted off below
for my $row (@match) {
    my $last    = $m[-1];
    my $differs = $last->[0] ne $row->[0]
               || $last->[1] ne $row->[1]
               || $last->[3] ne $row->[3];
    if ($differs) {
        push @m, $row;
        next;
    }
    $m[-1] = [
        $last->[0],
        $last->[1],
        "$last->[2],$row->[2]",    # POS
        $last->[3],
        $last->[4] + $row->[4],
    ];
}
@match = ();
shift @m;          # drop the placeholder
exit unless @m;    # nothing matched: no output
# Find the widest value in each of the first four columns; the first two
# widths start at 8. Fields are decoded in place, so length() counts
# characters rather than bytes.
my @len = (8, 8, 0, 0);
for my $row (@m) {
    for my $col (0 .. 3) {
        utf8::decode($row->[$col]);
        my $width = length $row->[$col];
        $len[$col] = $width if $len[$col] < $width;
    }
}
# Take one off every width above 16, repeatedly, until the four widths
# add up to less than 74.
until ($len[0] + $len[1] + $len[2] + $len[3] < 74) {
    for my $w (@len) {
        $w-- if $w > 16;
    }
}
# Print the merged entries, highest frequency first (ties in Latn order),
# one per line as "Latn Shaw (POS) IPA", with the matched text highlighted.

# $qs was assembled from raw command-line bytes; decode it so the display
# pattern works on the decoded fields.
utf8::decode($qs);
my $qru = qr/\Q$qs\E/i;
eval { $qru = qr/$qru|$qs/i } if $qs =~ /[\[\]().^\$]/;

# NOTE(review): the non-ASCII literals in the two patterns marked below look
# mis-encoded in this copy of the file (the original comments were written
# in Shavian script). They are reproduced unchanged; confirm them against
# the upstream source.
for my $entry (sort { $b->[4] <=> $a->[4] || $a->[0] cmp $b->[0] } @m) {
    my ($Latn, $Shaw, $POS, $IPA) = @$entry;

    next if $IPA =~ /ร/;    # presumably skips trap-bath split variants; TODO confirm

    # Capitalise proper nouns; upper-case the whole word when the Shavian
    # spelling starts with the marker character.
    $Latn =~ s/(.)/\u$1/ if $POS eq 'NP0';
    $Latn =~ s/(.)/\U$1/g if $Shaw =~ /^โธฐ/;    # presumably a shavian.info orthographic convention; TODO confirm

    # A literal \N in the IPA column is shown as nothing.
    if ($IPA eq '\N') { $IPA = '' } else { $IPA = " $IPA" }

    # Highlight the requested POS tag; with no filter there is nothing to mark.
    $POS =~ s!(\A|,)(\Q$ps\E)!$1.highlight($2)!gie if length $ps;

    # Pad each column to its own computed width (pad before highlighting so
    # escape sequences do not count towards the width).
    $Latn = sprintf '%-*s', $len[0], $Latn;
    $Shaw = sprintf '%-*s', $len[1], $Shaw;
    $Shaw .= ' ' if $Shaw =~ /\x{200D}/;

    # Zero-length matches are left alone instead of becoming bare escapes.
    s!($qru)!length($1) ? highlight($1) : $1!ge for ($Latn, $Shaw, $IPA);

    print "$Latn $Shaw ($POS)$IPA\e[K\n";
}
# highlight($txt)
#
# Returns $txt wrapped in ANSI SGR escape sequences. The colour comes from,
# in order of preference:
#   1. the "ms=" or "mt=" entry of $ENV{GREP_COLORS},
#   2. $ENV{GREP_COLOR},
#   3. bold on ("\e[1m") / bold off ("\e[22m") when neither is set.
# Empty or undefined text yields '' so that callers never print a bare pair
# of escape sequences around nothing.
sub highlight {
    my $txt = shift;
    return '' unless defined $txt and length $txt;

    my $colors = $ENV{GREP_COLORS};
    if ($colors and $colors =~ /(?:^|:)m[st]=([^:]++)(?::|$)/) {
        return "\e[${1}m$txt\e[0m";
    }
    return "\e[$ENV{GREP_COLOR}m$txt\e[0m" if $ENV{GREP_COLOR};
    return "\e[1m$txt\e[22m";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment