Created
May 26, 2013 10:55
-
-
Save hoehrmann/5652431 to your computer and use it in GitHub Desktop.
The attached script contains a simple Perl implementation of the NFC and NFD normalization form using data directly derived from the UCD. It includes testing against the NormalizationTest.txt file. Originally http://lists.w3.org/Archives/Public/www-archive/2009Feb/0015.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl -w | |
use strict; | |
use warnings; | |
use DBI; | |
use Data::Dumper; | |
use Storable qw(retrieve nstore); | |
use IO::File; | |
# Look up the canonical combining class as in the UCD | |
our %CCC; | |
# Look up the canonical decomposition as in the UCD. | |
our %DEC; | |
# Look up primary composites based on their NFD expansion | |
our %CAN; | |
if (-f 'ccc.sto' and -f 'dec.sto' and -f 'can.sto') { | |
%CCC = %{retrieve 'ccc.sto'}; | |
%DEC = %{retrieve 'dec.sto'}; | |
%CAN = %{retrieve 'can.sto'}; | |
} else { | |
# This obviously requires that the whole UCD is available as SQLite | |
# database in the file 'ucd'. It can easily be derived from the com- | |
# plete XML dump of the database in the file ucd.all.flat.xml.zip. | |
# Importantly it contains the Hangul decompositions, so we do not've | |
# to compute them algorithmically. | |
my $dbh = DBI->connect("dbi:SQLite:dbname=ucd","",""); | |
my $db = $dbh->selectall_hashref(q{ | |
SELECT | |
cp, ccc, dm, dt, Comp_Ex | |
FROM | |
ucd | |
}, 'cp'); | |
foreach my $key (keys %$db) { | |
my $entry = $db->{$key}; | |
my $cp = chr(hex $entry->{cp}); | |
my $dm = join '', map { chr(hex) } split/\s+/, $entry->{dm}; | |
$CCC{$cp} = $entry->{ccc}; | |
$DEC{$cp} = $dm if | |
$entry->{dt} eq 'can'; | |
} | |
foreach my $key (keys %$db) { | |
my $entry = $db->{$key}; | |
my $cp = chr(hex $entry->{cp}); | |
next unless $entry->{Comp_Ex} eq 'N'; | |
$CAN{ NFD($cp) } = $cp; | |
} | |
nstore \%CCC, 'ccc.sto'; | |
nstore \%DEC, 'dec.sto'; | |
nstore \%CAN, 'can.sto'; | |
} | |
sub reorder { | |
my $s = shift; $s = "$s"; | |
my $i = 1; | |
while ($i < length $s) { | |
my $x = substr $s, $i - 1, 1; | |
my $y = substr $s, $i + 0, 1; | |
if (combClass($x) > combClass($y) and combClass($y) != 0) { | |
substr $s, $i - 1, 1, $y; | |
substr $s, $i + 0, 1, $x; | |
$i-- if $i > 1; | |
next; | |
} | |
$i++; | |
} | |
return $s; | |
} | |
sub combClass { | |
my $c = shift; | |
return $CCC{$c} || 0; | |
} | |
sub decombine { | |
my $c = shift; | |
my $d = $DEC{$c}; | |
return $c unless defined $d; | |
# Recursively decombine | |
join '', map { decombine($_) } split//, $d | |
} | |
sub NFD { | |
my $s = shift; $s = "$s"; | |
my $d = join '', map { decombine($_) } split//, $s; | |
return reorder($d); | |
} | |
sub combine { | |
my $starter = shift; | |
my $combiner = shift; | |
my $d = NFD $starter . $combiner; | |
return $CAN{$d}; | |
} | |
sub NFC { | |
my $s = shift; | |
$s = NFD $s; | |
my $starterpos = 0; | |
# advance to the first starter | |
$starterpos++ while combClass(substr $s, $starterpos, 1) != 0; | |
my $pos = $starterpos + 1; | |
my $prev_ccc = 0; | |
while ($pos < length $s) { | |
my $current = substr $s, $pos, 1; | |
my $here_ccc = combClass($current); | |
my $combo = combine(substr($s, $starterpos, 1), $current); | |
my $blocked = ($starterpos < $pos - 1) && ($prev_ccc >= $here_ccc); | |
if (defined $combo and not $blocked) { | |
substr $s, $starterpos, 1, $combo; | |
substr $s, $pos, 1, ''; | |
next; | |
} | |
if ($here_ccc == 0) { | |
$starterpos = $pos; | |
} | |
$prev_ccc = $here_ccc; | |
$pos++; | |
} | |
return $s; | |
} | |
my $f = IO::File->new('<' . 'NormalizationTest.txt'); | |
while (<$f>) { | |
chomp; | |
s/#.*//; | |
next if /^@/; | |
next unless /\S/; | |
my ($c1, $c2, $c3, $c4, $c5) = split /;/; | |
$c1 = join '', map { chr(hex) } $c1 =~ m/(\S+)/g; | |
$c2 = join '', map { chr(hex) } $c2 =~ m/(\S+)/g; | |
$c3 = join '', map { chr(hex) } $c3 =~ m/(\S+)/g; | |
$c4 = join '', map { chr(hex) } $c4 =~ m/(\S+)/g; | |
$c5 = join '', map { chr(hex) } $c5 =~ m/(\S+)/g; | |
my $fail = 0; | |
$fail++ unless $c2 eq NFC($c1) and | |
$c2 eq NFC($c2) and | |
$c2 eq NFC($c3) and | |
$c4 eq NFC($c4) and | |
$c4 eq NFC($c5) ; | |
$fail++ unless $c3 eq NFD($c1) and | |
$c3 eq NFD($c2) and | |
$c3 eq NFD($c3) and | |
$c5 eq NFD($c4) and | |
$c5 eq NFD($c5) ; | |
next unless $fail; | |
warn "bad"; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment