hoehrmann · May 26, 2013 10:55
diff --git a/normalize.pl b/normalize.pl
 #!perl -w
 use strict;
 use warnings;
 use DBI;
 use Data::Dumper;
 use Storable qw(retrieve nstore);
 use IO::File;

 # Look up the canonical combining class as in the UCD
 our %CCC;

 # Look up the canonical decomposition as in the UCD.
 our %DEC;

 # Look up primary composites based on their NFD expansion
 our %CAN;

 if (-f 'ccc.sto' and -f 'dec.sto' and -f 'can.sto') {

  %CCC = %{retrieve 'ccc.sto'};
  %DEC = %{retrieve 'dec.sto'};
  %CAN = %{retrieve 'can.sto'};

 } else {

  # This obviously requires that the whole UCD is available as SQLite
  # database in the file 'ucd'. It can easily be derived from the com-
  # plete XML dump of the database in the file ucd.all.flat.xml.zip.
  # Importantly it contains the Hangul decompositions, so we do not've
  # to compute them algorithmically.

  my $dbh = DBI->connect("dbi:SQLite:dbname=ucd","","");
  my $db = $dbh->selectall_hashref(q{

    SELECT
      cp, ccc, dm, dt, Comp_Ex
    FROM
      ucd

  }, 'cp');

  foreach my $key (keys %$db) {
    my $entry = $db->{$key};
    my $cp = chr(hex $entry->{cp});
    my $dm = join '', map { chr(hex) } split/\s+/, $entry->{dm};

    $CCC{$cp} = $entry->{ccc};
    $DEC{$cp} = $dm if
      $entry->{dt} eq 'can';
  }

  foreach my $key (keys %$db) {
    my $entry = $db->{$key};
    my $cp = chr(hex $entry->{cp});
    next unless $entry->{Comp_Ex} eq 'N';
    $CAN{ NFD($cp) } = $cp;
  }

  nstore \%CCC, 'ccc.sto';
  nstore \%DEC, 'dec.sto';
  nstore \%CAN, 'can.sto';

 }

 sub reorder {
  my $s = shift; $s = "$s";
  my $i = 1;
  while ($i < length $s) {
    my $x = substr $s, $i - 1, 1;
    my $y = substr $s, $i + 0, 1;
    
    if (combClass($x) > combClass($y) and combClass($y) != 0) {
      substr $s, $i - 1, 1, $y;
      substr $s, $i + 0, 1, $x;
      $i-- if $i > 1;
      next;
    }
    $i++;
  }

  return $s;
 }

 sub combClass {
  my $c = shift;
  return $CCC{$c} || 0;
 }

 sub decombine {
  my $c = shift;
  my $d = $DEC{$c};

  return $c unless defined $d;

  # Recursively decombine
  join '', map { decombine($_) } split//, $d
 }

 sub NFD {
  my $s = shift; $s = "$s";
  my $d = join '', map { decombine($_) } split//, $s;
  return reorder($d);
 }

 sub combine {
  my $starter = shift;
  my $combiner = shift;
  my $d = NFD $starter . $combiner;
  return $CAN{$d};
 }

 sub NFC {
  my $s = shift;
  $s = NFD $s;
  my $starterpos = 0;

  # advance to the first starter
  $starterpos++ while combClass(substr $s, $starterpos, 1) != 0;

  my $pos = $starterpos + 1;
  my $prev_ccc = 0;

  while ($pos < length $s) {
    my $current = substr $s, $pos, 1;
    my $here_ccc = combClass($current);

    my $combo = combine(substr($s, $starterpos, 1), $current);
    my $blocked = ($starterpos < $pos - 1) && ($prev_ccc >= $here_ccc);

    if (defined $combo and not $blocked) {
      substr $s, $starterpos, 1, $combo;
      substr $s, $pos, 1, '';
      next;
    }

    if ($here_ccc == 0) {
      $starterpos = $pos;
    }

    $prev_ccc = $here_ccc;
    $pos++;
  }

  return $s;
 }

 my $f = IO::File->new('<' . 'NormalizationTest.txt');

 while (<$f>) {
  chomp;
  s/#.*//;
  next if /^@/;
  next unless /\S/;
  my ($c1, $c2, $c3, $c4, $c5) = split /;/;

  $c1 = join '', map { chr(hex) } $c1 =~ m/(\S+)/g;
  $c2 = join '', map { chr(hex) } $c2 =~ m/(\S+)/g;
  $c3 = join '', map { chr(hex) } $c3 =~ m/(\S+)/g;
  $c4 = join '', map { chr(hex) } $c4 =~ m/(\S+)/g;
  $c5 = join '', map { chr(hex) } $c5 =~ m/(\S+)/g;

  my $fail = 0;
  $fail++ unless $c2 eq NFC($c1) and
                 $c2 eq NFC($c2) and
                 $c2 eq NFC($c3) and
                 $c4 eq NFC($c4) and
                 $c4 eq NFC($c5) ; 

  $fail++ unless $c3 eq NFD($c1) and
                 $c3 eq NFD($c2) and
                 $c3 eq NFD($c3) and
                 $c5 eq NFD($c4) and
                 $c5 eq NFD($c5) ; 

  next unless $fail;

  warn "bad";
 }
	#!perl -w
	use strict;
	use warnings;
	use DBI;
	use Data::Dumper;
	use Storable qw(retrieve nstore);
	use IO::File;

	# Look up the canonical combining class as in the UCD
	our %CCC;

	# Look up the canonical decomposition as in the UCD.
	our %DEC;

	# Look up primary composites based on their NFD expansion
	our %CAN;

	if (-f 'ccc.sto' and -f 'dec.sto' and -f 'can.sto') {

	%CCC = %{retrieve 'ccc.sto'};
	%DEC = %{retrieve 'dec.sto'};
	%CAN = %{retrieve 'can.sto'};

	} else {

	# This obviously requires that the whole UCD is available as SQLite
	# database in the file 'ucd'. It can easily be derived from the com-
	# plete XML dump of the database in the file ucd.all.flat.xml.zip.
	# Importantly it contains the Hangul decompositions, so we do not've
	# to compute them algorithmically.

	my $dbh = DBI->connect("dbi:SQLite:dbname=ucd","","");
	my $db = $dbh->selectall_hashref(q{

	SELECT
	cp, ccc, dm, dt, Comp_Ex
	FROM
	ucd

	}, 'cp');

	foreach my $key (keys %$db) {
	my $entry = $db->{$key};
	my $cp = chr(hex $entry->{cp});
	my $dm = join '', map { chr(hex) } split/\s+/, $entry->{dm};

	$CCC{$cp} = $entry->{ccc};
	$DEC{$cp} = $dm if
	$entry->{dt} eq 'can';
	}

	foreach my $key (keys %$db) {
	my $entry = $db->{$key};
	my $cp = chr(hex $entry->{cp});
	next unless $entry->{Comp_Ex} eq 'N';
	$CAN{ NFD($cp) } = $cp;
	}

	nstore \%CCC, 'ccc.sto';
	nstore \%DEC, 'dec.sto';
	nstore \%CAN, 'can.sto';

	}

	sub reorder {
	my $s = shift; $s = "$s";
	my $i = 1;
	while ($i < length $s) {
	my $x = substr $s, $i - 1, 1;
	my $y = substr $s, $i + 0, 1;

	if (combClass($x) > combClass($y) and combClass($y) != 0) {
	substr $s, $i - 1, 1, $y;
	substr $s, $i + 0, 1, $x;
	$i-- if $i > 1;
	next;
	}
	$i++;
	}

	return $s;
	}

	sub combClass {
	my $c = shift;
	return $CCC{$c} \|\| 0;
	}

	sub decombine {
	my $c = shift;
	my $d = $DEC{$c};

	return $c unless defined $d;

	# Recursively decombine
	join '', map { decombine($_) } split//, $d
	}

	sub NFD {
	my $s = shift; $s = "$s";
	my $d = join '', map { decombine($_) } split//, $s;
	return reorder($d);
	}

	sub combine {
	my $starter = shift;
	my $combiner = shift;
	my $d = NFD $starter . $combiner;
	return $CAN{$d};
	}

	sub NFC {
	my $s = shift;
	$s = NFD $s;
	my $starterpos = 0;

	# advance to the first starter
	$starterpos++ while combClass(substr $s, $starterpos, 1) != 0;

	my $pos = $starterpos + 1;
	my $prev_ccc = 0;

	while ($pos < length $s) {
	my $current = substr $s, $pos, 1;
	my $here_ccc = combClass($current);

	my $combo = combine(substr($s, $starterpos, 1), $current);
	my $blocked = ($starterpos < $pos - 1) && ($prev_ccc >= $here_ccc);

	if (defined $combo and not $blocked) {
	substr $s, $starterpos, 1, $combo;
	substr $s, $pos, 1, '';
	next;
	}

	if ($here_ccc == 0) {
	$starterpos = $pos;
	}

	$prev_ccc = $here_ccc;
	$pos++;
	}

	return $s;
	}

	my $f = IO::File->new('<' . 'NormalizationTest.txt');

	while (<$f>) {
	chomp;
	s/#.*//;
	next if /^@/;
	next unless /\S/;
	my ($c1, $c2, $c3, $c4, $c5) = split /;/;

	$c1 = join '', map { chr(hex) } $c1 =~ m/(\S+)/g;
	$c2 = join '', map { chr(hex) } $c2 =~ m/(\S+)/g;
	$c3 = join '', map { chr(hex) } $c3 =~ m/(\S+)/g;
	$c4 = join '', map { chr(hex) } $c4 =~ m/(\S+)/g;
	$c5 = join '', map { chr(hex) } $c5 =~ m/(\S+)/g;

	my $fail = 0;
	$fail++ unless $c2 eq NFC($c1) and
	$c2 eq NFC($c2) and
	$c2 eq NFC($c3) and
	$c4 eq NFC($c4) and
	$c4 eq NFC($c5) ;

	$fail++ unless $c3 eq NFD($c1) and
	$c3 eq NFD($c2) and
	$c3 eq NFD($c3) and
	$c5 eq NFD($c4) and
	$c5 eq NFD($c5) ;

	next unless $fail;

	warn "bad";
	}