Skip to content

Instantly share code, notes, and snippets.

@hoehrmann
Last active December 18, 2015 00:09
Show Gist options
  • Save hoehrmann/5695002 to your computer and use it in GitHub Desktop.
Save hoehrmann/5695002 to your computer and use it in GitHub Desktop.
Convert CPAN's Lingua::Translit XML data files into JSON. Originally http://lists.w3.org/Archives/Public/www-archive/2012Mar/0033.html
#!perl -w
use strict;
use warnings;
use XML::LibXML;
use Set::IntSpan;
use Encode;
use JSON;
use Attribute::Memoize;
# The rules file to convert is the first (and only required) command-line
# argument; bail out with a usage line when it is missing.
if (!@ARGV) {
    die "Usage: $0 rules.xml\n";
}

# Parse the rules file named on the command line. load_ext_dtd => 0 asks the
# parser not to load an external DTD.
my $doc = XML::LibXML->load_xml(
    load_ext_dtd => 0,
    location     => $ARGV[0],
);

# Every <rule> element, wherever it sits in the document.
my @rules = $doc->findnodes('//rule');
# simplify_regex($regex) -> string
#
# Expands a Perl pattern into an explicit character class made only of
# "\uXXXX-\uXXXX" ranges, e.g. '[a-c]' becomes
# '[\u0061-\u0063\uD800-\uDFFF]'.
#
# Every code point in U+0000..U+FFFF outside the surrogate block is tested as
# a one-character string against $regex. The surrogate block U+D800..U+DFFF
# is always part of the result and is never tested.
# NOTE(review): presumably the surrogates are included so that characters
# outside the BMP can still satisfy the class in a UTF-16 based consumer --
# confirm against the code that reads the JSON.
#
# The Memoize attribute (Attribute::Memoize) caches the result, so the 64K
# scan runs only once per distinct pattern.
sub simplify_regex : Memoize {
    my $regex = shift;

    # Presumably silences warnings about surrogate/noncharacter code points
    # while they are generated and matched below.
    no warnings 'utf8';

    # Compile once, wrapped in a non-capturing group. An interpolated pattern
    # that evaluates to the empty string is special-cased by Perl's match
    # operator to mean "reuse the last successfully matched pattern"; the
    # group guarantees an empty $regex really matches the empty string (and
    # therefore every character).
    my $matcher = qr/(?:$regex)/;

    # Ascending list of accepted code points: surrogates unconditionally,
    # everything else only when it matches.
    my @code_points = grep {
        ($_ >= 0xD800 && $_ <= 0xDFFF) || chr($_) =~ $matcher
    } 0x0000 .. 0xFFFF;

    # One array-reference set spec instead of tens of thousands of separate
    # arguments.
    my $spans = Set::IntSpan->new(\@code_points);

    # Each span is a [lower, upper] pair; single code points are emitted as
    # a degenerate X-X range.
    return
        "["
        . join("", map { sprintf "\\u%04X-\\u%04X", @$_ } $spans->spans)
        . "]";
}
# Build one record per <rule>: the literal from/to strings plus the before and
# after context patterns rewritten as explicit \uXXXX character classes.
my @list;
foreach my $rule (@rules) {
    my $from = $rule->findvalue('from');
    my $to   = $rule->findvalue('to');

    # findvalue() yields '' when nothing matches, so no "|| ''" fallback is
    # needed -- and using one would wrongly blank out a context that is the
    # literal string "0".
    my $before = $rule->findvalue('.//before');
    my $after  = $rule->findvalue('.//after');

    push @list, {
        from => $from,
        to   => $to,
        # A rule without context gets the class for the empty pattern made
        # optional via a trailing '?'.
        before => simplify_regex($before) . (length($before) ? '' : '?'),
        after  => simplify_regex($after)  . (length($after)  ? '' : '?'),
    };
}

# Emit the collected rules as UTF-8 encoded JSON on STDOUT; without this the
# script computes @list and then discards it. canonical() sorts hash keys so
# repeated runs produce identical output.
print JSON->new->utf8->pretty->canonical->encode(\@list);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment