Created
June 5, 2016 02:01
-
-
Save nanto/502769aef22cc07a93349351ffd2550c to your computer and use it in GitHub Desktop.
Convert Perl regexp `\p{...}` to JS regexp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: $0 <property> | |
# | |
# Print a JavaScript regular expression pattern | |
# corresponding to a Perl regular expression pattern `\p{property}`. | |
# | |
# See `perldoc perluniprops` for values that can be specified as <property>. | |
# | |
# Author: nanto_vi | |
# License: Public Domain | |
use strict; | |
use warnings; | |
use utf8; | |
use feature qw(say); | |
use Unicode::UCD qw(prop_invlist); | |
my $property = shift; | |
die 'No property is specified' unless defined $property; | |
my $invlist = [prop_invlist($property)]; | |
push @$invlist, 0x10FFFF + 1 if @$invlist % 2 == 1; | |
my $bmp_range_list = []; # bmp: Basic Multilingual Plane | |
my $sps_range_list = []; # sps: Supplementaly Planes | |
while (my ($a, $b) = splice @$invlist, 0, 2) { | |
# Ignore single surrogate | |
if ($a < 0xD800 && 0xD800 < $b) { | |
unshift @$invlist, $a, 0xD800, 0xD800, $b; | |
next; | |
} | |
if (0xD800 <= $a && $b <= 0xDFFF + 1) { | |
next; | |
} | |
if ($a < 0xDFFF + 1 && 0xDFFF + 1 < $b) { | |
unshift @$invlist, 0xDFFF + 1, $b; | |
next; | |
} | |
if ($a < 0x10000 && 0x10000 < $b) { | |
unshift @$invlist, $a, 0x10000, 0x10000, $b; | |
next; | |
} | |
if ($b <= 0x10000) { | |
push @$bmp_range_list, [$a, $b - 1]; | |
} else { | |
push @$sps_range_list, [$a, $b - 1]; | |
} | |
} | |
die 'Invalid property' if !@$bmp_range_list && !@$sps_range_list; | |
my $sps_naive_alternatives = []; | |
for (@$sps_range_list) { | |
my $a_hi = int(($_->[0] - 0x10000) / 0x400) + 0xD800; | |
my $a_lo = (($_->[0] - 0x10000) % 0x400) + 0xDC00; | |
my $b_hi = int(($_->[1] - 0x10000) / 0x400) + 0xD800; | |
my $b_lo = (($_->[1] - 0x10000) % 0x400) + 0xDC00; | |
if ($a_hi == $b_hi) { | |
push @$sps_naive_alternatives, { hi_range => [$a_hi, $a_hi], lo_range => [$a_lo, $b_lo] }; | |
} else { | |
my $has_head = ($a_lo != 0xDC00); | |
my $has_tail = ($b_lo != 0xDFFF); | |
my $body_hi_start = $has_head ? $a_hi + 1 : $a_hi; | |
my $body_hi_end = $has_tail ? $b_hi - 1 : $b_hi; | |
my $has_body = ($body_hi_start <= $body_hi_end); | |
push @$sps_naive_alternatives, { hi_range => [$a_hi, $a_hi], lo_range => [$a_lo, 0xDFFF] } if $has_head; | |
push @$sps_naive_alternatives, { hi_range => [$body_hi_start, $body_hi_end], lo_range => [0xDC00, 0xDFFF] } if $has_body; | |
push @$sps_naive_alternatives, { hi_range => [$b_hi, $b_hi], lo_range => [0xDC00, $b_lo] } if $has_tail; | |
} | |
} | |
my $sps_alternatives = []; | |
my $last_hi = -1; | |
for (@$sps_naive_alternatives) { | |
my $hi = $_->{hi_range}->[1]; | |
if ($hi == $last_hi) { | |
push @{ $sps_alternatives->[-1]->{lo_range_list} }, $_->{lo_range}; | |
} else { | |
push @$sps_alternatives, { | |
hi_range_list => [$_->{hi_range}], | |
lo_range_list => [$_->{lo_range}], | |
}; | |
} | |
$last_hi = $hi; | |
} | |
sub char_class_string { | |
my ($range_list) = @_; | |
if (@$range_list == 1 && $range_list->[0]->[0] == $range_list->[0]->[1]) { | |
return sprintf('\u%04X', $range_list->[0]->[0]); | |
} | |
my $range_strings = [ map { | |
my ($a, $b) = @$_; | |
($a == $b) ? sprintf('\u%04X', $a) : sprintf('\u%04X-\u%04X', $a, $b); | |
} @$range_list ]; | |
return '[' . join('', @$range_strings) . ']'; | |
} | |
my $alternative_strings = [ | |
@$bmp_range_list ? char_class_string($bmp_range_list) : (), | |
map { | |
char_class_string($_->{hi_range_list}) . | |
char_class_string($_->{lo_range_list}) | |
} @$sps_alternatives, | |
]; | |
my $pattern = @$sps_alternatives | |
? '(?:' . join('|', @$alternative_strings) . ')' | |
: $alternative_strings->[0]; | |
say Unicode::UCD::UnicodeVersion; | |
say $pattern; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment