Created
August 6, 2012 05:37
-
-
Save jeffreykegler/3271072 to your computer and use it in GitHub Desktop.
Marpa v. Perl regex, round 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl | |
use 5.010; | |
use strict; | |
use warnings; | |
use autodie; | |
use Benchmark qw(timethis); | |
use List::Util qw(min); | |
use Regexp::Common qw /balanced/; | |
use Marpa::R2 2.016000; | |
my $tchrist_regex = '(\\((?:[^()]++|(?-1))*+\\))'; | |
my $marpa_answer_shown; | |
my $thin_answer_shown; | |
my $regex_answer_shown; | |
sub do_marpa_r2 { | |
my ($s) = @_; | |
my $grammar_args = { | |
start => 'S', | |
rules => [ | |
[ S => [qw(prefix first_balanced endmark )] ], | |
{ lhs => 'S', | |
rhs => [qw(prefix first_balanced )] | |
}, | |
{ lhs => 'prefix', rhs => [qw(prefix_char)], min => 0 }, | |
{ lhs => 'prefix_char', rhs => [qw(xlparen)] }, | |
{ lhs => 'prefix_char', rhs => [qw(rparen)] }, | |
{ lhs => 'lparen', rhs => [qw(xlparen)] }, | |
{ lhs => 'lparen', rhs => [qw(ilparen)] }, | |
{ lhs => 'first_balanced', | |
rhs => [qw(xlparen balanced_sequence rparen)], | |
}, | |
{ lhs => 'balanced', | |
rhs => [qw(lparen balanced_sequence rparen)], | |
}, | |
{ lhs => 'balanced_sequence', | |
rhs => [qw(balanced)], | |
min => 0, | |
}, | |
], | |
}; | |
my $grammar = Marpa::R2::Grammar->new($grammar_args); | |
$grammar->precompute(); | |
my ($first_balanced_rule) = | |
grep { ( $grammar->rule($_) )[0] eq 'first_balanced' } | |
$grammar->rule_ids(); | |
my $recce = Marpa::R2::Recognizer->new( { grammar => $grammar } ); | |
$recce->expected_symbol_event_set( 'endmark', 1 ); | |
my $location = 0; | |
my $string_length = length $s; | |
my $end_of_match; | |
# find the match which ends first -- the one which starts | |
# first must start at or before it does | |
CHAR: while ( $location < $string_length ) { | |
my $value = substr $s, $location, 1; | |
my $event_count; | |
if ( $value eq '(' ) { | |
# say "Adding xlparen at $location"; | |
$event_count = $recce->read('xlparen'); | |
} | |
else { | |
# say "Adding rparen at $location"; | |
$event_count = $recce->read('rparen'); | |
} | |
if ( $event_count | |
and grep { $_->[0] eq 'SYMBOL_EXPECTED' } @{ $recce->events() } ) | |
{ | |
$end_of_match = $location + 1; | |
last CHAR; | |
} ## end if ( $event_count and grep { $_->[0] eq 'SYMBOL_EXPECTED'...}) | |
$location++; | |
} ## end CHAR: while ( $location < $string_length ) | |
if ( not defined $end_of_match ) { | |
say 'No balanced parens'; | |
return 0; | |
} | |
CHAR: while ( ++$location < $string_length ) { | |
my $value = substr $s, $location, 1; | |
my $token = $value eq '(' ? 'ilparen' : 'rparen'; | |
my $event_count = $recce->read($token); | |
last CHAR if not defined $event_count; | |
if ( $event_count | |
and grep { $_->[0] eq 'SYMBOL_EXPECTED' } @{ $recce->events() } ) | |
{ | |
$end_of_match = $location + 1; | |
} | |
} ## end CHAR: while ( ++$location < $string_length ) | |
my $report = $recce->progress($end_of_match); | |
# say Dumper($report); | |
my $start_of_match = List::Util::min map { $_->[2] } | |
grep { $_->[1] < 0 && $_->[0] == $first_balanced_rule } @{$report}; | |
my $value = substr $s, $start_of_match, $end_of_match - $start_of_match; | |
return 0 if $marpa_answer_shown; | |
$marpa_answer_shown = $value; | |
say qq{Marpa::R2: "$value" at $start_of_match-$end_of_match}; | |
return 0; | |
} ## end sub do_marpa_r2 | |
sub do_regex { | |
my ($s) = @_; | |
my $answer = | |
$s =~ $tchrist_regex | |
? $1 | |
: 'no balanced parentheses'; | |
return 0 if $regex_answer_shown; | |
$regex_answer_shown = $answer; | |
say qq{regex: "$answer"}; | |
return 0; | |
} ## end sub do_regex | |
sub do_thin { | |
my ($s) = @_; | |
my $thin_grammar = Marpa::R2::Thin::G->new( { if => 1 } ); | |
my $s_xlparen = $thin_grammar->symbol_new(); | |
my $s_ilparen = $thin_grammar->symbol_new(); | |
my $s_rparen = $thin_grammar->symbol_new(); | |
my $s_lparen = $thin_grammar->symbol_new(); | |
my $s_endmark = $thin_grammar->symbol_new(); | |
my $s_start = $thin_grammar->symbol_new(); | |
my $s_prefix = $thin_grammar->symbol_new(); | |
my $s_first_balanced = $thin_grammar->symbol_new(); | |
my $s_prefix_char = $thin_grammar->symbol_new(); | |
my $s_balanced_sequence = $thin_grammar->symbol_new(); | |
my $s_balanced = $thin_grammar->symbol_new(); | |
$thin_grammar->start_symbol_set($s_start); | |
$thin_grammar->rule_new( $s_start, | |
[ $s_prefix, $s_first_balanced, $s_endmark ] ); | |
$thin_grammar->rule_new( $s_start, [ $s_prefix, $s_first_balanced ] ); | |
$thin_grammar->rule_new( $s_prefix_char, [$s_xlparen] ); | |
$thin_grammar->rule_new( $s_prefix_char, [$s_rparen] ); | |
$thin_grammar->rule_new( $s_lparen, [$s_xlparen] ); | |
$thin_grammar->rule_new( $s_lparen, [$s_ilparen] ); | |
my $first_balanced_rule = | |
$thin_grammar->rule_new( $s_first_balanced, | |
[ $s_xlparen, $s_balanced_sequence, $s_rparen ] ); | |
$thin_grammar->rule_new( $s_balanced, | |
[ $s_lparen, $s_balanced_sequence, $s_rparen ] ); | |
$thin_grammar->sequence_new( $s_prefix, $s_prefix_char, { min => 0 } ); | |
$thin_grammar->sequence_new( $s_balanced_sequence, $s_balanced, | |
{ min => 0 } ); | |
$thin_grammar->precompute(); | |
my $thin_recce = Marpa::R2::Thin::R->new($thin_grammar); | |
$thin_recce->start_input(); | |
$thin_recce->expected_symbol_event_set( $s_endmark, 1 ); | |
my $location = 0; | |
my $string_length = length $s; | |
my $end_of_match; | |
# find the match which ends first -- the one which starts | |
# first must start at or before it does | |
CHAR: while ( $location < $string_length ) { | |
my $value = substr $s, $location, 1; | |
my $event_count; | |
if ( $value eq '(' ) { | |
# say "Adding xlparen at $location"; | |
$thin_recce->alternative( $s_xlparen, 0, 1 ); | |
$event_count = $thin_recce->earleme_complete(); | |
} ## end if ( $value eq '(' ) ) | |
else { | |
# say "Adding rparen at $location"; | |
$thin_recce->alternative( $s_rparen, 0, 1 ); | |
$event_count = $thin_recce->earleme_complete(); | |
} | |
if ($event_count | |
and grep { $_ eq 'MARPA_EVENT_SYMBOL_EXPECTED' } | |
map { ; ( $thin_grammar->event($_) )[0] } | |
( 0 .. $event_count - 1 ) | |
) | |
{ | |
$end_of_match = $location + 1; | |
last CHAR; | |
} ## end if ( $event_count and grep { $_ eq ...}) | |
$location++; | |
} ## end CHAR: while ( $location < $string_length ) | |
if ( not defined $end_of_match ) { | |
say 'No balanced parens'; | |
return 0; | |
} | |
CHAR: while ( ++$location < $string_length ) { | |
my $value = substr $s, $location, 1; | |
my $token = $value eq '(' ? $s_ilparen : $s_rparen; | |
# say "Adding $token at $location"; | |
last CHAR if not defined $thin_recce->alternative( $token, 0, 1 ); | |
my $event_count = $thin_recce->earleme_complete(); | |
if ($event_count | |
and grep { $_ eq 'MARPA_EVENT_SYMBOL_EXPECTED' } | |
map { ; ( $thin_grammar->event($_) )[0] } | |
( 0 .. $event_count - 1 ) | |
) | |
{ | |
$end_of_match = $location + 1; | |
} ## end if ( $event_count and grep { $_ eq ...}) | |
} ## end CHAR: while ( ++$location < $string_length ) | |
my $start_of_match = $end_of_match; | |
$thin_recce->progress_report_start($end_of_match); | |
ITEM: while (1) { | |
my ( $rule_id, $dot_position, $item_origin ) = | |
$thin_recce->progress_item(); | |
last ITEM if not defined $rule_id; | |
next ITEM if $dot_position >= 0; | |
next ITEM if $rule_id != $first_balanced_rule; | |
$start_of_match = $item_origin if $item_origin < $start_of_match; | |
} ## end ITEM: while (1) | |
my $value = substr $s, $start_of_match, $end_of_match - $start_of_match; | |
return 0 if $thin_answer_shown; | |
$thin_answer_shown = $value; | |
say qq{Marpa::R2::Thin: "$value" at $start_of_match-$end_of_match}; | |
return 0; | |
} ## end sub do_thin | |
for my $length (qw(10 100 500 1000 2000 3000)) { | |
my $target = '(()())(('; | |
my $test_string = ( '(' x ( $length - length $target ) ) . $target; | |
timethis( -4, sub { do_thin($test_string); }, "Marpa::R2::Thin $length" ); | |
timethis( -4, sub { do_marpa_r2($test_string); }, "Marpa::R2 $length" ); | |
timethis( -4, sub { do_regex($test_string); }, "regex $length" ); | |
my $answer = '(()())'; | |
$marpa_answer_shown eq $answer | |
or say {*STDERR} 'R2 ANSWER DOES NOT MATCH!'; | |
$thin_answer_shown eq $answer | |
or say {*STDERR} 'Thin ANSWER DOES NOT MATCH!'; | |
$regex_answer_shown eq $answer | |
or say {*STDERR} 'Regex ANSWER DOES NOT MATCH!'; | |
$marpa_answer_shown = $thin_answer_shown = $regex_answer_shown = 0; | |
} ## end for my $length (qw(10 100 500 1000 2000 3000)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment