Skip to content

Instantly share code, notes, and snippets.

@jeffreykegler
Created August 6, 2012 05:37
Show Gist options
  • Save jeffreykegler/3271072 to your computer and use it in GitHub Desktop.
Save jeffreykegler/3271072 to your computer and use it in GitHub Desktop.
Marpa v. Perl regex, round 2
#!perl
use 5.010;
use strict;
use warnings;
use autodie;
use Benchmark qw(timethis);
use List::Util qw(min);
use Regexp::Common qw /balanced/;
use Marpa::R2 2.016000;
my $tchrist_regex = '(\\((?:[^()]++|(?-1))*+\\))';
my $marpa_answer_shown;
my $thin_answer_shown;
my $regex_answer_shown;
sub do_marpa_r2 {
my ($s) = @_;
my $grammar_args = {
start => 'S',
rules => [
[ S => [qw(prefix first_balanced endmark )] ],
{ lhs => 'S',
rhs => [qw(prefix first_balanced )]
},
{ lhs => 'prefix', rhs => [qw(prefix_char)], min => 0 },
{ lhs => 'prefix_char', rhs => [qw(xlparen)] },
{ lhs => 'prefix_char', rhs => [qw(rparen)] },
{ lhs => 'lparen', rhs => [qw(xlparen)] },
{ lhs => 'lparen', rhs => [qw(ilparen)] },
{ lhs => 'first_balanced',
rhs => [qw(xlparen balanced_sequence rparen)],
},
{ lhs => 'balanced',
rhs => [qw(lparen balanced_sequence rparen)],
},
{ lhs => 'balanced_sequence',
rhs => [qw(balanced)],
min => 0,
},
],
};
my $grammar = Marpa::R2::Grammar->new($grammar_args);
$grammar->precompute();
my ($first_balanced_rule) =
grep { ( $grammar->rule($_) )[0] eq 'first_balanced' }
$grammar->rule_ids();
my $recce = Marpa::R2::Recognizer->new( { grammar => $grammar } );
$recce->expected_symbol_event_set( 'endmark', 1 );
my $location = 0;
my $string_length = length $s;
my $end_of_match;
# find the match which ends first -- the one which starts
# first must start at or before it does
CHAR: while ( $location < $string_length ) {
my $value = substr $s, $location, 1;
my $event_count;
if ( $value eq '(' ) {
# say "Adding xlparen at $location";
$event_count = $recce->read('xlparen');
}
else {
# say "Adding rparen at $location";
$event_count = $recce->read('rparen');
}
if ( $event_count
and grep { $_->[0] eq 'SYMBOL_EXPECTED' } @{ $recce->events() } )
{
$end_of_match = $location + 1;
last CHAR;
} ## end if ( $event_count and grep { $_->[0] eq 'SYMBOL_EXPECTED'...})
$location++;
} ## end CHAR: while ( $location < $string_length )
if ( not defined $end_of_match ) {
say 'No balanced parens';
return 0;
}
CHAR: while ( ++$location < $string_length ) {
my $value = substr $s, $location, 1;
my $token = $value eq '(' ? 'ilparen' : 'rparen';
my $event_count = $recce->read($token);
last CHAR if not defined $event_count;
if ( $event_count
and grep { $_->[0] eq 'SYMBOL_EXPECTED' } @{ $recce->events() } )
{
$end_of_match = $location + 1;
}
} ## end CHAR: while ( ++$location < $string_length )
my $report = $recce->progress($end_of_match);
# say Dumper($report);
my $start_of_match = List::Util::min map { $_->[2] }
grep { $_->[1] < 0 && $_->[0] == $first_balanced_rule } @{$report};
my $value = substr $s, $start_of_match, $end_of_match - $start_of_match;
return 0 if $marpa_answer_shown;
$marpa_answer_shown = $value;
say qq{Marpa::R2: "$value" at $start_of_match-$end_of_match};
return 0;
} ## end sub do_marpa_r2
sub do_regex {
my ($s) = @_;
my $answer =
$s =~ $tchrist_regex
? $1
: 'no balanced parentheses';
return 0 if $regex_answer_shown;
$regex_answer_shown = $answer;
say qq{regex: "$answer"};
return 0;
} ## end sub do_regex
sub do_thin {
my ($s) = @_;
my $thin_grammar = Marpa::R2::Thin::G->new( { if => 1 } );
my $s_xlparen = $thin_grammar->symbol_new();
my $s_ilparen = $thin_grammar->symbol_new();
my $s_rparen = $thin_grammar->symbol_new();
my $s_lparen = $thin_grammar->symbol_new();
my $s_endmark = $thin_grammar->symbol_new();
my $s_start = $thin_grammar->symbol_new();
my $s_prefix = $thin_grammar->symbol_new();
my $s_first_balanced = $thin_grammar->symbol_new();
my $s_prefix_char = $thin_grammar->symbol_new();
my $s_balanced_sequence = $thin_grammar->symbol_new();
my $s_balanced = $thin_grammar->symbol_new();
$thin_grammar->start_symbol_set($s_start);
$thin_grammar->rule_new( $s_start,
[ $s_prefix, $s_first_balanced, $s_endmark ] );
$thin_grammar->rule_new( $s_start, [ $s_prefix, $s_first_balanced ] );
$thin_grammar->rule_new( $s_prefix_char, [$s_xlparen] );
$thin_grammar->rule_new( $s_prefix_char, [$s_rparen] );
$thin_grammar->rule_new( $s_lparen, [$s_xlparen] );
$thin_grammar->rule_new( $s_lparen, [$s_ilparen] );
my $first_balanced_rule =
$thin_grammar->rule_new( $s_first_balanced,
[ $s_xlparen, $s_balanced_sequence, $s_rparen ] );
$thin_grammar->rule_new( $s_balanced,
[ $s_lparen, $s_balanced_sequence, $s_rparen ] );
$thin_grammar->sequence_new( $s_prefix, $s_prefix_char, { min => 0 } );
$thin_grammar->sequence_new( $s_balanced_sequence, $s_balanced,
{ min => 0 } );
$thin_grammar->precompute();
my $thin_recce = Marpa::R2::Thin::R->new($thin_grammar);
$thin_recce->start_input();
$thin_recce->expected_symbol_event_set( $s_endmark, 1 );
my $location = 0;
my $string_length = length $s;
my $end_of_match;
# find the match which ends first -- the one which starts
# first must start at or before it does
CHAR: while ( $location < $string_length ) {
my $value = substr $s, $location, 1;
my $event_count;
if ( $value eq '(' ) {
# say "Adding xlparen at $location";
$thin_recce->alternative( $s_xlparen, 0, 1 );
$event_count = $thin_recce->earleme_complete();
} ## end if ( $value eq '(' ) )
else {
# say "Adding rparen at $location";
$thin_recce->alternative( $s_rparen, 0, 1 );
$event_count = $thin_recce->earleme_complete();
}
if ($event_count
and grep { $_ eq 'MARPA_EVENT_SYMBOL_EXPECTED' }
map { ; ( $thin_grammar->event($_) )[0] }
( 0 .. $event_count - 1 )
)
{
$end_of_match = $location + 1;
last CHAR;
} ## end if ( $event_count and grep { $_ eq ...})
$location++;
} ## end CHAR: while ( $location < $string_length )
if ( not defined $end_of_match ) {
say 'No balanced parens';
return 0;
}
CHAR: while ( ++$location < $string_length ) {
my $value = substr $s, $location, 1;
my $token = $value eq '(' ? $s_ilparen : $s_rparen;
# say "Adding $token at $location";
last CHAR if not defined $thin_recce->alternative( $token, 0, 1 );
my $event_count = $thin_recce->earleme_complete();
if ($event_count
and grep { $_ eq 'MARPA_EVENT_SYMBOL_EXPECTED' }
map { ; ( $thin_grammar->event($_) )[0] }
( 0 .. $event_count - 1 )
)
{
$end_of_match = $location + 1;
} ## end if ( $event_count and grep { $_ eq ...})
} ## end CHAR: while ( ++$location < $string_length )
my $start_of_match = $end_of_match;
$thin_recce->progress_report_start($end_of_match);
ITEM: while (1) {
my ( $rule_id, $dot_position, $item_origin ) =
$thin_recce->progress_item();
last ITEM if not defined $rule_id;
next ITEM if $dot_position >= 0;
next ITEM if $rule_id != $first_balanced_rule;
$start_of_match = $item_origin if $item_origin < $start_of_match;
} ## end ITEM: while (1)
my $value = substr $s, $start_of_match, $end_of_match - $start_of_match;
return 0 if $thin_answer_shown;
$thin_answer_shown = $value;
say qq{Marpa::R2::Thin: "$value" at $start_of_match-$end_of_match};
return 0;
} ## end sub do_thin
for my $length (qw(10 100 500 1000 2000 3000)) {
my $target = '(()())((';
my $test_string = ( '(' x ( $length - length $target ) ) . $target;
timethis( -4, sub { do_thin($test_string); }, "Marpa::R2::Thin $length" );
timethis( -4, sub { do_marpa_r2($test_string); }, "Marpa::R2 $length" );
timethis( -4, sub { do_regex($test_string); }, "regex $length" );
my $answer = '(()())';
$marpa_answer_shown eq $answer
or say {*STDERR} 'R2 ANSWER DOES NOT MATCH!';
$thin_answer_shown eq $answer
or say {*STDERR} 'Thin ANSWER DOES NOT MATCH!';
$regex_answer_shown eq $answer
or say {*STDERR} 'Regex ANSWER DOES NOT MATCH!';
$marpa_answer_shown = $thin_answer_shown = $regex_answer_shown = 0;
} ## end for my $length (qw(10 100 500 1000 2000 3000))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment