Created
March 9, 2016 07:10
-
-
Save rns/3e8aec88afc988b6cbb5 to your computer and use it in GitHub Desktop.
template code to build ASTs from ASF per https://github.com/jeffreykegler/Marpa--R2/issues/266
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use 5.010; | |
use strict; | |
use warnings; | |
use Test::More tests => 3; | |
use Marpa::R2; | |
use Data::Dumper; | |
$Data::Dumper::Indent = 0; | |
$Data::Dumper::Terse = 1; | |
$Data::Dumper::Deepcopy = 1; | |
my $dsl = <<'END_OF_SOURCE'; | |
:default ::= action => [ name, values ] | |
lexeme default = action => [ name, value ] | |
:discard ~ whitespace | |
S ::= NP VP period | |
NP ::= NN | |
| NNS | |
| DT NN | |
| NN NNS | |
| NNS CC NNS | |
VP ::= VBZ NP | |
| VP VBZ NNS | |
| VP CC VP | |
| VP VP CC VP | |
| VBZ | |
CC ~ 'and' | |
DT ~ 'a' | 'an' | |
NN ~ 'panda' | |
NNS ~ 'shoots' | 'leaves' | |
VBZ ~ 'eats' | 'shoots' | 'leaves' | |
period ~ '.' | |
whitespace ~ [\s]+ | |
END_OF_SOURCE | |
my $sentence = 'a panda eats shoots and leaves.'; | |
my $bracketed_expected = <<'END_OF_OUTPUT'; | |
(S (NP (DT a) (NN panda)) | |
(VP | |
(VP (VBZ eats) (NP (NNS shoots))) (CC and) | |
(VP (VBZ leaves))) | |
(. .)) | |
(S (NP (DT a) (NN panda)) | |
(VP | |
(VP (VBZ eats)) | |
(VP (VBZ shoots)) (CC and) | |
(VP (VBZ leaves))) | |
(. .)) | |
(S (NP (DT a) (NN panda)) | |
(VP (VBZ eats) (NP (NNS shoots) (CC and) (NNS leaves))) | |
(. .)) | |
END_OF_OUTPUT | |
# parse $sentence | |
my $grammar = Marpa::R2::Scanless::G->new( { source => \$dsl, } ); | |
my $recce = Marpa::R2::Scanless::R->new( { grammar => $grammar } ); | |
$recce->read( \$sentence ); | |
# get ASTs by enumerating values | |
my @expected_asts = (); | |
while ( defined( my $value_ref = $recce->value() ) ) { | |
my $value = $value_ref ? ${$value_ref} : 'No parse'; | |
push @expected_asts, $value; | |
} | |
is join("\n", sort map { bracket($_) } @expected_asts) . "\n", | |
$bracketed_expected, | |
"ASTs from value()"; | |
# get ASTs by traversing the ASF | |
my $asts = []; | |
$recce->series_restart(); # recognizer needs to be reset before building the ASF | |
my $asf = Marpa::R2::ASF->new( { slr => $recce } ); | |
$asts = $asf->traverse( {}, \&ast_builder ); | |
@$asts = map { unpack_token_nodes($_) } @$asts; | |
# ASTs from the ASF have been built, but their order differs from that | |
# of ASTs built by enumerating the parse values. Hence, we expect that | |
# 6 tests must fail, and 3 trees must be the same of a total of 9 | |
my $match = 0; | |
for my $i_got (0..@$asts-1){ | |
for my $i_expected (0..@expected_asts-1){ | |
$match++ if Dumper($asts->[$i_got]) eq Dumper($expected_asts[$i_expected]); | |
} | |
} | |
is $match, 3, "Ambiguous English phrase: ASTs from ASF"; | |
is join("\n", sort map { bracket($_) } @$asts) . "\n", | |
$bracketed_expected, | |
"Ambiguous English phrase: bracketed ASTs from ASF"; | |
# traverser to build ASTs from the ASF | |
# todo: add callbacks for node construction, | |
# e.g. for pruning by setting node to undef | |
my ($token_head, $token_head_length, $token_template); | |
BEGIN { | |
$token_head = "__TOKEN_HEAD"; | |
$token_head_length = length($token_head); | |
$token_template = "C/A*C/A*"; # 2 length-prefixed strings: token’s name and value | |
} | |
sub ast_builder { | |
# This routine converts the glade into a list of elements. It is called recursively. | |
my ($glade, $scratch) = @_; | |
my $rule_id = $glade->rule_id(); | |
my $symbol_id = $glade->symbol_id(); | |
my $symbol_name = $grammar->symbol_name($symbol_id); | |
# A token is a single choice, and we know enough to return the node | |
if ( not defined $rule_id ) { | |
my $literal = $glade->literal(); | |
# A token node need to be a single value to avoid confusing all_choices() | |
# so we pack them to unpack later | |
return [ $token_head . pack($token_template, $symbol_name, $literal) ]; | |
} ## end if ( not defined $rule_id ) | |
# Our result will be a list of choices | |
my @return_value = (); | |
CHOICE: while (1) { | |
# The results at each position are a list of choices, so | |
# to produce a new result list, we need to take a Cartesian | |
# product of all the choices | |
my @results = $glade->all_choices(); | |
# Special case for the start rule: return all ASTs as array refs | |
if ( $symbol_name eq '[:start]' ) { | |
my $asts = [ map { @{$_} } @results ]; | |
# say "# ASTs:\n", Dumper $asts; | |
return $asts; | |
} | |
# Now we have a list of choices, as a list of lists. | |
# Each sub list is a list of elements, which we need | |
# to join into a single element. The result will be | |
# to collapse one level of lists, and leave us | |
# with a list of elements | |
push @return_value, map { | |
# new AST node construction | |
# to check for root $symbol_name eq start symbol of the grammar | |
my $node = [ $symbol_name, @{$_} ]; | |
# say "# node:\n", Dumper $node; | |
# $symbol_name eq 'NP' ? $node : undef; | |
} @results; | |
# Look at the next alternative in this glade, or end the | |
# loop if there is none | |
last CHOICE if not defined $glade->next(); | |
} ## end CHOICE: while (1) | |
# Return the list of elements for this glade | |
return \@return_value; | |
} ## end sub ast_builder | |
sub unpack_token_nodes{ | |
my $ast = shift; | |
if ( ref $ast eq "ARRAY" ){ | |
for my $i (0..@$ast-1){ | |
my $child = $ast->[$i]; | |
# test if a child is a token node | |
if (ref $child ne "ARRAY" and index ($child, $token_head) == 0 ){ | |
$child = substr $child, $token_head_length; | |
$ast->[$i] = [ unpack($token_template, $child) ]; | |
} | |
} | |
map { unpack_token_nodes($_) } @$ast; | |
} | |
return $ast; | |
} | |
# stringify AST nodes as (tag text) per Penn Treebank | |
# structural tags -- need a newline | |
my %s_tags; | |
BEGIN { %s_tags = map { $_ => undef } qw{ VP period } } | |
sub bracket { | |
my ($ast) = @_; | |
my ($tag, @contents) = @{$ast}; | |
state $level++; | |
my $bracketed = exists $s_tags{$tag} ? ("\n" . (" " x ($level-1))) : ''; | |
$tag = '.' if $tag eq 'period'; | |
if (ref $contents[0] eq "ARRAY"){ | |
$bracketed .= "($tag " . join(' ', map { bracket($_) } @contents) . ")"; | |
} | |
else { | |
$bracketed .= "($tag $contents[0])"; | |
} | |
$level--; | |
$bracketed =~ s/ \n/\n/gs; | |
return $bracketed; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment