Skip to content

Instantly share code, notes, and snippets.

@rns
Created March 9, 2016 07:10
Show Gist options
  • Save rns/3e8aec88afc988b6cbb5 to your computer and use it in GitHub Desktop.
Save rns/3e8aec88afc988b6cbb5 to your computer and use it in GitHub Desktop.
template code to build ASTs from ASF per https://github.com/jeffreykegler/Marpa--R2/issues/266
use 5.010;
use strict;
use warnings;
use Test::More tests => 3;
use Marpa::R2;
use Data::Dumper;
$Data::Dumper::Indent = 0;
$Data::Dumper::Terse = 1;
$Data::Dumper::Deepcopy = 1;
my $dsl = <<'END_OF_SOURCE';
:default ::= action => [ name, values ]
lexeme default = action => [ name, value ]
:discard ~ whitespace
S ::= NP VP period
NP ::= NN
| NNS
| DT NN
| NN NNS
| NNS CC NNS
VP ::= VBZ NP
| VP VBZ NNS
| VP CC VP
| VP VP CC VP
| VBZ
CC ~ 'and'
DT ~ 'a' | 'an'
NN ~ 'panda'
NNS ~ 'shoots' | 'leaves'
VBZ ~ 'eats' | 'shoots' | 'leaves'
period ~ '.'
whitespace ~ [\s]+
END_OF_SOURCE
my $sentence = 'a panda eats shoots and leaves.';
my $bracketed_expected = <<'END_OF_OUTPUT';
(S (NP (DT a) (NN panda))
(VP
(VP (VBZ eats) (NP (NNS shoots))) (CC and)
(VP (VBZ leaves)))
(. .))
(S (NP (DT a) (NN panda))
(VP
(VP (VBZ eats))
(VP (VBZ shoots)) (CC and)
(VP (VBZ leaves)))
(. .))
(S (NP (DT a) (NN panda))
(VP (VBZ eats) (NP (NNS shoots) (CC and) (NNS leaves)))
(. .))
END_OF_OUTPUT
# parse $sentence
my $grammar = Marpa::R2::Scanless::G->new( { source => \$dsl, } );
my $recce = Marpa::R2::Scanless::R->new( { grammar => $grammar } );
$recce->read( \$sentence );
# get ASTs by enumerating values
my @expected_asts = ();
while ( defined( my $value_ref = $recce->value() ) ) {
my $value = $value_ref ? ${$value_ref} : 'No parse';
push @expected_asts, $value;
}
is join("\n", sort map { bracket($_) } @expected_asts) . "\n",
$bracketed_expected,
"ASTs from value()";
# get ASTs by traversing the ASF
my $asts = [];
$recce->series_restart(); # recognizer needs to be reset before building the ASF
my $asf = Marpa::R2::ASF->new( { slr => $recce } );
$asts = $asf->traverse( {}, \&ast_builder );
@$asts = map { unpack_token_nodes($_) } @$asts;
# ASTs from the ASF have been built, but their order differs from that
# of ASTs built by enumerating the parse values. Hence, we expect that
# 6 tests must fail, and 3 trees must be the same of a total of 9
my $match = 0;
for my $i_got (0..@$asts-1){
for my $i_expected (0..@expected_asts-1){
$match++ if Dumper($asts->[$i_got]) eq Dumper($expected_asts[$i_expected]);
}
}
is $match, 3, "Ambiguous English phrase: ASTs from ASF";
is join("\n", sort map { bracket($_) } @$asts) . "\n",
$bracketed_expected,
"Ambiguous English phrase: bracketed ASTs from ASF";
# traverser to build ASTs from the ASF
# todo: add callbacks for node construction,
# e.g. for pruning by setting node to undef
my ($token_head, $token_head_length, $token_template);
BEGIN {
$token_head = "__TOKEN_HEAD";
$token_head_length = length($token_head);
$token_template = "C/A*C/A*"; # 2 length-prefixed strings: token’s name and value
}
sub ast_builder {
# This routine converts the glade into a list of elements. It is called recursively.
my ($glade, $scratch) = @_;
my $rule_id = $glade->rule_id();
my $symbol_id = $glade->symbol_id();
my $symbol_name = $grammar->symbol_name($symbol_id);
# A token is a single choice, and we know enough to return the node
if ( not defined $rule_id ) {
my $literal = $glade->literal();
# A token node need to be a single value to avoid confusing all_choices()
# so we pack them to unpack later
return [ $token_head . pack($token_template, $symbol_name, $literal) ];
} ## end if ( not defined $rule_id )
# Our result will be a list of choices
my @return_value = ();
CHOICE: while (1) {
# The results at each position are a list of choices, so
# to produce a new result list, we need to take a Cartesian
# product of all the choices
my @results = $glade->all_choices();
# Special case for the start rule: return all ASTs as array refs
if ( $symbol_name eq '[:start]' ) {
my $asts = [ map { @{$_} } @results ];
# say "# ASTs:\n", Dumper $asts;
return $asts;
}
# Now we have a list of choices, as a list of lists.
# Each sub list is a list of elements, which we need
# to join into a single element. The result will be
# to collapse one level of lists, and leave us
# with a list of elements
push @return_value, map {
# new AST node construction
# to check for root $symbol_name eq start symbol of the grammar
my $node = [ $symbol_name, @{$_} ];
# say "# node:\n", Dumper $node;
# $symbol_name eq 'NP' ? $node : undef;
} @results;
# Look at the next alternative in this glade, or end the
# loop if there is none
last CHOICE if not defined $glade->next();
} ## end CHOICE: while (1)
# Return the list of elements for this glade
return \@return_value;
} ## end sub ast_builder
sub unpack_token_nodes{
my $ast = shift;
if ( ref $ast eq "ARRAY" ){
for my $i (0..@$ast-1){
my $child = $ast->[$i];
# test if a child is a token node
if (ref $child ne "ARRAY" and index ($child, $token_head) == 0 ){
$child = substr $child, $token_head_length;
$ast->[$i] = [ unpack($token_template, $child) ];
}
}
map { unpack_token_nodes($_) } @$ast;
}
return $ast;
}
# stringify AST nodes as (tag text) per Penn Treebank
# structural tags -- need a newline
my %s_tags;
BEGIN { %s_tags = map { $_ => undef } qw{ VP period } }
sub bracket {
my ($ast) = @_;
my ($tag, @contents) = @{$ast};
state $level++;
my $bracketed = exists $s_tags{$tag} ? ("\n" . (" " x ($level-1))) : '';
$tag = '.' if $tag eq 'period';
if (ref $contents[0] eq "ARRAY"){
$bracketed .= "($tag " . join(' ', map { bracket($_) } @contents) . ")";
}
else {
$bracketed .= "($tag $contents[0])";
}
$level--;
$bracketed =~ s/ \n/\n/gs;
return $bracketed;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment