Created
August 23, 2017 11:49
-
-
Save jelmervdl/eb99eb57c78017d46ed7d88c76bbcfae to your computer and use it in GitHub Desktop.
Small (and inefficient) parser in Python that can deal with ambiguity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pprint import pprint | |
class l(object): | |
__slots__ = ('word',) | |
def __init__(self, word): | |
self.word = word | |
def __repr__(self): | |
return 'l({})'.format(self.word) | |
def test(self, word): | |
return self.word == word | |
def is_literal(obj): | |
return isinstance(obj, l) | |
rules = [ | |
('extended_claims', ['extended_claim']), | |
('extended_claims', ['extended_claim', l('and'), 'extended_claims']), | |
('extended_claim', ['claim', 'supports', 'attacks']), | |
('claim', [l('birds'), l('can'), l('fly')]), | |
('claim', [l('Tweety'), l('can'), l('fly')]), | |
('claim', [l('Tweety'), l('has'), l('wings')]), | |
('claim', [l('Tweety'), l('is'), l('a'), l('bird')]), | |
('claim', [l('Tweety'), l('is'), l('a'), l('penguin')]), | |
('supports', []), | |
('supports', ['support']), | |
('supports', ['support', l('and'), 'supports']), | |
('support', [l('because'), 'extended_claims']), | |
('attacks', []), | |
('attacks', ['attack']), | |
('attacks', ['attack', l('and'), 'attacks']), | |
('attack', [l('but'), 'extended_claims']) | |
] | |
sentence = 'Tweety can fly because Tweety is a bird and birds can fly but Tweety is a penguin' | |
words = sentence.split(' ') | |
def find_rules(name): | |
found = False | |
for rule in rules: | |
if rule[0] == name: | |
found = True | |
yield rule[1] | |
if not found: | |
raise Error('No rules for {}' .format(name)) | |
def parse(rule, words): | |
for acc, rem_words in _parse(rule[1], words): | |
if len(rem_words) == 0: | |
yield (rule[0], acc) | |
def _parse(rule, words): | |
if len(rule) == 0: | |
yield [], words | |
elif is_literal(rule[0]): | |
if len(words) == 0 or not rule[0].test(words[0]): | |
return | |
else: | |
for subacc, words_rem in _parse(rule[1:], words[1:]): | |
yield [l(words[0])] + subacc, words_rem | |
else: | |
for subrule in find_rules(rule[0]): | |
for subacc, words_rem in _parse(subrule, words): | |
for recacc, rec_words_rem in _parse(rule[1:], words_rem): | |
yield [(rule[0], subacc)] + recacc, rec_words_rem | |
for n, parsed in enumerate(parse(rules[0], words)): | |
print("Parse {}:".format(n)) | |
pprint(parsed) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment