Skip to content

Instantly share code, notes, and snippets.

@jelmervdl
Created August 23, 2017 11:49
Show Gist options
  • Save jelmervdl/eb99eb57c78017d46ed7d88c76bbcfae to your computer and use it in GitHub Desktop.
Save jelmervdl/eb99eb57c78017d46ed7d88c76bbcfae to your computer and use it in GitHub Desktop.
Small (and inefficient) parser in Python that can deal with ambiguity
from pprint import pprint
class l(object):
__slots__ = ('word',)
def __init__(self, word):
self.word = word
def __repr__(self):
return 'l({})'.format(self.word)
def test(self, word):
return self.word == word
def is_literal(obj):
return isinstance(obj, l)
rules = [
('extended_claims', ['extended_claim']),
('extended_claims', ['extended_claim', l('and'), 'extended_claims']),
('extended_claim', ['claim', 'supports', 'attacks']),
('claim', [l('birds'), l('can'), l('fly')]),
('claim', [l('Tweety'), l('can'), l('fly')]),
('claim', [l('Tweety'), l('has'), l('wings')]),
('claim', [l('Tweety'), l('is'), l('a'), l('bird')]),
('claim', [l('Tweety'), l('is'), l('a'), l('penguin')]),
('supports', []),
('supports', ['support']),
('supports', ['support', l('and'), 'supports']),
('support', [l('because'), 'extended_claims']),
('attacks', []),
('attacks', ['attack']),
('attacks', ['attack', l('and'), 'attacks']),
('attack', [l('but'), 'extended_claims'])
]
sentence = 'Tweety can fly because Tweety is a bird and birds can fly but Tweety is a penguin'
words = sentence.split(' ')
def find_rules(name):
found = False
for rule in rules:
if rule[0] == name:
found = True
yield rule[1]
if not found:
raise Error('No rules for {}' .format(name))
def parse(rule, words):
for acc, rem_words in _parse(rule[1], words):
if len(rem_words) == 0:
yield (rule[0], acc)
def _parse(rule, words):
if len(rule) == 0:
yield [], words
elif is_literal(rule[0]):
if len(words) == 0 or not rule[0].test(words[0]):
return
else:
for subacc, words_rem in _parse(rule[1:], words[1:]):
yield [l(words[0])] + subacc, words_rem
else:
for subrule in find_rules(rule[0]):
for subacc, words_rem in _parse(subrule, words):
for recacc, rec_words_rem in _parse(rule[1:], words_rem):
yield [(rule[0], subacc)] + recacc, rec_words_rem
for n, parsed in enumerate(parse(rules[0], words)):
print("Parse {}:".format(n))
pprint(parsed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment