mxbi · May 22, 2021 22:05
diff --git a/earley.py b/earley.py
 ######### SETTING UP GRAMMAR

 rules_list = [("S", "NP", "VP"), 
         ("NP", "N", "PP"),
         ("NP", "N"),
         ("PP", "P", "NP"),
         
         ("VP", "VP", "PP"),
         ("VP", "V", "VP"),
         ("VP", "V", "NP"),
         ("VP", "V"),
        ]

 from collections import defaultdict

 rules = defaultdict(list)
 for rule in rules_list:
    rules[rule[0]].append(rule[1:])
    
 word_rules = defaultdict(list)
 word_rules['N'] = [(x, ) for x in ['they', 'can', 'fish', 'rivers', 'december']]
 word_rules['P'] = [(x, ) for x in ['in']]
 word_rules['V'] = [(x, ) for x in ['can', 'fish']]

 ########### PARSER STARTS HERE

 class Derivation:
    def __init__(self, root, rule, ptr, start, end, hist):
        self.root = root
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.end = end
        self.hist = hist
        
    def __repr__(self):
        rule2 = list(self.rule)
        rule2.insert(self.ptr, '⬛')
        return f"{self.root.rjust(2)} -> {' '.join(rule2)}".ljust(15) + " [{}, {}] {}".format(self.start, self.end, self.hist)

    def __members(self):
        return (self.root, self.rule, self.ptr, self.start, self.end, self.hist)

    def __eq__(self, other):
        if type(other) is type(self):
            return self.__members() == other.__members()
        else:
            return False

    def __hash__(self):
        return hash(self.__members())

 def predict(chart):
    new_derivs = []
    
    for deriv in chart:
        if deriv.ptr < len(deriv.rule): # incomplete derivation
            first_token = deriv.rule[deriv.ptr]

            for rule in rules[first_token]:
                n = Derivation(first_token, rule, 0, deriv.end, deriv.end, [])
                new_derivs.append(n)

    return chart + new_derivs
  
 def scan(chart, tokens):
    new_derivs = []
    
    for deriv in chart:
        if deriv.ptr < len(deriv.rule):
            first_token = deriv.rule[deriv.ptr]

            if first_token in word_rules and deriv.end < len(tokens): # we can potentially parse a word here
                if (tokens[deriv.end], ) in word_rules[first_token]:
                    n = Derivation(first_token, (tokens[deriv.end], ), 1, deriv.end, deriv.end+1, [])
                    new_derivs.append(n)
                
    return chart + new_derivs

 def complete(chart):
    completions = []
    
    for d1 in chart:
        for i2, d2 in enumerate(chart):
            
            # check if d2 completes d1
            # also need to check that d2 itself is complete
            if d1.ptr < len(d1.rule) and d2.root == d1.rule[d1.ptr] and d2.ptr == len(d2.rule) and d2.start == d1.end:
                n = Derivation(d1.root, d1.rule, d1.ptr+1, d1.start, d2.end, d1.hist + [i2])
                completions.append(n)

    return chart + completions

 def normalize(chart):
    new_chart = []
    for d in chart:
        if d not in new_chart:
            new_chart.append(d)
    return new_chart

 def tokenize(string):
    return string.lower().split(" ")

 def pprint(chart):
    for i, d in enumerate(chart):
        print(f"{i} {d}")

 def execute(string):
    tokens = tokenize(string)
    
    # Start
    chart = []
    chart.append(Derivation('S', rules['S'][0], 0, 0, 0, []))
    
    # TODO: fix hardcoded number of iterations
    for i in range(10):
        print('\n\n###### BEGIN CYCLE ######')
        pprint(chart)
        print('PREDICT')
        chart = normalize(predict(chart))
        pprint(chart)

        print('SCAN')
        chart = normalize(scan(chart, tokens))
        pprint(chart)

        print('COMPLETE')
        while True:
            new_chart = normalize(complete(chart))
            if len(new_chart) == len(chart):
                break
            chart = new_chart
        pprint(chart)
    
    num_parses = 0
    for d in chart:
        if d.root == 'S' and d.end == len(tokens):
            num_parses += 1
    print(num_parses)
	######### SETTING UP GRAMMAR

	rules_list = [("S", "NP", "VP"),
	("NP", "N", "PP"),
	("NP", "N"),
	("PP", "P", "NP"),

	("VP", "VP", "PP"),
	("VP", "V", "VP"),
	("VP", "V", "NP"),
	("VP", "V"),
	]

	from collections import defaultdict

	rules = defaultdict(list)
	for rule in rules_list:
	rules[rule[0]].append(rule[1:])

	word_rules = defaultdict(list)
	word_rules['N'] = [(x, ) for x in ['they', 'can', 'fish', 'rivers', 'december']]
	word_rules['P'] = [(x, ) for x in ['in']]
	word_rules['V'] = [(x, ) for x in ['can', 'fish']]

	########### PARSER STARTS HERE

	class Derivation:
	def __init__(self, root, rule, ptr, start, end, hist):
	self.root = root
	self.rule = rule
	self.ptr = ptr
	self.start = start
	self.end = end
	self.hist = hist

	def __repr__(self):
	rule2 = list(self.rule)
	rule2.insert(self.ptr, '⬛')
	return f"{self.root.rjust(2)} -> {' '.join(rule2)}".ljust(15) + " [{}, {}] {}".format(self.start, self.end, self.hist)

	def __members(self):
	return (self.root, self.rule, self.ptr, self.start, self.end, self.hist)

	def __eq__(self, other):
	if type(other) is type(self):
	return self.__members() == other.__members()
	else:
	return False

	def __hash__(self):
	return hash(self.__members())

	def predict(chart):
	new_derivs = []

	for deriv in chart:
	if deriv.ptr < len(deriv.rule): # incomplete derivation
	first_token = deriv.rule[deriv.ptr]

	for rule in rules[first_token]:
	n = Derivation(first_token, rule, 0, deriv.end, deriv.end, [])
	new_derivs.append(n)

	return chart + new_derivs

	def scan(chart, tokens):
	new_derivs = []

	for deriv in chart:
	if deriv.ptr < len(deriv.rule):
	first_token = deriv.rule[deriv.ptr]

	if first_token in word_rules and deriv.end < len(tokens): # we can potentially parse a word here
	if (tokens[deriv.end], ) in word_rules[first_token]:
	n = Derivation(first_token, (tokens[deriv.end], ), 1, deriv.end, deriv.end+1, [])
	new_derivs.append(n)

	return chart + new_derivs

	def complete(chart):
	completions = []

	for d1 in chart:
	for i2, d2 in enumerate(chart):

	# check if d2 completes d1
	# also need to check that d2 itself is complete
	if d1.ptr < len(d1.rule) and d2.root == d1.rule[d1.ptr] and d2.ptr == len(d2.rule) and d2.start == d1.end:
	n = Derivation(d1.root, d1.rule, d1.ptr+1, d1.start, d2.end, d1.hist + [i2])
	completions.append(n)

	return chart + completions

	def normalize(chart):
	new_chart = []
	for d in chart:
	if d not in new_chart:
	new_chart.append(d)
	return new_chart

	def tokenize(string):
	return string.lower().split(" ")

	def pprint(chart):
	for i, d in enumerate(chart):
	print(f"{i} {d}")

	def execute(string):
	tokens = tokenize(string)

	# Start
	chart = []
	chart.append(Derivation('S', rules['S'][0], 0, 0, 0, []))

	# TODO: fix hardcoded number of iterations
	for i in range(10):
	print('\n\n###### BEGIN CYCLE ######')
	pprint(chart)
	print('PREDICT')
	chart = normalize(predict(chart))
	pprint(chart)

	print('SCAN')
	chart = normalize(scan(chart, tokens))
	pprint(chart)

	print('COMPLETE')
	while True:
	new_chart = normalize(complete(chart))
	if len(new_chart) == len(chart):
	break
	chart = new_chart
	pprint(chart)

	num_parses = 0
	for d in chart:
	if d.root == 'S' and d.end == len(tokens):
	num_parses += 1
	print(num_parses)
No results found