Simple recursive descent (?) parser with a tiny tokenizer function in Python 3
#!/usr/bin/env python3
import re

token_pattern = \
    [
        [r'[+]', 'ADD'],
        [r'[-]', 'SUB'],
        [r'[+-]?[0-9]*\.[0-9]+(e[+-]?[0-9]+)?', 'FLOAT'],
        [r'[+-]?[0-9]+', 'INT'],
        [r'[*]', 'MUL'],
        [r'[/]', 'DIV'],
        [r'[a-zA-Z_][a-zA-Z0-9_]*', 'NAME'],
        [r'[=]', 'DEF'],
        [r'[(]', 'PATH_L'],
        [r'[)]', 'PATH_R'],
        [r'[\n\s\t]', ''],  # Ignored
    ]
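# Note on ordering: patterns are tried top-down and the first match wins, so a
# leading sign is consumed as ADD/SUB before FLOAT/INT can claim it,
# e.g. tokenize("-2.0", token_pattern) yields [['-', 'SUB'], ['2.0', 'FLOAT']].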
grammar_table = \
    [
        ['START', 'NAME', 'DEF', 'EXP'],  # START -> NAME 'DEF' EXP
        ['START', 'EXP'],                 # START -> EXP
        ['EXP', 'A', ['ADD', 'SUB'], 'EXP'],
        ['EXP', 'A'],
        ['A', 'B', ['MUL', 'DIV'], 'A'],
        ['A', 'B'],
        ['B', 'PATH_L', 'C'],
        ['C', 'EXP', 'PATH_R'],
        ['B', 'D'],
        ['D', 'INT'],
        ['D', 'FLOAT'],
        ['D', 'NAME'],
        ['INT', 'TERM'],  # TERM = terminal
        ['FLOAT', 'TERM'],
        ['NAME', 'TERM'],
        ['DIV', 'TERM'],
        ['ADD', 'TERM'],
        ['SUB', 'TERM'],
        ['MUL', 'TERM'],
        ['DEF', 'TERM'],
        ['PATH_L', 'TERM'],
        ['PATH_R', 'TERM'],
    ]
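# How to read a rule: the first element is the left-hand side, the rest is the
# right-hand side. A nested list such as ['ADD', 'SUB'] means "either of these
# tokens", and a rule of the form ['X', 'TERM'] marks X as a terminal symbol.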
input_var = "abcde = (-e.2*-2.0e-9+2 / 12 * - 1.00)"

def tokenize(input_var, token_pattern):
    token_result = []
    while input_var:
        for i in range(len(token_pattern)):
            token_item = token_pattern[i]
            item_re_pattern = re.compile(r"^(" + token_item[0] + r")")
            item_re_search = item_re_pattern.search(input_var)
            if item_re_search:
                matched_variable = item_re_search.group(0)
            else:
                matched_variable = None
            if matched_variable:
                input_var = input_var[len(matched_variable):]
                # keep the token unless its name is '' (ignored, e.g. whitespace)
                if token_item[1] != '':
                    token_result.append([matched_variable, token_item[1]])
                break
            else:
                if i == len(token_pattern) - 1:
                    try:
                        raise Exception(input_var[0])
                    except Exception as ex:
                        print("Error:", ex.args[0],
                              "is not in the token pattern table")
                        raise
    return token_result
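# Example (a small sketch of the tokenizer's output):
#   tokenize("1+2", token_pattern)  ->  [['1', 'INT'], ['+', 'ADD'], ['2', 'INT']]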

def detokenize(token_result):
    result_string = ""
    for i in token_result:
        result_string = result_string + i[0]
    return result_string
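# detokenize() simply concatenates the matched substrings, so whitespace that
# the tokenizer ignored is dropped, e.g.
#   detokenize(tokenize("1 + 2", token_pattern))  ->  "1+2"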

def select_grammar_rule(grammar_table, situation):
    selected_grammar_rule = []
    for i in grammar_table:
        if i[0] == situation:
            selected_grammar_rule.append(i)
    return selected_grammar_rule
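# Example: select_grammar_rule(grammar_table, 'D')
#   -> [['D', 'INT'], ['D', 'FLOAT'], ['D', 'NAME']]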

def parsing_iter(grammar_table, tokenized, situation):
    selected_grammar_rule = select_grammar_rule(grammar_table, situation)
    for i, rule in enumerate(selected_grammar_rule):
        # infix rule: LHS -> left op right; split on a top-level operator
        if len(rule) == 4 and len(tokenized) >= 3:
            for j in range(len(tokenized)):
                if tokenized[j][1] == rule[2] or tokenized[j][1] in rule[2]:
                    '''split at this operator only if the numbers of ( and )
                       before it are equal, i.e. it is not inside parentheses.'''
                    token_name_before = [tok[1] for tok in tokenized[:j]]
                    path_l_amount = token_name_before.count("PATH_L")
                    path_r_amount = token_name_before.count("PATH_R")
                    if path_r_amount == path_l_amount:
                        return_tree = [tokenized[j],
                                       parsing_iter(grammar_table, tokenized[0:j], rule[1]),
                                       parsing_iter(grammar_table, tokenized[j+1:], rule[3])
                                       ]
                        return return_tree
                    else:
                        continue
                else:
                    continue
        if len(rule) == 3 and len(tokenized) >= 2:
            # Polish (prefix) notation: the first token matches the first RHS symbol
            if tokenized[0][1] == rule[1]:
                return_tree = [tokenized[0:1],
                               parsing_iter(grammar_table, tokenized[1:], rule[2]),
                               ]
                return return_tree
            # reverse Polish (postfix) notation: the last token matches the last RHS symbol
            elif tokenized[-1][1] == rule[-1]:
                return_tree = [parsing_iter(grammar_table, tokenized[:-1], rule[1]),
                               tokenized[-1],
                               ]
                return return_tree
            else:
                continue
        if len(rule) == 2:
            if rule[1] == "TERM" and len(tokenized) == 1:
                if rule[0] == "FLOAT":
                    return [float(tokenized[0][0]), tokenized[0][1]]
                elif rule[0] == "INT":
                    return [int(tokenized[0][0]), tokenized[0][1]]
                else:
                    return tokenized
            elif tokenized[0][1] == rule[1]:
                return_tree = parsing_iter(grammar_table, tokenized[:], rule[1])
                return return_tree
            elif i == len(selected_grammar_rule) - 1:
                return parsing_iter(grammar_table, tokenized, rule[1])
            else:
                continue

def parsing(grammar_table, tokenized):
    return parsing_iter(grammar_table, tokenized, "START")
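# Example (traced by hand from the rules above):
#   parsing(grammar_table, tokenize("1+2", token_pattern))
#   -> [['+', 'ADD'], [1, 'INT'], [2, 'INT']]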

'''
print(tokenize(input_var, token_pattern)) returns:
[['abcde', 'NAME'], ['=', 'DEF'], ['(', 'PATH_L'], ['-', 'SUB'], ['e', 'NAME'],
 ['.2', 'FLOAT'], ['*', 'MUL'], ['-', 'SUB'], ['2.0e-9', 'FLOAT'], ['+', 'ADD'],
 ['2', 'INT'], ['/', 'DIV'], ['12', 'INT'], ['*', 'MUL'], ['-', 'SUB'],
 ['1.00', 'FLOAT'], [')', 'PATH_R']]
'''
#print(tokenize(input_var, token_pattern))

'''print(detokenize(tokenize(input_var, token_pattern))) returns the input
string with the ignored whitespace stripped:
abcde=(-e.2*-2.0e-9+2/12*-1.00)'''
#print(detokenize(tokenize(input_var, token_pattern)))

'''print token_pattern'''
print("token_pattern")
print("Pattern\tToken")
print("------------------------------")
for i in token_pattern:
    is_first_item = True
    print_line = ""
    for j in i:
        if is_first_item:
            print_line += j
            is_first_item = False
        else:
            print_line += ('\t' + j)
    print(print_line)

'''print grammar_table'''
print("\ngrammar_table")
print("LHS\t->\tRHS")
print("------------------------------")
for i in grammar_table:
    is_first_item = True
    print_line = ""
    for j in i:
        if is_first_item:
            print_line += (j + '\t->')
            is_first_item = False
        else:
            if type(j) is list:
                print_line += ("\t(" + " | ".join(j) + ")")
            else:
                print_line += ('\t' + j)
    print(print_line)

while True:
    raw_string = input("\n\ninput the string to be parsed>> ")
    """
    (1+2)*3/(405) -> [['(', 'PATH_L'], ['1', 'INT'], ['+', 'ADD'],
    ['2', 'INT'], [')', 'PATH_R'], ['*', 'MUL'], ['3', 'INT'], ['/', 'DIV'],
    ['(', 'PATH_L'], ['405', 'INT'], [')', 'PATH_R']]
    """
    tokenized_list = tokenize(raw_string, token_pattern)
    """
    [['(', 'PATH_L'], ['1', 'INT'], ['+', 'ADD'],
    ['2', 'INT'], [')', 'PATH_R'], ['*', 'MUL'], ['3', 'INT'], ['/', 'DIV'],
    ['(', 'PATH_L'], ['405', 'INT'], [')', 'PATH_R']] ->
    [['*', 'MUL'], [[['(', 'PATH_L']], [[['+', 'ADD'], [1, 'INT'], [2, 'INT']],
    [')', 'PATH_R']]], [['/', 'DIV'], [3, 'INT'], [[['(', 'PATH_L']],
    [[405, 'INT'], [')', 'PATH_R']]]]]
    """
    parsed_result = parsing(grammar_table, tokenized_list)
    print("Tokenized List:\n", tokenized_list)
    print("\nParsed Tree:\n", parsed_result)