Last active
March 13, 2019 06:08
-
-
Save seeeturtle/4e091bcfb7886751353322ea84ff2646 to your computer and use it in GitHub Desktop.
PLY for indented language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ply.lex as lex | |
from ply.lex import LexToken | |
import ply.yacc as yacc | |
# Token names for the grammar. INDENT and DEDENT are never matched by a
# regex rule below — they are synthesized by IndentLexer from leading
# whitespace, the same way CPython's tokenizer does it.
tokens = ('ID',
          'COLON',
          'WS',
          'NEWLINE',
          'INDENT',
          'DEDENT')
t_ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
t_COLON = r':'
# Any run of whitespace that is not a newline (i.e. spaces/tabs).
t_WS = r'[^\n\S]+'
# NOTE(review): a plain string rule cannot bump t.lexer.lineno, so every
# token reports lineno 1 (visible in the sample output below).
t_NEWLINE = r'\n'
# Build the base PLY lexer from the module-level rules above.
lexer = lex.lex()
class IndentLexer:
    """Wrap a PLY lexer and synthesize INDENT/DEDENT tokens.

    The wrapped lexer must emit WS and NEWLINE tokens.  WS tokens are
    consumed here (their width becomes the line's indent level) and never
    reach the parser.  The public interface matches what ply.yacc expects
    of a lexer: ``input(data)`` and ``token()``.
    """

    def __init__(self, lexer):
        self.lexer = lexer
        self.tok = None   # cached generator backing token()
        self.data = None  # unused; kept for interface compatibility

    def input(self, data):
        # Reset the cached token generator so the wrapper can be reused:
        # previously a second input() kept the exhausted generator and
        # token() returned None forever.
        self.tok = None
        self.lexer.input(data)

    def token(self):
        """PLY-compatible entry point: next token, or None at EOF."""
        if self.tok is None:
            self.tok = self._token()
        try:
            return next(self.tok)
        except StopIteration:
            return None

    def empty_tok(self):
        """Build a zero-width LexToken used as an INDENT/DEDENT marker."""
        tok = LexToken()
        (tok.type,
         tok.value,
         tok.lineno,
         tok.lexpos) = ('', '', 0, 0)
        return tok

    def logical_lines(self):
        """Yield (tokens, indent) for each non-blank logical line.

        ``tokens`` includes the trailing NEWLINE token; ``indent`` is the
        width of the line's leading WS token (0 if none).  A final
        ``('EOF', 0)`` sentinel is yielded after the input is exhausted.
        """
        for t in self.lexer:
            tokens = []
            indent = 0
            # Collect tokens up to and including the NEWLINE.  Guard
            # against input that does not end with a newline, where the
            # underlying lexer returns None before a NEWLINE is seen.
            while t is not None and t.type != 'NEWLINE':
                if t.type != 'WS':
                    tokens.append(t)
                elif not tokens:
                    # Only leading whitespace sets the indent level.
                    indent = len(t.value)
                t = self.lexer.token()
            if t is not None:
                tokens.append(t)
            if len(tokens) == 1 and tokens[0].type == 'NEWLINE':
                continue  # blank line: skip entirely
            if tokens:
                yield tokens, indent
        yield 'EOF', 0

    def __iter__(self):
        return self._token()

    def _token(self):
        """Generator producing the token stream with INDENT/DEDENT woven in."""
        indent_stack = [0]
        for tokens, indent in self.logical_lines():
            indent_tok = self.empty_tok()
            # On EOF, unwind back to the outermost level (indent=0).
            if tokens == 'EOF':
                while len(indent_stack) > 1:
                    indent_tok.type = 'DEDENT'
                    indent_stack.pop()
                    yield indent_tok
                break
            last_indent = indent_stack[-1]
            if last_indent < indent:
                # Deeper than before: open one new block.
                indent_stack.append(indent)
                indent_tok.type = 'INDENT'
                yield indent_tok
            elif last_indent > indent:
                # Shallower: emit one DEDENT per level popped.
                indent_tok.type = 'DEDENT'
                while indent_stack[-1] > indent:
                    indent_stack.pop()
                    yield indent_tok
                if indent_stack[-1] != indent:
                    raise IndentationError("unindent가 다른 어떤 바깥 인덴트 레벨과 맞지 않습니다.")
            # Emit the line's remaining tokens unchanged.
            yield from tokens
# Demo: tokenize a small indented program and print the token stream.
# The 4-space indentation inside the string is significant — it is what
# produces the INDENT/DEDENT tokens in the recorded output (lexpos 14
# for 'a' confirms the 4-space indent).
data = """
list of:
    a
    b
    c
"""
lexer = IndentLexer(lexer)
lexer.input(data)
for t in lexer:
    print(t)
def p_program(p):
    '''program : stmts'''
    # A program is just its statement list.
    p[0] = p[1]
def p_stmts(p):
    '''stmts : stmts stmt
             | stmt
             | NEWLINE'''
    # Three symbols means the left-recursive production: extend the
    # accumulated list.  Otherwise (stmt or NEWLINE) start a fresh one.
    if len(p) == 3:
        p[0] = p[1] + [p[2]]
    else:
        p[0] = [p[1]]
def p_stmt(p):
    '''stmt : simple_stmt NEWLINE
            | compound_stmt'''
    # Either way, the statement's value is its first symbol
    # (the NEWLINE terminator is discarded).
    p[0] = p[1]
def p_simple_stmt(p):
    '''simple_stmt : expr'''
    # Pass the expression value through unchanged.
    p[0] = p[1]
def p_expr(p):
    '''expr : id'''
    # An expression is (for now) just a bare identifier.
    p[0] = p[1]
def p_compound_stmt(p):
    '''compound_stmt : ID ID COLON suite'''
    # A header like "list of:" followed by an indented suite; the
    # statement's value is the suite's statement list (p[4]).
    p[0] = p[4]
def p_suite(p):
    '''suite : NEWLINE INDENT stmts DEDENT
             | simple_stmt NEWLINE '''
    # The block form (5 symbols) carries its statement list at position 3;
    # the inline form wraps the single simple statement in a list.
    p[0] = p[3] if len(p) == 5 else [p[1]]
def p_id(p):
    '''id : ID'''
    # The identifier's value is its token text.
    p[0] = p[1]
# Build the LALR parser from the p_* rules above.
parser = yacc.yacc()

# Demo: parse a nested indented program.  Indentation (4 spaces for the
# outer suite, 8 for the inner) is significant — it is what yields the
# recorded result [['a', 'b', 'c', ['d', 'e']]].
data = """
list of:
    a
    b
    c
    list of:
        d
        e
"""
res = parser.parse(data, lexer=lexer)
print(res)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
LexToken(ID,'list',1,1) | |
LexToken(ID,'of',1,6) | |
LexToken(COLON,':',1,8) | |
LexToken(NEWLINE,'\n',1,9) | |
LexToken(INDENT,'',0,0) | |
LexToken(ID,'a',1,14) | |
LexToken(NEWLINE,'\n',1,15) | |
LexToken(ID,'b',1,20) | |
LexToken(NEWLINE,'\n',1,21) | |
LexToken(ID,'c',1,26) | |
LexToken(NEWLINE,'\n',1,27) | |
LexToken(DEDENT,'',0,0) | |
[['a', 'b', 'c', ['d', 'e']]] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
파서도 추가!