Created
June 2, 2012 08:57
-
-
Save kaharlichenko/2857433 to your computer and use it in GitHub Desktop.
Parsing subset of regexp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# We will consider the following regular expressions: | |
# | |
# single characters # a | |
# regexp1 regexp2 # ab | |
# regexp * # a* | |
# regexp1 | regexp2 # a|b | |
# ( regexp ) # (a|b)* -- same as (?:a|b) | |
import ply.lex as lex | |
import ply.yacc as yacc | |
############### | |
# Lexer | |
############### | |
tokens = ('LETTER', 'STAR', 'BAR', 'LPAREN', 'RPAREN') | |
t_LETTER = r'[a-zA-Z0-9]' | |
t_STAR = r'\*' | |
t_BAR = r'\|' | |
t_LPAREN = r'\(' | |
t_RPAREN = r'\)' | |
def t_error(t): | |
# XXX Mute for now | |
#print 'ERROR: Lexing error' | |
pass | |
############## | |
# Parser | |
############## | |
# The grammar: | |
# GRP -> ALT | |
# GRP -> ( GRP ) | |
# ALT -> SEQ | |
# ALT -> ALT | SEQ | |
# SEQ -> REP | |
# SEQ -> SEQ REP | |
# REP -> letter | |
# REP -> | |
# REP -> GRP * | |
def p_grp_alt(p): | |
'grp : alt' | |
p[0] = p[1] | |
def p_grp(p): | |
'grp : LPAREN grp RPAREN' | |
p[0] = p[2] | |
def p_alt_seq(p): | |
'alt : seq' | |
p[0] = p[1] | |
def p_alt(p): | |
'alt : alt BAR seq' | |
if p[1][0] == 'any-of': | |
p[0] = ('any-of', p[1][1] + [p[3]]) | |
else: | |
p[0] = ('any-of', [p[1], p[3]]) | |
def p_seq_rep(p): | |
'seq : rep' | |
p[0] = p[1] | |
def p_seq(p): | |
'seq : seq rep' | |
if p[1] is None: | |
p[0] = p[2] | |
elif p[1][0] == 'sequence': | |
p[0] = ('sequence', p[1][1] + [p[2]]) | |
else: | |
p[0] = ('sequence', [p[1], p[2]]) | |
def p_rep_letter(p): | |
'rep : LETTER' | |
p[0] = ('letter', p[1]) | |
def p_rep_empty(p): | |
'rep : ' | |
p[0] = None | |
def p_rep_star(p): | |
'rep : grp STAR' | |
p[0] = ('zero-or-more', p[1]) | |
#def p_error(p): | |
## XXX Mute for now | |
#print 'ERROR: Parsing error' | |
#pass | |
lexer = lex.lex() | |
parser = yacc.yacc() | |
input4c = 'ab|c' | |
output4c = ('any-of', [('sequence', [('letter', 'a'), ('letter', 'b')]), ('letter', 'c')]) | |
print parser.parse(input4c, lexer=lexer) | |
#print parser.parse(input4c, lexer=lexer) == output4c |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment