Skip to content

Instantly share code, notes, and snippets.

@dcluna
Created July 18, 2012 04:07
Show Gist options
  • Select an option

  • Save dcluna/3134101 to your computer and use it in GitHub Desktop.

Select an option

Save dcluna/3134101 to your computer and use it in GitHub Desktop.
Regex parser in python
"Recursive-descent parser for Regexes, based on: http://matt.might.net/articles/parsing-regex-with-recursive-descent/ \
Test project for Emacs + Python"
class Parser():
"Recursive descent regex parser"
def __init__(self, regex):
"Initializes the parser with the regex string"
self.input = regex
self.current = 0
# Primitives
def __peek__(self):
"Peeks at next input"
return self.input[self.current]
def __consume__(self, input):
"Consumes token if input is accepted"
if self.__peek__() == input:
self.current += 1
else:
raise ParserException('Expected \'' + self.__peek__() + '\', got \'' + input + '\'')
def __next__(self):
"Consumes token and returns it if successful"
token = self.__peek__()
self.__consume__(token)
return token
# Non-default primitive
def __more__(self):
"Checks if there is more input"
return self.current != self.input.__len__()
# Grammar:
# <regex> ::= <term> '|' <regex>
# | <term>
# <term> ::= { <factor> }
# <factor> ::= <base> { '*' }
# <base> ::= <char>
# | '\' <char>
# | '(' <regex> ')'
# The implementation is straightforward:
def regex(self):
"<regex> ::= <term> '|' <regex> \
| <term>"
term = self.term() # parses a term
if self.__more__() and self.__peek__() == '|': # if term is followed by '|'
self.__consume__('|')
regex = self.regex()
choice = Choice(term,regex)
return choice
else:
return term
def term(self):
"<term> ::= { <factor> }"
factor = [] # begins empty
while self.__more__() and self.__peek__() != '|' and self.__peek__() != ')': # while there's still input and it's not a regex delimiter
factor += [self.factor()]
return factor
def factor(self):
"<factor> ::= <base> { '*' }"
base = self.base()
if self.__more__() and self.__peek__() == '*':
self.__consume__('*')
return Repetition(base)
else:
return base
def base(self):
"<base> ::= <char> \
| '\' <char> \
| '(' <regex> ')'"
if self.__peek__() == '\\':
self.__consume__('\\')
return '\\' + self.consumechar()
elif self.__peek__() == '(':
self.__consume__('(')
regex = self.regex()
self.__consume__(')')
return regex
else:
return self.consumechar()
def consumechar(self):
"Expects an alphanumeric character"
c = self.__next__()
if not c.isalnum():
raise ParserException('Expected alphanumeric, got ' + c + ' at ' + self.current)
else:
return c
# def parse(self):
# acc = []
# while self.__more__():
# acc += [self.regex()]
# return acc
class Regex():
"Abstract class to represent a regex"
class Choice(Regex):
"Choice between a term and another regex"
def __init__(self, regex1, regex2):
"Holds 2 regexes"
self.regex1 = regex1
self.regex2 = regex2
class Repetition(Regex):
"Repetition between regex"
def __init__(self, repetition):
"Holds repetition"
self.repetition = repetition
class ParserException(Exception):
"Thrown when a parser error occurs"
def __init__(self, message):
"Prints error message"
print message
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment