Created
July 18, 2012 04:07
-
-
Save dcluna/3134101 to your computer and use it in GitHub Desktop.
Regex parser in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| "Recursive-descent parser for Regexes, based on: http://matt.might.net/articles/parsing-regex-with-recursive-descent/ \ | |
| Test project for Emacs + Python" | |
| class Parser(): | |
| "Recursive descent regex parser" | |
| def __init__(self, regex): | |
| "Initializes the parser with the regex string" | |
| self.input = regex | |
| self.current = 0 | |
| # Primitives | |
| def __peek__(self): | |
| "Peeks at next input" | |
| return self.input[self.current] | |
| def __consume__(self, input): | |
| "Consumes token if input is accepted" | |
| if self.__peek__() == input: | |
| self.current += 1 | |
| else: | |
| raise ParserException('Expected \'' + self.__peek__() + '\', got \'' + input + '\'') | |
| def __next__(self): | |
| "Consumes token and returns it if successful" | |
| token = self.__peek__() | |
| self.__consume__(token) | |
| return token | |
| # Non-default primitive | |
| def __more__(self): | |
| "Checks if there is more input" | |
| return self.current != self.input.__len__() | |
| # Grammar: | |
| # <regex> ::= <term> '|' <regex> | |
| # | <term> | |
| # <term> ::= { <factor> } | |
| # <factor> ::= <base> { '*' } | |
| # <base> ::= <char> | |
| # | '\' <char> | |
| # | '(' <regex> ')' | |
| # The implementation is straightforward: | |
| def regex(self): | |
| "<regex> ::= <term> '|' <regex> \ | |
| | <term>" | |
| term = self.term() # parses a term | |
| if self.__more__() and self.__peek__() == '|': # if term is followed by '|' | |
| self.__consume__('|') | |
| regex = self.regex() | |
| choice = Choice(term,regex) | |
| return choice | |
| else: | |
| return term | |
| def term(self): | |
| "<term> ::= { <factor> }" | |
| factor = [] # begins empty | |
| while self.__more__() and self.__peek__() != '|' and self.__peek__() != ')': # while there's still input and it's not a regex delimiter | |
| factor += [self.factor()] | |
| return factor | |
| def factor(self): | |
| "<factor> ::= <base> { '*' }" | |
| base = self.base() | |
| if self.__more__() and self.__peek__() == '*': | |
| self.__consume__('*') | |
| return Repetition(base) | |
| else: | |
| return base | |
| def base(self): | |
| "<base> ::= <char> \ | |
| | '\' <char> \ | |
| | '(' <regex> ')'" | |
| if self.__peek__() == '\\': | |
| self.__consume__('\\') | |
| return '\\' + self.consumechar() | |
| elif self.__peek__() == '(': | |
| self.__consume__('(') | |
| regex = self.regex() | |
| self.__consume__(')') | |
| return regex | |
| else: | |
| return self.consumechar() | |
| def consumechar(self): | |
| "Expects an alphanumeric character" | |
| c = self.__next__() | |
| if not c.isalnum(): | |
| raise ParserException('Expected alphanumeric, got ' + c + ' at ' + self.current) | |
| else: | |
| return c | |
| # def parse(self): | |
| # acc = [] | |
| # while self.__more__(): | |
| # acc += [self.regex()] | |
| # return acc | |
| class Regex(): | |
| "Abstract class to represent a regex" | |
| class Choice(Regex): | |
| "Choice between a term and another regex" | |
| def __init__(self, regex1, regex2): | |
| "Holds 2 regexes" | |
| self.regex1 = regex1 | |
| self.regex2 = regex2 | |
| class Repetition(Regex): | |
| "Repetition between regex" | |
| def __init__(self, repetition): | |
| "Holds repetition" | |
| self.repetition = repetition | |
| class ParserException(Exception): | |
| "Thrown when a parser error occurs" | |
| def __init__(self, message): | |
| "Prints error message" | |
| print message |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment