Created
October 24, 2011 19:18
-
-
Save thepaul/1309885 to your computer and use it in GitHub Desktop.
safer version of python's re.Scanner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SaferScanner is just like re.Scanner, but it neuters any grouping in the lexicon
# regular expressions and throws an error on group references, named groups, or
# regex in-pattern flags. Any of those can break correct operation of Scanner.
import re | |
from sre_constants import BRANCH, SUBPATTERN, GROUPREF, GROUPREF_IGNORE, GROUPREF_EXISTS | |
class SaferScanner(re.Scanner):
    """A ``re.Scanner`` whose lexicon patterns cannot disturb group numbering.

    Each lexicon phrase is parsed on its own, every group inside it is made
    non-capturing, and the phrases are then joined into a single alternation
    with exactly one capturing group per lexicon entry.  Phrases that use
    group references, named groups, or global in-pattern flags such as
    ``(?i)`` are rejected with ``ValueError``.

    Scanning itself is done with the ``scan`` method inherited from
    ``re.Scanner``.
    """

    # The regex parser/compiler are reachable as ``re._parser``/``re._compiler``
    # on newer interpreters and as ``re.sre_parse``/``re.sre_compile`` on older
    # ones; use whichever this interpreter provides.
    _sre_parse = getattr(re, "_parser", None) or re.sre_parse
    _sre_compile = getattr(re, "_compiler", None) or re.sre_compile

    def __init__(self, lexicon, flags=0):
        """Compile ``lexicon`` into a single scanner pattern.

        Args:
            lexicon: sequence of ``(phrase, action)`` pairs, as for
                ``re.Scanner``.
            flags: ``re`` flags applied to every phrase.

        Raises:
            ValueError: if a phrase uses group references, named groups, or
                global in-pattern flags.
        """
        import sys

        parser = self._sre_parse
        # Flag enums are reduced to a plain int before reaching the compiler.
        flags = int(flags)
        self.lexicon = lexicon

        # The parser state class is called ``State`` on newer interpreters and
        # ``Pattern`` on older ones.
        state_cls = getattr(parser, "State", None) or parser.Pattern
        state = state_cls()
        state.flags = flags
        # Newer parser states derive ``groups`` from ``groupwidths`` (it is a
        # read-only property there), so groups must be registered through
        # opengroup()/closegroup() instead of assigning ``groups`` directly.
        tracks_widths = hasattr(state, "groupwidths")
        # SUBPATTERN arguments are (group, add_flags, del_flags, pattern) from
        # Python 3.6 on and (group, pattern) before that.
        no_flags = (0, 0) if sys.version_info >= (3, 6) else ()

        alternatives = []
        for phrase, action in lexicon:
            if tracks_widths:
                gid = state.opengroup()
            else:
                gid = len(alternatives) + 1
            body = self.subpat(phrase, flags)
            group = parser.SubPattern(
                state, [(SUBPATTERN, (gid,) + no_flags + (body,))])
            alternatives.append(group)
            if tracks_widths:
                state.closegroup(gid, group)
        if not tracks_widths:
            state.groups = len(alternatives) + 1

        combined = parser.SubPattern(state, [(BRANCH, (None, alternatives))])
        self.p = combined
        self.scanner = self._sre_compile.compile(combined)

    @classmethod
    def subpat(cls, phrase, flags):
        """Parse ``phrase`` and return its scrubbed sub-pattern.

        Raises:
            ValueError: if the phrase uses group references, named groups, or
                global in-pattern flags.
        """
        parser = cls._sre_parse
        flags = int(flags)
        # Where the parser has ``fix_flags`` it applies it to the parsed state
        # (e.g. adding UNICODE for str patterns).  Apply the same adjustment to
        # the expected value so that only flags set *inside* the phrase count
        # as a mismatch.
        fix_flags = getattr(parser, "fix_flags", None)
        expected = flags if fix_flags is None else fix_flags(phrase, flags)
        return cls.scrub_sub(parser.parse(phrase, flags), expected)

    @classmethod
    def scrub_sub(cls, sub, flags=0):
        """Return a copy of ``sub`` with all of its groups made non-capturing.

        Args:
            sub: a parsed ``SubPattern``.
            flags: the flag value the parser state is expected to carry; any
                other value means the phrase set flags itself.  The default
                of 0 rejects every non-zero flag value.

        Raises:
            ValueError: on group references, named groups, or unexpected
                flags.
        """
        # ``SubPattern`` exposes its parser state as ``state`` on newer
        # interpreters and as ``pattern`` on older ones.
        state = getattr(sub, "state", None)
        if state is None:
            state = sub.pattern

        scrubbed = []
        for op, arg in sub.data:
            arg = cls._scrub_arg(arg, flags)
            if op in (BRANCH, SUBPATTERN):
                # The first element is the group number; None means
                # "non-capturing".
                arg = (None,) + tuple(arg[1:])
            if op in (GROUPREF, GROUPREF_IGNORE, GROUPREF_EXISTS):
                raise ValueError("Group references not allowed in SaferScanner lexicon")
            scrubbed.append((op, arg))
        if state.groupdict:
            raise ValueError("Named captures not allowed in SaferScanner lexicon")
        if state.flags != flags:
            raise ValueError("RE flag setting not allowed in SaferScanner lexicon")
        return cls._sre_parse.SubPattern(state, scrubbed)

    @classmethod
    def _scrub_arg(cls, arg, flags):
        """Scrub every ``SubPattern`` found anywhere inside ``arg``."""
        if isinstance(arg, cls._sre_parse.SubPattern):
            return cls.scrub_sub(arg, flags)
        kind = type(arg)
        if kind is tuple or kind is list:
            # BRANCH keeps its alternatives in a list nested inside the
            # argument tuple, so nested containers must be walked too.  The
            # container type is preserved so the compiler sees the same shapes
            # the parser produced.
            return kind(cls._scrub_arg(item, flags) for item in arg)
        return arg
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The `scan` method of `re.Scanner` is still meant to be used through this class.