Created
October 24, 2011 19:18
-
-
Save thepaul/1309885 to your computer and use it in GitHub Desktop.
safer version of python's re.Scanner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SaferScanner is just like re.Scanner, but it neuters any grouping in the lexicon
# regular expressions and throws an error on group references, named groups, or
# regex in-pattern flags. Any of those can break correct operation of Scanner.
import re | |
from sre_constants import BRANCH, SUBPATTERN, GROUPREF, GROUPREF_IGNORE, GROUPREF_EXISTS | |
class SaferScanner(re.Scanner):
    """Drop-in replacement for re.Scanner that sanitizes lexicon patterns.

    Every group written inside a lexicon regular expression is made
    non-capturing, so the only numbered groups in the combined pattern are
    the ones this class creates (one per lexicon entry). Patterns that use
    group references, named groups, or in-pattern flags are rejected with
    ValueError.

    Only construction is overridden; scan() is inherited from re.Scanner
    and works from the ``lexicon`` and ``scanner`` attributes set here.

    NOTE: relies on the Python 2 layout of the private sre_parse /
    sre_compile modules (``Pattern`` state class, two-element SUBPATTERN
    arguments).
    """

    def __init__(self, lexicon, flags=0):
        """Build the combined scanner pattern.

        lexicon -- iterable of (phrase, action) pairs; phrase is a regular
                   expression string.
        flags   -- re module flags applied to every phrase.

        Raises ValueError if a phrase contains a group reference, a named
        group, or sets flags other than those passed in ``flags``.
        """
        self.lexicon = lexicon
        state = re.sre_parse.Pattern()
        state.flags = flags
        branches = []
        for phrase, action in lexicon:
            # One capturing group per lexicon entry, numbered from 1 in
            # lexicon order; the phrase itself contributes no groups once
            # it has been scrubbed.
            branches.append(re.sre_parse.SubPattern(state, [
                (SUBPATTERN, (len(branches) + 1, self.subpat(phrase, flags))),
            ]))
        state.groups = len(branches) + 1
        combined = re.sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
        self.p = combined
        self.scanner = re.sre_compile.compile(combined)

    @classmethod
    def subpat(cls, phrase, flags):
        """Parse ``phrase`` with ``flags`` and return its scrubbed SubPattern."""
        return cls.scrub_sub(re.sre_parse.parse(phrase, flags), flags)

    @classmethod
    def scrub_sub(cls, sub, flags=0):
        """Return a copy of the parsed pattern ``sub`` with grouping removed.

        sub   -- an sre_parse.SubPattern.
        flags -- the flags the caller asked the pattern to be parsed with.
                 Only flags beyond these count as in-pattern flag setting.
                 The default of 0 rejects any flag at all.

        Raises ValueError for group references, named groups, or in-pattern
        flags.
        """
        scrubbed = []
        for op, arg in sub.data:
            if isinstance(arg, (tuple, list)):
                arg = [cls._scrub_arg(item, flags) for item in arg]
            if op in (BRANCH, SUBPATTERN):
                # The first element is the group number; None makes the
                # group non-capturing.
                arg = [None] + arg[1:]
            if op in (GROUPREF, GROUPREF_IGNORE, GROUPREF_EXISTS):
                raise ValueError("Group references not allowed in SaferScanner lexicon")
            scrubbed.append((op, arg))
        if sub.pattern.groupdict:
            raise ValueError("Named captures not allowed in SaferScanner lexicon")
        # parse() records the caller-supplied flags on the pattern state, so
        # only bits outside ``flags`` were set from inside the pattern text.
        if sub.pattern.flags & ~flags:
            raise ValueError("RE flag setting not allowed in SaferScanner lexicon")
        return re.sre_parse.SubPattern(sub.pattern, scrubbed)

    @classmethod
    def _scrub_arg(cls, item, flags):
        """Scrub one element of an opcode argument.

        SubPatterns are scrubbed recursively. Lists are walked so that the
        alternatives of a BRANCH, which are stored as a list of SubPatterns,
        are scrubbed too. Anything else is returned unchanged.
        """
        if isinstance(item, re.sre_parse.SubPattern):
            return cls.scrub_sub(item, flags)
        if isinstance(item, list):
            return [cls._scrub_arg(member, flags) for member in item]
        return item
If there are other methods in the class they will function as usual.
The scan method of re.Scanner is still meant to be used through this class.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Why does this inherit re.Scanner? It doesn't appear to use anything from it - or did I miss something?