Last active
September 23, 2019 15:54
-
-
Save jeacom25b/3096829522f2b99a2806f1d8571ba713 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Created by Jeacom
This is an experimental utility module for constructing and concatenating regular expressions
so that simpler regexes can be reused to build more complex ones, all hidden behind
a readable Python interface, so we don't have to read things like this:
(?:(?:(?:struct|enum)[ \t]+)?\b[a-zA-Z][a-zA-Z\d]*[ \t]+(\b[a-zA-Z][a-zA-Z\d]*))
Instead, we read things like this:
struc_enum = OrGroup("struct", "enum")
typed_name = (struc_enum + whitespace).optional() + valid_name + whitespace + valid_name.captured()
Warning: this is just a test, don't take it seriously.
'''
def _quantifier_choice(min, max):
    """Return the regex quantifier suffix for a repetition range.

    A bound of ``0`` or ``None`` means "not set": an unset ``min`` is a
    minimum of zero and an unset ``max`` is "no upper limit".  Because ``0``
    doubles as the "unbounded" marker, ``(0, 0)`` yields ``*`` rather than
    ``{0}``.

    Results:
        (0, 0) -> "*"        (0, 1) -> "?"        (1, 0) -> "+"
        (n, n) -> "{n}"      (n, 0) -> "{n,}"     (0, m) -> "{,m}"
        (n, m) -> "{n,m}"

    Args:
        min: minimum number of repetitions (``0``/``None`` for none).
        max: maximum number of repetitions (``0``/``None`` for unbounded).

    Returns:
        The quantifier as a string, ready to append to a pattern.

    Raises:
        ValueError: if a bound is negative, or if ``max`` is set and smaller
            than ``min`` (both would render a quantifier that is not a valid
            repetition).
    """
    # The parameter names shadow the builtins but are kept for callers;
    # work on normalised local copies instead.
    lower = min or 0
    upper = max or 0

    if lower < 0 or upper < 0:
        raise ValueError(
            f"repetition bounds must not be negative (min={min!r}, max={max!r})"
        )
    if upper and lower > upper:
        raise ValueError(
            f"min repetitions ({min!r}) exceed max repetitions ({max!r})"
        )

    # The three ranges that have a dedicated one-character quantifier.
    shorthand = {(0, 0): "*", (0, 1): "?", (1, 0): "+"}.get((lower, upper))
    if shorthand is not None:
        return shorthand

    if lower == upper:
        return f"{{{upper}}}"
    if upper == 0:
        return f"{{{lower},}}"
    if lower == 0:
        return f"{{,{upper}}}"
    return f"{{{lower},{upper}}}"
def _escape_if_needed(string):
    """Return *string* with each regex-special character backslash-escaped.

    The characters treated as special are ``\\ * - + / | ( ) [ ] { } . $ ^ ! ?``;
    every occurrence is prefixed with a single backslash and all other
    characters are passed through unchanged.
    """
    specials = r"\*-+/|()[]{}.$^!?"
    # One translation pass: each special code point maps to "\" + itself.
    escape_table = {ord(ch): "\\" + ch for ch in specials}
    return string.translate(escape_table)
class NakedExpr:
    """A raw regex fragment that is rendered verbatim.

    Unlike the bracketed/grouped expression types, nothing is added around
    ``expr`` and nothing in it is escaped; ``str()`` and ``repr()`` both
    return it unchanged.
    """

    # Class-level default; every instance overrides it in __init__.
    expr = ""

    def __init__(self, expr):
        """Store *expr*, the pattern text to emit as-is."""
        self.expr = expr

    def __str__(self):
        return self.expr

    def __repr__(self):
        return str(self)

    def __add__(self, other):
        """Return a non-escaping Group holding this fragment followed by *other*."""
        return Group(str(self) + str(other), escape=False)

    def __radd__(self, other):
        """Return a non-escaping Group holding *other* followed by this fragment.

        Makes ``"raw text" + fragment`` work, mirroring ``__add__``; without
        it a plain string on the left-hand side raises ``TypeError``.
        """
        return Group(str(other) + str(self), escape=False)
class CharList:
    """A regex character class such as ``[a-z]``, with an optional quantifier.

    The rendered pattern is ``open + contents + close + quantifier``.
    Instances are treated as immutable: every operation returns a new object.
    """

    # Class-level defaults; instances override them by plain assignment.
    open = "["
    contents = ""
    close = "]"
    quantifier = ""
    negative_open = "[^"
    positive_open = "["

    def __init__(self, *args, escape=False):
        """Build the class body from *args*.

        Args:
            *args: pieces of the class body.  A ``CharList`` contributes its
                ``contents``; a string is escaped first when *escape* is
                true; anything else is converted with ``str()``.
            escape: backslash-escape regex-special characters in string
                arguments.
        """
        pieces = []
        for content in args:
            if isinstance(content, CharList):
                pieces.append(content.contents)
            elif escape and isinstance(content, str):
                pieces.append(_escape_if_needed(content))
            else:
                pieces.append(str(content))
        self.contents = self.contents + "".join(pieces)

    def __str__(self):
        return f"{self.open}{self.contents}{self.close}{self.quantifier}"

    def __repr__(self):
        return str(self)

    def _clone(self):
        """Return a same-type copy keeping ``contents`` and ``open`` (no quantifier)."""
        new = type(self)(self.contents, escape=False)
        # ``open`` may have been changed on the instance by negation; carry it
        # over so the copy does not silently revert to the class default.
        new.open = self.open
        return new

    def _is_plain_class(self):
        """True for a positive, unquantified bracket class (safe to merge)."""
        return (
            self.open == CharList.positive_open
            and self.close == CharList.close
            and not self.quantifier
        )

    def __neg__(self):
        """Return the negated form of this expression.

        An expression already using ``negative_open`` becomes positive again;
        anything else gets ``negative_open``.  The quantifier is not carried
        over, so apply it after negating.
        """
        new = type(self)(self.contents, escape=False)
        if self.open == self.negative_open:
            new.open = self.positive_open
        else:
            new.open = self.negative_open
        return new

    def __sub__(self, other):
        """Return this expression followed by the negation of *other*."""
        return self + -other

    def __add__(self, other):
        """Combine with *other*.

        Two positive, unquantified bracket classes are merged into a single
        class (``[a-z] + [A-Z] -> [a-zA-Z]``).  Every other combination is a
        sequence: both sides are rendered and wrapped in a Group, because
        merging would discard a negation, a quantifier, or group syntax.
        """
        if (
            self._is_plain_class()
            and isinstance(other, CharList)
            and other._is_plain_class()
        ):
            return CharList(self.contents + other.contents)
        return Group(str(self) + str(other), escape=False)

    def __mul__(self, other):
        """Return a copy repeated exactly *other* times (``x * 3 -> x{3}``).

        The quantifier is built directly so that ``x * 0`` renders ``{0}``
        instead of being read as the "unbounded" range and rendered ``*``.
        """
        new = self._clone()
        new.quantifier = "{" + str(other) + "}"
        return new

    def repeated(self, min=0, max=0, reluctant=False):
        """Return a copy carrying a repetition quantifier.

        A bound of 0 means "not set" (no minimum / no upper limit).

        repeated(0, 0) -> [...]*
        repeated(0, 1) -> [...]?
        repeated(1, 0) -> [...]+
        repeated(0, 0, True) -> [...]*?
        repeated(0, 1, True) -> [...]??
        repeated(1, 0, True) -> [...]+?
        repeated(10, 10) -> [...]{10}
        repeated(10, 15) -> [...]{10,15}
        repeated(10, 15, True) -> [...]{10,15}?
        """
        new = self._clone()
        new.quantifier = _quantifier_choice(min, max)
        if reluctant:
            # "?" after any quantifier, ranged ones included, makes it lazy.
            new.quantifier += "?"
        return new

    def optional(self, reluctant=False):
        """
        optional() -> [...]?
        optional(True) -> [...]??
        """
        return self.repeated(max=1, reluctant=reluctant)

    def once_or_more(self, reluctant=False):
        """
        once_or_more() -> [...]+
        """
        return self.repeated(min=1, max=0, reluctant=reluctant)

    def at_least(self, v, reluctant=False):
        """
        at_least(v) -> [...]{v,}
        at_least(1) -> [...]+
        """
        return self.repeated(min=v, reluctant=reluctant)

    def at_most(self, v, reluctant=False):
        """
        at_most(v) -> [...]{,v}
        at_most(1) -> [...]?

        Note that at_most(0) renders "*", since 0 means "no upper limit".
        """
        return self.repeated(max=v, reluctant=reluctant)
class Group(CharList):
    """A non-capturing group ``(?:...)`` holding a sequence of sub-patterns.

    Shares the rendering and quantifier methods of ``CharList``; negation
    switches between the look-ahead openers ``(?=`` and ``(?!``.
    """

    open = "(?:"
    contents = ""
    close = ")"
    quantifier = ""
    positive_open = "(?="
    negative_open = "(?!"

    def __init__(self, *args, escape=True):
        """Concatenate *args* into the group body.

        Args:
            *args: sub-patterns in order.  A plain, unquantified ``Group`` is
                flattened (its contents are used without the wrapper) when
                that cannot change the meaning; a string is escaped when
                *escape* is true; anything else is rendered with ``str()``.
            escape: backslash-escape regex-special characters in string
                arguments.
        """
        pieces = []
        for arg in args:
            if self._can_inline(arg):
                pieces.append(arg.contents)
            elif escape and isinstance(arg, str):
                pieces.append(_escape_if_needed(arg))
            else:
                pieces.append(str(arg))
        self.contents = self.contents + "".join(pieces)

    @staticmethod
    def _has_top_level_alternation(pattern):
        """True if *pattern* has a ``|`` outside every group and bracket class."""
        depth = 0
        class_start = None  # index of the "[" of the class being scanned
        escaped = False
        for index, ch in enumerate(pattern):
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif class_start is not None:
                # "]" ends the class unless it is its first member
                # ("[]...]" or "[^]...]"), where it is a literal.
                first_member = index == class_start + 1 or (
                    index == class_start + 2 and pattern[class_start + 1] == "^"
                )
                if ch == "]" and not first_member:
                    class_start = None
            elif ch == "[":
                class_start = index
            elif ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
            elif ch == "|" and depth == 0:
                return True
        return False

    @staticmethod
    def _can_inline(arg):
        """True if *arg* is a plain Group whose wrapper can be dropped safely.

        Only exact ``Group`` instances (not subclasses) with the default
        opener and no quantifier qualify.  A body with a top-level ``|`` must
        keep its wrapper: without it the alternation would swallow the
        neighbouring pieces of the sequence.
        """
        return (
            type(arg) is Group
            and arg.open == "(?:"
            and arg.quantifier == ""
            and not Group._has_top_level_alternation(arg.contents)
        )

    def __or__(self, other):
        """Return a Group matching either this expression or *other*."""
        return Group(str(self) + "|" + str(other), escape=False)

    def __ror__(self, other):
        """Return a Group matching either *other* or this expression."""
        return Group(str(other) + "|" + str(self), escape=False)

    def __str__(self):
        return f"{self.open}{self.contents}{self.close}{self.quantifier}"

    def __add__(self, other):
        """Return a Group matching this expression followed by *other* (unescaped)."""
        return Group(self, other, escape=False)

    def __radd__(self, other):
        """Return a Group matching *other* (unescaped) followed by this expression."""
        return Group(other, self, escape=False)

    def captured(self):
        """Return a capturing group around this expression.

        A plain or already-capturing group without a quantifier simply has
        its wrapper replaced by ``(...)``.  Anything else (a quantified group
        or a lookaround) is wrapped whole, so the quantifier or lookaround
        syntax is kept inside the capture instead of being discarded.
        """
        if self.quantifier == "" and self.open in ("(?:", "("):
            return Capture(self.contents, escape=False)
        return Capture(str(self), escape=False)
class OrGroup(Group):
    """A non-capturing group whose arguments are alternatives joined by ``|``."""

    def __init__(self, *args, escape=True):
        """Join *args* with ``|``.

        String arguments are escaped when *escape* is true; every other
        argument is rendered with ``str()``.
        """
        alternatives = []
        for arg in args:
            if escape and type(arg) is str:
                alternatives.append(_escape_if_needed(arg))
            else:
                alternatives.append(str(arg))
        self.contents = "|".join(alternatives)
class Capture(Group):
    """A capturing group: renders its contents between ``(`` and ``)``."""

    open, close = "(", ")"
class LookAhead(Group):
    """A look-ahead assertion: ``(?=...)``, or ``(?!...)`` once negated."""

    # Delimiters for the positive and the negative form.
    positive_open = "(?="
    positive_close = ")"
    negative_open = "(?!"
    negative_close = ")"

    # A new instance starts out as the positive form.
    open = "(?="
    close = ")"
class LookBehind(Group):
    """A look-behind assertion: ``(?<=...)``, or ``(?<!...)`` once negated."""

    open = "(?<="
    close = ")"
    positive_open = "(?<="
    positive_close = ")"
    # Negative look-behind is spelled "(?<!"; the previous "(<!" lacked the
    # "?" and therefore opened an ordinary capturing group instead.
    negative_open = "(?<!"
    negative_close = ")"
# Ready-made character classes.
A_Z = CharList("A-Z")
a_z = CharList("a-z")
NUMBER = CharList(r"\d")
NON_NUMBER = CharList(r"\D")
WORD_CHAR = CharList(r"\w")
# Non-word characters are "\W" (upper case); "\w" here duplicated WORD_CHAR.
NON_WORD_CHAR = CharList(r"\W")
NON_WORD_CHAS = NON_WORD_CHAR  # original (misspelled) name, kept for existing users
WHITESPACE = CharList(r"\s")
NON_WHITESPACE = CharList(r"\S")
# "." inside brackets is a literal period, so "any character" has to be a
# bare "." in a group; this renders "(?:.)" and still accepts quantifiers.
ANY_CHAR = Group(".", escape=False)
PERIOD = CharList(r"\.")
TAB_SPACE = CharList(r" \t")
ALPHA = a_z + A_Z
ALPHANUMERIC = ALPHA + NUMBER

# Raw fragments and anchors, emitted verbatim.
SPACE = NakedExpr(" ")
TAB = NakedExpr(r"\t")
LINE_START = NakedExpr("^")
LINE_END = NakedExpr("$")
WORD_BOUNDARY = NakedExpr(r"\b")
NOT_WORD_BOUNDARY = NakedExpr(r"\B")
if __name__ == "__main__":
    # Demo: build the "typed name" pattern from the module docstring and
    # print the regex it renders to.
    keywords = OrGroup("struct", "enum", "void", "double", "float", "Char", ",,,")
    struc_enum = OrGroup("struct", "enum")

    # One or more spaces/tabs between tokens.
    whitespace = TAB_SPACE.at_least(1)
    # An identifier: a word boundary, one letter, then any letters/digits.
    valid_name = r"\b" + Group(ALPHA, ALPHANUMERIC.repeated())

    # Optional "struct "/"enum " prefix, a type name, then the captured name.
    optional_prefix = (struc_enum + whitespace).optional()
    typed_name = optional_prefix + valid_name + whitespace + valid_name.captured()
    print(repr(typed_name))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment