Last active
May 21, 2021 10:09
-
-
Save phith0n/e31ba266ec6fff45bc8b316b1101b723 to your computer and use it in GitHub Desktop.
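A simple regular-expression generator: it scans a sample string and emits a generalized pattern built from \d, \w and \s (with + for runs) plus escaped literals.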
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from enum import Enum | |
class State(Enum):
    """Scanner states used by ``ReGenerate``.

    Each character class has a pair of states: the plain state means exactly
    one character of that class is pending, the ``*Rest`` state means a run
    of two or more is pending (the fragment already ends with ``+``).
    """

    Initial = 0  # nothing consumed yet
    D = 10  # one digit pending
    DRest = 11  # run of digits pending
    S = 20  # one whitespace character pending
    SRest = 21  # run of whitespace pending
    W = 30  # one word character pending
    WRest = 31  # run of word characters pending
    Other = 100  # any other character; emitted escaped, never merged


class ReGenerate(object):
    r"""Derive a generalized regular expression from a sample string.

    The sample is scanned character by character. Digits become ``\d``,
    word characters ``\w``, whitespace ``\s``; a run of two or more
    characters of the same class gets a ``+`` quantifier. Every other
    character is emitted individually through ``re.escape``.

    Because ``\w`` also matches digits, a digit that follows a word
    character extends the ``\w`` run (``'ab12'`` gives ``\w+``), whereas a
    sample that starts with digits yields ``\d`` first (``'12ab'`` gives
    ``\d+\w+``).

    Attributes:
        tokens: completed pattern fragments, in order.
        current_state: the scanner's current ``State``.
        fragment: the fragment being built for the current run.
    """

    # Compiled once; equivalent to the single-character class tests.
    _DIGIT = re.compile(r'[\d]')
    _SPACE = re.compile(r'[\s]')
    _WORD = re.compile(r'[\w]')

    def __init__(self):
        self.tokens = []
        self.current_state: State = State.Initial
        self.fragment = ''

    def flush(self, ch: str):
        """Store the pending fragment and start a new one for ``ch``.

        Args:
            ch: the character that opens the next fragment. An empty string
                only stores the pending fragment: the new fragment is then
                empty and the state becomes ``State.Other``.
        """
        if self.fragment:
            self.tokens.append(self.fragment)
            self.fragment = ''

        # The digit test must come first: \w would match digits as well.
        if self.is_d(ch):
            self.fragment = r'\d'
            self.current_state = State.D
        elif self.is_w(ch):
            self.fragment = r'\w'
            self.current_state = State.W
        elif self.is_space(ch):
            self.fragment = r'\s'
            self.current_state = State.S
        else:
            self.fragment = re.escape(ch)
            self.current_state = State.Other

    def generate(self, data: str):
        """Return a regular expression string generalizing ``data``.

        The scanner is reset on entry, so one instance can be reused for
        several samples without earlier output leaking into later results.

        Args:
            data: the sample text.

        Returns:
            The generated pattern as a string; ``''`` for empty input.
        """
        # Start from a clean slate on every call.
        self.tokens = []
        self.fragment = ''
        self.current_state = State.Initial

        # single-character state -> (class test, state for a longer run)
        first_seen = {
            State.D: (self.is_d, State.DRest),
            State.W: (self.is_w, State.WRest),
            State.S: (self.is_space, State.SRest),
        }
        # run state -> class test that keeps the run going
        in_run = {
            State.DRest: self.is_d,
            State.WRest: self.is_w,
            State.SRest: self.is_space,
        }

        for ch in data:
            state = self.current_state
            if state in first_seen:
                same_class, run_state = first_seen[state]
                if same_class(ch):
                    # Second character of the class: quantify once.
                    self.current_state = run_state
                    self.fragment += r'+'
                else:
                    self.flush(ch)
            elif state in in_run:
                # Further members of the run are already covered by '+'.
                if not in_run[state](ch):
                    self.flush(ch)
            else:
                # Initial and Other never merge with the next character.
                self.flush(ch)

        # Push the last pending fragment into the token list.
        self.flush('')
        return ''.join(self.tokens)

    def is_d(self, ch: str):
        r"""Return a match object if ``ch`` starts with a ``\d`` character, else None."""
        return self._DIGIT.match(ch)

    def is_space(self, ch: str):
        r"""Return a match object if ``ch`` starts with a ``\s`` character, else None."""
        return self._SPACE.match(ch)

    def is_w(self, ch: str):
        r"""Return a match object if ``ch`` starts with a ``\w`` character, else None."""
        return self._WORD.match(ch)
if __name__ == '__main__':
    # Demo run; prints: \w+\s\w+\s\w+\s\w+,\s\w'\w\s\d+\s\w+\s\w+
    sample = 'My name is Bob, I\'m 25 years old'
    print(ReGenerate().generate(sample))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment