Last active
July 15, 2021 15:42
-
-
Save alextremblay/5ad5f379f7aa25a983f55673d974be7a to your computer and use it in GitHub Desktop.
An example string tokenizer / regex splitter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import annotations | |
from typing import Any, Callable, List, Literal, Tuple, Union, TypeVar, cast | |
import regex | |
from regex.regex import Match | |
DEFAULT = '_SCANNER_DEFAULT_PATTERN' # a sentinel value | |
T = TypeVar('T') | |
class TokenMeta(type):
    """Metaclass that turns capitalised class-attribute access into subclasses.

    On any class using this metaclass, looking up a missing attribute whose
    name starts with an upper-case letter creates a subclass of that class
    named ``'<ParentName>.<attribute>'``, stores it on the parent under the
    attribute name, and returns it. This works on the class itself, not on
    its instances. See ``Token`` for an example.
    """

    def __getattr__(cls: 'TokenMeta', name: str) -> 'TokenMeta':
        """Create, cache and return the subclass called ``name``.

        Python only calls ``__getattr__`` after normal lookup has failed, so
        a subclass that was already created (and stored with ``setattr``
        below) is found directly and never reaches this method again.

        Raises:
            AttributeError: if ``name`` does not start with an upper-case
                letter (this includes the empty string).
        """
        # Slice instead of indexing: getattr(cls, '') must end in an
        # AttributeError like any other failed lookup, not an IndexError.
        if not name[:1].isupper():
            raise AttributeError(
                f'type object {cls.__name__!r} has no attribute {name!r}'
            )

        # The parent name is part of the new name, so nested categories read
        # like dotted paths (e.g. 'Token.Category.SubCategory').
        new_subclass = cast(
            'TokenMeta',
            type(f'{cls.__name__}.{name}', (cls,), {}),
        )
        # Register the subclass on its parent so the next reference returns
        # the very same class object.
        setattr(cls, name, new_subclass)
        return new_subclass
class Token(str, metaclass=TokenMeta):
    """A ``str`` subclass used to tag strings with a category.

    Any capitalised attribute looked up on this class (or on one of its
    categories) yields a subclass named after that attribute, so categories
    never have to be declared up front.

    Example:
        >>> token_string = Token('hello')
        >>> # every token, whatever its category, is still a plain string
        >>> isinstance(token_string, str)
        True
        >>> # categories are created simply by naming them
        >>> other_string = Token.Category('some string')
        >>> a_third_string = Token.OtherCategory('some other string')
        >>> # each category is a subclass of Token (and therefore of str)
        >>> issubclass(Token.Category, Token) and issubclass(Token.OtherCategory, Token)
        True
        >>> isinstance(other_string, Token.Category)
        True
        >>> isinstance(a_third_string, Token.Category)
        False
        >>> # categories nest to any depth
        >>> yet_another_string = Token.Category.SubCategory('hello')
        >>> # comparison is ordinary string comparison, regardless of category
        >>> token_string == yet_another_string
        True
    """

    def __repr__(self):
        # Show the category name wrapped around the usual string repr,
        # e.g. Token.Category('some string').
        category = type(self).__name__
        quoted = super().__repr__()
        return f'{category}({quoted})'
class Scanner:
    """Tokenizer driven by a list of ``(pattern, action)`` rules.

    All rule patterns are combined into a single compiled regex. Calling the
    scanner on a string yields, in order of appearance, ``action(match)`` for
    every piece of text matched by a rule and ``default_action(text)`` for
    every piece of text in between.
    """

    def __init__(self, rules: List[Tuple[str, Callable]], flags=regex.VERSION1) -> None:
        """Build the combined scanner regex from ``rules``.

        Args:
            rules: ``(pattern, action)`` pairs. A pair whose pattern equals
                ``DEFAULT`` does not add a pattern; its action becomes the
                action applied to unmatched text. Without such a pair,
                unmatched text is passed to ``str``.
            flags: flags forwarded to ``regex.compile``.
        """
        self.patterns = {}
        self.actions = {}
        self.default_action = str
        for index, (pattern, action) in enumerate(rules):
            if pattern == DEFAULT:
                self.default_action = action
                continue
            # The rule's position in ``rules`` keeps every group name unique.
            name = f'scanner_pattern{index}'
            self.patterns[name] = pattern
            self.actions[name] = action
        self.scanner = regex.compile(self.assemble_pattern(), flags)

    def assemble_pattern(self):
        r"""
        Construct a regex pattern from a set of sub-patterns, assigning each
        sub-pattern to a named capture group.

        Example:
            >>> self.patterns = {
            ...     "scanner_pattern0": r"\w+",
            ...     "scanner_pattern1": r"\d{3}"
            ... }
            >>> self.assemble_pattern()
            '(?|(?<scanner_pattern0>\\w+)|(?<scanner_pattern1>\\d{3}))'
        """
        alternates = '|'.join(
            f'(?<{name}>{pattern})' for name, pattern in self.patterns.items()
        )
        return fr'(?|{alternates})'

    def get_pattern_name(self, match_object):
        """
        Identify which pattern from the rules list produced ``match_object``.

        The match object's ``groupdict()`` is searched for entries named
        ``scanner_pattern<N>``; the name of the first one whose value is not
        ``None`` is returned. Named groups that belong to the rule patterns
        themselves are skipped by the prefix check.

        Raises:
            RuntimeError: if no ``scanner_pattern<N>`` group captured anything.
        """
        for name, value in match_object.groupdict().items():
            if name.startswith('scanner_pattern') and value is not None:
                return name
        # If we get to this point, something has gone seriously wrong
        raise RuntimeError("text fragment matched a pattern from the rules list, but was not captured by that pattern")

    def __call__(self, input_str, pass_in_option: Union[Literal['string'], Literal['match_object']] = 'string'):
        """Lazily tokenize ``input_str``.

        Args:
            input_str: the text to scan.
            pass_in_option: ``'string'`` passes the matched text (``m[0]``)
                to a rule's action; any other value passes the match object
                itself.

        Yields:
            The result of the matching rule's action for each match, and the
            result of ``default_action`` for each stretch of unmatched text.
            Nothing is yielded for an empty input.

        Raises:
            ValueError: if a rule pattern matches an empty string at the very
                start of the remaining text. Such a match consumes nothing,
                so scanning could never advance past it.
        """
        if not self.patterns:
            # Without any rule pattern nothing can match: the whole input is
            # unmatched text.
            if input_str:
                yield self.default_action(input_str)
            return

        unprocessed_text = input_str
        while unprocessed_text:
            # NOTE: each search runs on the remaining slice only, so patterns
            # cannot see text that has already been consumed.
            m = self.scanner.search(unprocessed_text)
            if m is None:
                # The rest of the text doesn't contain any of the patterns.
                yield self.default_action(unprocessed_text)
                break
            start, end = m.span()
            pattern_name = self.get_pattern_name(m)
            if end == 0:
                # start == end == 0: the slice below would leave the text
                # unchanged and the same empty match would repeat forever.
                raise ValueError(
                    f'rule pattern {self.patterns[pattern_name]!r} matched an empty string; '
                    'scanner patterns must consume at least one character'
                )
            if start > 0:
                # The match sits in the middle of the text; hand the
                # unmatched text before it to the default action first.
                yield self.default_action(unprocessed_text[:start])
            action = self.actions[pattern_name]
            yield action(m[0] if pass_in_option == 'string' else m)
            unprocessed_text = unprocessed_text[end:]
if __name__ == "__main__":
    # Basic example: tag grok expressions and parenthesised groups; all other
    # text goes to the default action. Per the expected output below, nested
    # parentheses stay inside one Group token and backslash-escaped
    # parentheses are not treated as groups.
    scanner = Scanner([
        (r'%\{.*?\}', Token.Grok),
        (r'(?<!\\)\((?:[^)(]*(?R)?)*+(?<!\\)\)', Token.Group),
        (DEFAULT, Token.Default),
    ])

    testdata = r'%{WORD:action}test %\{WORD:action}test (%{ASA_TCP_UDP}|%{ASA_ICMP}) \(type \d, code \d\) (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) some text'  # noqa

    for token in scanner(testdata):
        print(f'{token!r}')

    # Output:
    # Token.Grok('%{WORD:action}')
    # Token.Default('test %\\{WORD:action}test ')
    # Token.Group('(%{ASA_TCP_UDP}|%{ASA_ICMP})')
    # Token.Default(' \\(type \\d, code \\d\\) ')
    # Token.Group('(?:(?:[A-Fa-f0-9]{4}\\.){2}[A-Fa-f0-9]{4})')
    # Token.Default(' some text')

    # A more advanced example
    import textwrap

    # A dedicated token family, so categories print as Markdown.<Category>.
    class Markdown(Token):
        pass

    # NOTE: the bare ">" blockquote line ends with a space that the
    # Blockquote pattern requires; it is spelled \x20 so that editors which
    # strip trailing whitespace cannot remove it.
    sample_text = textwrap.dedent("""
        # Heading level 1
        ## Heading level 2
        ## Heading level 2
        I just love **bold text**.
        I just love __bold text__.
        Italicized text is the *cat's meow*.
        Italicized text is the _cat's meow_.
        > Dorothy followed her through many of the beautiful rooms in her castle.
        >\x20
        > The Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.
        - First item
        - Second item
        - Third item
        - Fourth item
        At the command prompt, type `nano`.
        ```
        <html>
        <head>
        </head>
        </html>
        ```
        My favorite search engine is [Duck Duck Go](https://duckduckgo.com).
        test **not a
        heading**
    """)

    # The two-character emphasis markers are listed before the one-character
    # ones.
    mdscanner = Scanner([
        (r'[#]+ .*\n', Markdown.Heading),
        (r'[*]{2}[^*\n]+[*]{2}', Markdown.Bold),
        (r'[_]{2}[^_\n]+[_]{2}', Markdown.Bold),
        (r'[*]{1}[^*\n]+[*]{1}', Markdown.Italics),
        (r'[_]{1}[^_\n]+[_]{1}', Markdown.Italics),
        (r'([>] .*\n)+', Markdown.Blockquote),
        (r'([-] .*\n)+', Markdown.List),
        (r'[`]{1}[^`\n]+[`]{1}', Markdown.Code),
        (r'[`]{3}[^`]+[`]{3}', Markdown.Code),
        (r'\[[^\]]+\]\([^\)]+\)', Markdown.Link),
        (DEFAULT, Markdown.Text),
    ])

    for token in mdscanner(sample_text):
        print(f'{token!r}')

    # Output:
    # Markdown.Text('\n')
    # Markdown.Heading('# Heading level 1\n')
    # Markdown.Heading('## Heading level 2\n')
    # Markdown.Heading('## Heading level 2\n')
    # Markdown.Text('I just love ')
    # Markdown.Bold('**bold text**')
    # Markdown.Text('.\nI just love ')
    # Markdown.Bold('__bold text__')
    # Markdown.Text('.\nItalicized text is the ')
    # Markdown.Italics("*cat's meow*")
    # Markdown.Text('.\nItalicized text is the ')
    # Markdown.Italics("_cat's meow_")
    # Markdown.Text('.\n')
    # Markdown.Blockquote('> Dorothy followed her through many of the beautiful rooms in her castle.\n> \n> The Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.\n')
    # Markdown.List('- First item\n- Second item\n- Third item\n- Fourth item\n')
    # Markdown.Text('At the command prompt, type ')
    # Markdown.Code('`nano`')
    # Markdown.Text('.\n')
    # Markdown.Code('```\n<html>\n<head>\n</head>\n</html>\n```')
    # Markdown.Text('\nMy favorite search engine is ')
    # Markdown.Link('[Duck Duck Go](https://duckduckgo.com)')
    # Markdown.Text('.\ntest **not a\nheading**\n')

    # An even more advanced example
    class NewMarkdownBase:
        """Common behaviour of every NewMarkdown token built from a match.

        A token stores one ``value``. When constructed from a regex match
        object, the value is derived from the match's ``capturesdict()`` by
        ``extract``; subclasses override ``extract`` (or ``__init__``) to
        change how it is derived. Any other constructor argument is stored
        as the value unchanged.
        """
        matches: List[str]
        value: Any

        def __init__(self, obj) -> None:
            # Anything that is not a match object means the class is being
            # instantiated manually with a ready-made value.
            self.value = self.extract(obj.capturesdict()) if isinstance(obj, Match) else obj

        def extract(self, d) -> Any:
            # Default: the first capture of the group named "text".
            return d['text'][0]

        def __repr__(self):
            return f'NewMarkdown.{type(self).__name__}({self.value!r})'

    class NewMarkdown:
        # Namespace of token types. A token class carries either a single
        # ``pattern`` or a list of alternative ``patterns``; each pattern
        # captures its payload in a group named "text".

        class Text(str):
            # Unmatched text stays a plain string with a matching repr.
            def __repr__(self):
                return f'NewMarkdown.Text({str(self)!r})'

        class Heading(NewMarkdownBase):
            pattern = r'[#]+ (?<text>.*)\n'

        class Italics(NewMarkdownBase):
            patterns = [
                r'[*]{1}(?<text>[^*\n]+)[*]{1}',
                r'[_]{1}(?<text>[^_\n]+)[_]{1}',
            ]

        class Bold(NewMarkdownBase):
            patterns = [
                r'[*]{2}(?<text>[^*\n]+)[*]{2}',
                r'[_]{2}(?<text>[^_\n]+)[_]{2}',
            ]

        class Blockquote(NewMarkdownBase):
            pattern = r'([>] (?<text>.*)\n)+'

            def extract(self, d) -> Any:
                # One capture per quoted line; glue them back together.
                quoted_lines = d['text']
                return '\n'.join(quoted_lines)

        class List(NewMarkdownBase):
            pattern = r'([-] (?<text>.*)\n)+'

            def extract(self, d) -> Any:
                # One capture per list item; keep them as a list.
                return d['text']

        class Code(NewMarkdownBase):
            patterns = [
                r'[`]{1}(?<text>[^`\n]+)[`]{1}',
                r'[`]{3}(?<text>[^`]+)[`]{3}',
            ]

        class Link(NewMarkdownBase):
            pattern = r'\[(?<text>[^\]]+)\]\((?<url>[^\)]+)\)'

            def extract(self, d) -> Any:
                # A link has two payloads: its label and its target.
                return {'text': d['text'][0], 'url': d['url'][0]}

    # Same rule order as the previous example; the token classes are used
    # directly as the actions.
    mdscanner2 = Scanner([
        (NewMarkdown.Heading.pattern, NewMarkdown.Heading),
        (NewMarkdown.Bold.patterns[0], NewMarkdown.Bold),
        (NewMarkdown.Bold.patterns[1], NewMarkdown.Bold),
        (NewMarkdown.Italics.patterns[0], NewMarkdown.Italics),
        (NewMarkdown.Italics.patterns[1], NewMarkdown.Italics),
        (NewMarkdown.Blockquote.pattern, NewMarkdown.Blockquote),
        (NewMarkdown.List.pattern, NewMarkdown.List),
        (NewMarkdown.Code.patterns[0], NewMarkdown.Code),
        (NewMarkdown.Code.patterns[1], NewMarkdown.Code),
        (NewMarkdown.Link.pattern, NewMarkdown.Link),
        (DEFAULT, NewMarkdown.Text),
    ])

    # The actions need the named captures, so they receive the match object
    # rather than the matched text.
    for token in mdscanner2(sample_text, pass_in_option='match_object'):
        print(f'{token!r}')

    # Output:
    # NewMarkdown.Text('\n')
    # NewMarkdown.Heading('Heading level 1')
    # NewMarkdown.Heading('Heading level 2')
    # NewMarkdown.Heading('Heading level 2')
    # NewMarkdown.Text('I just love ')
    # NewMarkdown.Bold('bold text')
    # NewMarkdown.Text('.\nI just love ')
    # NewMarkdown.Bold('bold text')
    # NewMarkdown.Text('.\nItalicized text is the ')
    # NewMarkdown.Italics("cat's meow")
    # NewMarkdown.Text('.\nItalicized text is the ')
    # NewMarkdown.Italics("cat's meow")
    # NewMarkdown.Text('.\n')
    # NewMarkdown.Blockquote('Dorothy followed her through many of the beautiful rooms in her castle.\n\nThe Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.')
    # NewMarkdown.List(['First item', 'Second item', 'Third item', 'Fourth item'])
    # NewMarkdown.Text('At the command prompt, type ')
    # NewMarkdown.Code('nano')
    # NewMarkdown.Text('.\n')
    # NewMarkdown.Code('\n<html>\n<head>\n</head>\n</html>\n')
    # NewMarkdown.Text('\nMy favorite search engine is ')
    # NewMarkdown.Link({'text': 'Duck Duck Go', 'url': 'https://duckduckgo.com'})
    # NewMarkdown.Text('.\ntest **not a\nheading**\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment