Quick tokenizer to showcase some py3.6 features
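The 3.6-isms on display: f-string literals (PEP 498), used as rf"..." patterns composed from smaller building blocks, and variable annotations (PEP 526) driving the class-based typing.NamedTuple. A small metaclass records the token patterns in definition order, so earlier patterns take matching priority.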
# flake8: noqa
import regex  # third-party "regex" package; its Scanner mirrors the undocumented re.Scanner
import collections
from typing import NamedTuple


class OrderedClassMembers(type):
    """Metaclass that records class attributes, in definition order, as __ordered__."""

    @classmethod
    def __prepare__(self, name, bases):
        return collections.OrderedDict()

    def __new__(self, name, bases, classdict):
        classdict['__ordered__'] = [key for key in classdict.keys()
                                    if key not in ('__module__', '__qualname__')]
        return type.__new__(self, name, bases, classdict)


class Token(NamedTuple):
    kind: str
    value: str
    start: int
    end: int


class Tokenizer(metaclass=OrderedClassMembers):
    """
    >>> tokenizer.Tokenizer().scan('10. .10 10.10 10e10 10e-10 0xFF asd')
    [Token(kind='T_FLOAT', value='10.', start=0, end=3),
     Token(kind='T_FLOAT', value='.10', start=4, end=7),
     Token(kind='T_FLOAT', value='10.10', start=8, end=13),
     Token(kind='T_SCI_FLOAT', value='10e10', start=14, end=19),
     Token(kind='T_SCI_FLOAT', value='10e-10', start=20, end=26),
     Token(kind='T_HEX_INTEGER', value='0xFF', start=27, end=31),
     Token(kind='T_IDENTIFIER', value='asd', start=32, end=35)]
    """
    # Building blocks, reused below via f-strings.
    REAL = r"[0-9]+\.[0-9]+|[0-9]+\.|\.[0-9]+"
    NATURAL = r"[0-9]+"
    # Attributes named T_* become token kinds; definition order sets matching priority.
    T_IDENT = r"\n\s{4,}"
    T_SCI_FLOAT = rf"[+-]?({REAL}|{NATURAL})[eE][+-]?[0-9]+"
    T_FLOAT = rf"[+-]?({REAL})"  # group REAL so the optional sign applies to every alternative
    T_HEX_INTEGER = r"0x[0-9A-Fa-f]+"
    T_OCT_INTEGER = r"0o[0-7]+"
    T_INTEGER = r"[+-]?[0-9]+"
    T_IDENTIFIER = r"[^0-9\s;.][^\s;.]*"
    T_PUNCTUATION = r"[;.]"
    IGNORE = r"\s+"

    def __init__(self):
        # Build the scanner from the T_* patterns in definition order;
        # whitespace (IGNORE) maps to no action and is dropped.
        to_scan = [(getattr(self, token_type),
                    self.get_action(token_type))
                   for token_type in self.__ordered__
                   if token_type.startswith('T_')]
        to_scan.append((self.IGNORE, None))
        self.scanner = regex.Scanner(to_scan)

    def scan(self, source):
        tokens, remainder = self.scanner.scan(source)
        if remainder:
            raise SyntaxError(
                f"Unknown token: {remainder} at: {tokens[-1].end}")
        return tokens

    def get_action(self, token_type):
        # Each pattern gets a callback that wraps the matched text in a Token
        # carrying the match's start/end offsets.
        def action(scanner, token):
            return Token(token_type, token,
                         scanner.match.start(),
                         scanner.match.end())
        return action
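A minimal usage sketch (assuming the gist is saved as tokenizer.py and the third-party regex package is installed; the sample input is just an illustration):

from tokenizer import Tokenizer

tok = Tokenizer()
for token in tok.scan('0xFF 10e-10 foo; 0o17'):
    print(token.kind, repr(token.value), token.start, token.end)

Whitespace is skipped because the IGNORE pattern has no action; if scanning stops before the end of the input, scan() raises a SyntaxError reporting the unconsumed remainder.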