Last active
March 20, 2021 20:56
-
-
Save codebrainz/ffbd2fde8d44b93c22f0 to your computer and use it in GitHub Desktop.
Simple tokenizing in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
from collections import namedtuple | |
class Tokenizer:
    """Regex-based tokenizer driven by an ordered list of (name, pattern) pairs.

    Each pair becomes a named group ``(?P<name>pattern)`` in one combined
    regular expression; earlier entries win ties, so list more specific
    tokens before catch-alls.
    """

    # Lightweight token record: group name, matched text, (start, end) span.
    Token = namedtuple('Token', 'name text span')

    def __init__(self, tokens):
        """Compile the combined pattern from *tokens*: a sequence of
        (token_name, regex_pattern) pairs. Names must be valid Python
        identifiers (they become regex group names)."""
        self.tokens = tokens
        # Join every rule into a single alternation of named groups;
        # re tries alternatives left-to-right, preserving rule priority.
        self.re = re.compile(
            '|'.join('(?P<%s>%s)' % (tok, pat) for tok, pat in self.tokens))

    def iter_tokens(self, input, ignore_ws=True):
        """Yield Token namedtuples for *input*, in order of occurrence.

        When *ignore_ws* is true, matches of the group named 'WHITESPACE'
        are skipped. Characters matching no rule are silently dropped
        (add a catch-all rule like ('ERROR', r'.') to surface them).
        """
        for match in self.re.finditer(input):
            # match.lastgroup is the name of the rule that matched.
            if ignore_ws and match.lastgroup == 'WHITESPACE':
                continue
            yield Tokenizer.Token(match.lastgroup, match.group(0), match.span(0))

    def tokenize(self, input, ignore_ws=True):
        """Return the full token list for *input* (eager form of iter_tokens)."""
        return list(self.iter_tokens(input, ignore_ws))
# test program
if __name__ == "__main__":
    # Rule order matters: specific literals before NUMBER/SYMBOL catch-alls,
    # with ERROR last to surface any single unmatched character.
    TOKENS = [
        # BUGFIX: was r"nil|\'()" — in a regex, \' is just ' and () is an
        # *empty capturing group*, so NIL matched a lone apostrophe instead
        # of the literal text '().  The parens must be escaped.
        ('NIL'        , r"nil|'\(\)"),
        ('TRUE'       , r'true|#t'),
        ('FALSE'      , r'false|#f'),
        ('NUMBER'     , r'\d+'),
        ('STRING'     , r'"(\\.|[^"])*"'),
        ('SYMBOL'     , r'[\x21-\x26\x2a-\x7e]+'),
        ('QUOTE'      , r"'"),
        ('LPAREN'     , r'\('),
        ('RPAREN'     , r'\)'),
        ('DOT'        , r'\.'),
        ('WHITESPACE' , r'\s+'),
        ('ERROR'      , r'.'),  # 1-char fallback for anything unmatched
    ]
    for t in Tokenizer(TOKENS).iter_tokens('(+ nil 1 2)'):
        print(t)
Thanks, and good catch. I actually totally forgot about this Gist :)
Updated with suggestions.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Excellent gist. However, you have an error: the token
\w
matches word characters in Python, while the token \s
matches whitespace. Also, as a pointer, one could get crude error detection by adding
('ERROR', r'.')
at the end of the token list. This will produce 1-length error tokens for any char that doesn't match any other token - similar to how Antlr does it. At least it worked for me.