Skip to content

Instantly share code, notes, and snippets.

@whiler
Created October 19, 2018 03:37
Show Gist options
  • Save whiler/0fa8a811ad962894ed3650524405b3ec to your computer and use it in GitHub Desktop.
Save whiler/0fa8a811ad962894ed3650524405b3ec to your computer and use it in GitHub Desktop.
Python Tokenizer using re.Scanner
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
import collections
import re
# A lexical token: ``tpe`` is the token kind (e.g. 'IDENT'), ``value`` is the
# exact text that was matched.
Token = collections.namedtuple('Token', ['tpe', 'value'])


def _emit(tpe):
    """Return a re.Scanner action that wraps the matched text in a Token.

    A factory is used (instead of a lambda inside a loop) so that each action
    is bound to its own ``tpe`` value.
    """
    def action(scanner, value):
        return Token(tpe, value)
    return action


# (pattern, token kind) pairs. re.Scanner joins the patterns into a single
# alternation, so at every position the FIRST rule that matches wins (not the
# longest one). Multi-character tokens must therefore be listed before the
# tokens that are a prefix of them (e.g. '>=' before '>', '#!' before '#').
_RULES = (
    (r'[a-zA-Z_][a-zA-Z0-9_]*', 'IDENT'),
    (r'0x[0-9a-fA-F]+', 'HEX'),
    (r'[0-9]+\.[0-9]+', 'DECIMAL'),
    (r'[0-9]+', 'INTEGER'),
    (r'[\r\n]+', 'NEWLINE'),
    # Whitespace other than line breaks. A plain r'\s+' would also swallow a
    # '\r' / '\n' that follows spaces or tabs, hiding the NEWLINE token.
    (r'[^\S\r\n]+', 'SPACE'),
    (r'\"\"\"', 'TRIPLEQUOTE'),
    (r'#!', 'SHEBANG'),
    (r'>=', 'GE'),
    (r'<=', 'LE'),
    (r'!=', 'NE'),
    (r'<<', 'LSHIFTING'),
    (r'>>', 'RSHIFTING'),
    (r'\(', 'LBRACKET'),
    (r'\)', 'RBRACKET'),
    (r'\[', 'LSQUAREBRACKET'),
    (r'\]', 'RSQUAREBRACKET'),
    (r'\{', 'LBRACE'),
    (r'\}', 'RBRACE'),
    (r'>', 'GT'),
    (r'<', 'LT'),
    (r'=', 'EQ'),
    (r',', 'COMMA'),
    (r';', 'SEMICOLON'),
    (r'\.', 'DOT'),
    (r'\*', 'STAR'),
    (r'\?', 'PARAM'),
    (r'\+', 'ADD'),
    (r'-', 'MINUS'),
    (r'\^', 'BITXOR'),
    (r'&', 'BITAND'),
    (r'\|', 'BITOR'),
    (r'#', 'COMMENT'),
    (r':', 'COLON'),
    (r'/', 'SLASH'),
    (r'\'', 'SINGLEQUOTE'),
    (r'\"', 'DOUBLEQUOTE'),
    (r'\\', 'BACKSLASH'),
)

# Scanner.scan(text) returns (tokens, remainder). Scanning stops at the first
# character that no rule matches, and the unscanned tail is returned as
# ``remainder`` (empty string when the whole input was tokenized).
# NOTE: re.MULTILINE only affects '^' and '$', which no rule uses; it is kept
# so the Scanner is constructed with the same flags as before.
Scanner = re.Scanner(
    tuple((pattern, _emit(tpe)) for pattern, tpe in _RULES),
    re.MULTILINE,
)
if __name__ == '__main__':
    # Demo: tokenize this very file, list every token with its index, then
    # report whatever tail of the input could not be scanned.
    # The context manager closes the file handle as soon as it has been read.
    with open(__file__) as source:
        tokens, remaind = Scanner.scan(source.read())
    for i, token in enumerate(tokens):
        print(i, token)
    print('remaind: {}'.format(remaind))
    print('remaind length: {}'.format(len(remaind)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment