Created
April 26, 2012 19:35
-
-
Save matteobertozzi/2502392 to your computer and use it in GitHub Desktop.
Simple Tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from cStringIO import StringIO | |
# Characters that open and close a quoted string literal.
SYMBOLS_QUOTE = ('"', "'")

# Token type identifiers; each token produced by tokenize() is a
# (token_type, value) pair whose first element is one of these.
(
    TOKEN_STRING,
    TOKEN_NUMBER,
    TOKEN_KEYWORD,
    TOKEN_PARENTHESES_OPEN,
    TOKEN_PARENTHESES_CLOSE,
    TOKEN_COMMA,
) = range(6)

# Single-character symbols that terminate the text accumulated so far.
# A value of None marks a pure separator: it ends the current word but
# produces no token of its own.
TOKEN_SYMBOLS_TABLE = dict([
    ('(', TOKEN_PARENTHESES_OPEN),
    (')', TOKEN_PARENTHESES_CLOSE),
    (',', TOKEN_COMMA),
    (' ', None),
])
| def _sdataToToken(sdata): | |
| if not sdata: | |
| return None | |
| token = ''.join(sdata) | |
| for t in (int, float): | |
| try: | |
| return TOKEN_NUMBER, t(token) | |
| except ValueError: | |
| pass | |
| return TOKEN_KEYWORD, token | |
def _iterChars(query):
    """Yield the characters of `query` one at a time.

    `query` is either an object with a ``read`` method (read one
    character per call until an empty result) or any iterable of
    characters such as a plain string.
    """
    if not hasattr(query, 'read'):
        # Plain strings are iterated directly; no StringIO wrapper is
        # needed, which also avoids cStringIO's inability to hold
        # non-ASCII unicode text.
        for c in query:
            yield c
        return

    while True:
        c = query.read(1)
        if not c:
            return
        yield c


def tokenize(query):
    """Split `query` into a stream of ``(token_type, value)`` pairs.

    `query` may be a string or a file-like object exposing ``read``.

    Yields:
        ``(TOKEN_STRING, text)`` for each quoted string (quotes removed),
        ``(symbol_type, char)`` for each character of TOKEN_SYMBOLS_TABLE
        whose type is not None, and whatever ``_sdataToToken`` returns
        (number or keyword) for each run of other characters.

    Raises:
        ValueError: if the input ends inside a quoted string.

    A quoted string is closed only by the same quote character that
    opened it, so the other quote character may appear inside it.
    """
    # Flatten the symbol table once into a per-character lookup.  Keys may
    # be a single character or a collection of characters; setdefault keeps
    # the first mapping seen if a character appears under several keys.
    symbols = {}
    for sym, sym_token in TOKEN_SYMBOLS_TABLE.items():
        for ch in sym:
            symbols.setdefault(ch, sym_token)

    quote = None    # the quote character that opened the current string
    pending = []    # characters collected for the token being built

    for c in _iterChars(query):
        if quote is not None:
            if c == quote:
                yield TOKEN_STRING, ''.join(pending)
                pending = []
                quote = None
            else:
                pending.append(c)
            continue

        if c in SYMBOLS_QUOTE:
            # Emit any unquoted text first so it is not merged into the
            # string literal that starts here.
            stoken = _sdataToToken(pending)
            pending = []
            if stoken is not None:
                yield stoken
            quote = c
            continue

        if c not in symbols:
            pending.append(c)
            continue

        # A symbol ends the current word; emit the word, then the symbol.
        stoken = _sdataToToken(pending)
        pending = []
        if stoken is not None:
            yield stoken

        token = symbols[c]
        if token is not None:
            yield token, c

    if quote is not None:
        raise ValueError("Missing end quote")

    stoken = _sdataToToken(pending)
    if stoken is not None:
        yield stoken
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment