Skip to content

Instantly share code, notes, and snippets.

@matteobertozzi
Created April 26, 2012 19:35
Show Gist options
  • Select an option

  • Save matteobertozzi/2502392 to your computer and use it in GitHub Desktop.

Select an option

Save matteobertozzi/2502392 to your computer and use it in GitHub Desktop.
Simple Tokenizer
from cStringIO import StringIO
# Characters that open and close a quoted string literal.
SYMBOLS_QUOTE = ('"', "'")

# Integer tags carried as the first element of every emitted token.
(
    TOKEN_STRING,
    TOKEN_NUMBER,
    TOKEN_KEYWORD,
    TOKEN_PARENTHESES_OPEN,
    TOKEN_PARENTHESES_CLOSE,
    TOKEN_COMMA,
) = range(6)

# Symbols that terminate the word being accumulated by the tokenizer.
# The value is the token type emitted for the symbol itself; None marks a
# pure separator that ends the current word without producing a token.
TOKEN_SYMBOLS_TABLE = {
    '(': TOKEN_PARENTHESES_OPEN,
    ')': TOKEN_PARENTHESES_CLOSE,
    ',': TOKEN_COMMA,
    ' ': None,
}
def _sdataToToken(sdata):
    """Turn the buffered characters of one word into a token.

    Args:
        sdata: sequence of single-character strings making up the word.

    Returns:
        None when ``sdata`` is empty. Otherwise ``(TOKEN_NUMBER, value)``
        when the word parses as an ``int`` or, failing that, a ``float``;
        and ``(TOKEN_KEYWORD, text)`` for everything else.
    """
    if not sdata:
        return None

    token = ''.join(sdata)

    # float() also accepts digit-free spellings such as "nan", "inf" and
    # "infinity"; those are ordinary words here, so only attempt a numeric
    # conversion when the word actually contains a digit.
    if any(ch.isdigit() for ch in token):
        # int is tried first so that "12" stays an integer rather than 12.0.
        for t in (int, float):
            try:
                return TOKEN_NUMBER, t(token)
            except ValueError:
                pass

    return TOKEN_KEYWORD, token
def _iterChars(source):
    """Yield the characters of ``source`` one at a time.

    ``source`` is either a file-like object (anything exposing ``read``,
    consumed with ``read(1)`` until it returns an empty result) or any
    iterable of single characters, such as a string.
    """
    read = getattr(source, 'read', None)
    if read is None:
        for c in source:
            yield c
        return

    while True:
        c = read(1)
        if not c:
            return
        yield c


def tokenize(query):
    """Lazily split ``query`` into ``(token_type, value)`` pairs.

    Args:
        query: a string, or a file-like object that is read one character
            at a time.

    Yields:
        ``(TOKEN_STRING, text)`` for each quoted string (quotes removed),
        whatever ``_sdataToToken`` returns for each unquoted word, and
        ``(token_type, char)`` for each symbol in ``TOKEN_SYMBOLS_TABLE``
        whose token type is not None. Only characters listed in that table
        separate words; every other character becomes part of the word.

    Raises:
        ValueError: if the input ends inside a quoted string.
    """
    # The quote character that opened the string being read, or None when
    # outside a string. Tracking it lets the other quote character appear
    # inside the string (e.g. "it's").
    quote = None
    sdata = []

    for c in _iterChars(query):
        if quote is not None:
            if c == quote:
                yield TOKEN_STRING, ''.join(sdata)
                sdata = []
                quote = None
            else:
                sdata.append(c)
            continue

        if c in SYMBOLS_QUOTE:
            # Emit any word that directly precedes the quote so its
            # characters are not absorbed into the string.
            stoken = _sdataToToken(sdata)
            sdata = []
            if stoken is not None:
                yield stoken
            quote = c
            continue

        # A table key may be a single character or a group of characters;
        # membership covers both cases.
        for sym, sym_token in TOKEN_SYMBOLS_TABLE.items():
            if c in sym:
                break
        else:
            # Not a symbol: keep accumulating the current word.
            sdata.append(c)
            continue

        # A symbol ends the current word; emit the word, then the symbol
        # itself unless it is a pure separator (token type None).
        stoken = _sdataToToken(sdata)
        sdata = []
        if stoken is not None:
            yield stoken
        if sym_token is not None:
            yield sym_token, c

    if quote is not None:
        raise ValueError("Missing end quote")

    # Flush the trailing word, if any.
    stoken = _sdataToToken(sdata)
    if stoken is not None:
        yield stoken
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment