@ambv
Last active April 23, 2018 00:34
Unified diff between Lib/tokenize.py and Lib/lib2to3/pgen2/tokenize.py
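Context for the diff: the `+` side is lib2to3's pgen2 tokenizer, which has to keep accepting Python 2 lexical constructs (long-integer suffixes, `<>`, backticks, `ur` string prefixes) that the stdlib tokenizer does not. A minimal sketch of that gap, assuming a CPython that still ships lib2to3 (deprecated since 3.9, removed in 3.13):

    from io import StringIO
    import tokenize
    from lib2to3.pgen2 import tokenize as l2t

    # A Python 2 long literal, per the Hexnumber change below.
    src = "x = 0xFFL\n"

    # lib2to3 tokenizes the long literal as a single NUMBER ...
    print([(l2t.tok_name[t[0]], t[1])
           for t in l2t.generate_tokens(StringIO(src).readline)])

    # ... while stdlib tokenize splits it into NUMBER '0xFF' + NAME 'L'.
    print([(tokenize.tok_name[t[0]], t[1])
           for t in tokenize.generate_tokens(StringIO(src).readline)])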
--- Lib/tokenize.py 2018-04-22 17:33:48.000000000 -0700
+++ Lib/lib2to3/pgen2/tokenize.py 2018-04-22 17:32:55.000000000 -0700
@@ -31,14 +31,15 @@
import itertools as _itertools
import re
import sys
-from token import *
+from .token import *
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
-import token
+from . import token
__all__ = token.__all__ + ["tokenize", "detect_encoding",
- "untokenize", "TokenInfo"]
+ "untokenize", "TokenInfo",
+ "generate_tokens"]
del token
EXACT_TOKEN_TYPES = {
@@ -114,10 +115,10 @@
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'
-Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+[lL]?'
Binnumber = r'0[bB](?:_?[01])+'
-Octnumber = r'0[oO](?:_?[0-7])+'
-Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
+Octnumber = r'0[oO]?(?:_?[0-7])+[lL]?'
+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
@@ -134,7 +135,7 @@
# 'rf'). The various permutations will be generated.
_valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
# if we add binary f-strings, add: ['fb', 'fbr']
- result = {''}
+ result = {'', 'ur', 'Ur', 'uR', 'UR'}
for prefix in _valid_string_prefixes:
for t in _itertools.permutations(prefix):
# create a list with upper and lower versions of each
@@ -167,12 +168,13 @@
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+ r"<>",
r"//=?", r"->",
r"[+\-*/%&@|^=<>]=?",
r"~")
Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@`]')
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
@@ -283,7 +285,7 @@
self.encoding = tokval
continue
- if toknum in (NAME, NUMBER):
+ if toknum in (NAME, NUMBER, ASYNC, AWAIT):
tokval += ' '
# Insert a space between two consecutive strings
@@ -455,7 +457,7 @@
raise
-def tokenize(readline):
+def tokenize(readline, tokeneater=None):
"""
The tokenize() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
@@ -473,7 +475,21 @@
The first token sequence will always be an ENCODING token
which tells you which encoding was used to decode the bytes stream.
+
+ The `tokeneater` argument is deprecated and intentionally undocumented.
"""
+ if tokeneater:
+ import warnings
+ warnings.warn("The `tokeneater` argument to tokenize() is deprecated. "
+ "Use `for token in tokenize(readline): tokeneater(*token)` "
+ "instead. Note: readline should return bytes.",
+ PendingDeprecationWarning)
+ try:
+ tokenize_loop(readline, tokeneater)
+ except StopTokenizing:
+ pass
+ return
+
encoding, consumed = detect_encoding(readline)
empty = _itertools.repeat(b"")
rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
@@ -487,6 +503,12 @@
contline = None
indents = [0]
+ # 'stashed' and 'async_*' are used for async/await parsing
+ stashed = None
+ async_def = False
+ async_def_indent = 0
+ async_def_nl = False
+
if encoding is not None:
if encoding == "utf-8-sig":
# BOM will already have been stripped.
@@ -540,13 +562,16 @@
if pos == max:
break
+ if stashed:
+ yield stashed
+ stashed = None
+
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
yield TokenInfo(COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
pos += len(comment_token)
-
yield TokenInfo(NL, line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
@@ -561,8 +586,18 @@
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
+ if async_def and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
+ if async_def and async_def_nl and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -581,13 +616,21 @@
(initial == '.' and token != '.' and token != '...')):
yield TokenInfo(NUMBER, token, spos, epos, line)
elif initial in '\r\n':
+ newline = NEWLINE
if parenlev > 0:
- yield TokenInfo(NL, token, spos, epos, line)
- else:
- yield TokenInfo(NEWLINE, token, spos, epos, line)
+ newline = NL
+ elif async_def:
+ async_def_nl = True
+ if stashed:
+ yield stashed
+ stashed = None
+ yield TokenInfo(newline, token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(COMMENT, token, spos, epos, line)
elif token in triple_quoted:
@@ -596,6 +639,9 @@
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
@@ -631,23 +677,65 @@
contline = line
break
else: # ordinary string
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(STRING, token, spos, epos, line)
-
elif initial.isidentifier(): # ordinary name
- yield TokenInfo(NAME, token, spos, epos, line)
+ if token in ('async', 'await'):
+ if async_def:
+ yield TokenInfo(ASYNC if token == 'async' else AWAIT,
+ token, spos, epos, line)
+ continue
+
+ tok = TokenInfo(NAME, token, spos, epos, line)
+ if token == 'async' and not stashed:
+ stashed = tok
+ continue
+
+ if token == 'def':
+ if (stashed
+ and stashed[0] == NAME
+ and stashed[1] == 'async'):
+
+ async_def = True
+ async_def_indent = indents[-1]
+
+ yield TokenInfo(ASYNC, stashed[1],
+ stashed[2], stashed[3],
+ stashed[4])
+ stashed = None
+
+ if stashed:
+ yield stashed
+ stashed = None
+
+ yield tok
elif initial == '\\': # continued stmt
+ # This yield is new; needed for better idempotency:
+ if stashed:
+ yield stashed
+ stashed = None
+ yield TokenInfo(NL, token, spos, (lnum, pos), line)
continued = 1
else:
if initial in '([{':
parenlev += 1
elif initial in ')]}':
parenlev -= 1
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(OP, token, spos, epos, line)
else:
yield TokenInfo(ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos+1), line)
pos += 1
+ if stashed:
+ yield stashed
+ stashed = None
+
for indent in indents[1:]: # pop remaining indent levels
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
@@ -658,6 +746,22 @@
def generate_tokens(readline):
return _tokenize(readline, None)
+
+# An undocumented, backwards-compatible API for users looping over tokens
+# with a callback.
+def tokenize_loop(readline, tokeneater):
+ for token_info in generate_tokens(readline):
+ tokeneater(*token_info)
+
+
+# An undocumented, backwards-compatible token eater.
+def printtoken(type, token, spos, epos, line):
+ srow, scol = spos
+ erow, ecol = epos
+ print("%d,%d-%d,%d:\t%s\t%s" % \
+ (srow, scol, erow, ecol, tok_name[type], repr(token)))
+
+
def main():
import argparse
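
For reference, the deprecated `tokeneater` branch above keeps lib2to3's original callback API alive; the warning points at the generator API instead. A sketch of the recommended replacement against today's stdlib tokenize, with a `tokeneater` callback shaped like printtoken() above (the names and sample source are illustrative):

    import io
    import tokenize

    SRC = b"def f():\n    return 1\n"

    def tokeneater(type, token, start, end, line):
        # Same 5-argument signature the callback API expects.
        print("%d,%d-%d,%d:\t%s\t%r"
              % (*start, *end, tokenize.tok_name[type], token))

    # Instead of tokenize(readline, tokeneater), drive the generator:
    for tok in tokenize.tokenize(io.BytesIO(SRC).readline):
        tokeneater(*tok)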
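And the `stashed`/`async_def` machinery is what lets `async` and `await` stay usable as ordinary identifiers outside coroutine definitions. A quick way to observe it, again assuming lib2to3 is available and its shipped tokenizer matches the `+` side here:

    from io import StringIO
    from lib2to3.pgen2 import tokenize as l2t

    # ASYNC/AWAIT tokens are emitted only around an 'async def'; the
    # top-level assignment keeps 'async' as a plain NAME.
    src = "async def f():\n    await g()\nasync = 1\n"
    for tok in l2t.generate_tokens(StringIO(src).readline):
        print(l2t.tok_name[tok[0]], repr(tok[1]))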