Unified diff between Lib/tokenize.py and Lib/lib2to3/pgen2/tokenize.py. The lib2to3 side differs from the stdlib tokenizer in a few visible ways: it uses package-relative imports, it still tolerates Python 2 syntax (l/L long-integer suffixes, leading-zero octal literals, ur string prefixes, the <> operator, and backticks), it emits dedicated ASYNC/AWAIT tokens by stashing an 'async' NAME token until it can tell whether an 'async def' follows, and it exposes generate_tokens() plus the older callback-style tokenize(readline, tokeneater) entry point.
--- Lib/tokenize.py 2018-04-22 17:33:48.000000000 -0700
+++ Lib/lib2to3/pgen2/tokenize.py 2018-04-22 17:32:55.000000000 -0700
@@ -31,14 +31,15 @@
import itertools as _itertools
import re
import sys
-from token import *
+from .token import *
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
-import token
+from . import token
__all__ = token.__all__ + ["tokenize", "detect_encoding",
- "untokenize", "TokenInfo"]
+ "untokenize", "TokenInfo",
+ "generate_tokens"]
del token
EXACT_TOKEN_TYPES = {
@@ -114,10 +115,10 @@
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'
-Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+[lL]?'
Binnumber = r'0[bB](?:_?[01])+'
-Octnumber = r'0[oO](?:_?[0-7])+'
-Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
+Octnumber = r'0[oO]?(?:_?[0-7])+[lL]?'
+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
@@ -134,7 +135,7 @@
# 'rf'). The various permutations will be generated.
_valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
# if we add binary f-strings, add: ['fb', 'fbr']
- result = {''}
+ result = {'', 'ur', 'Ur', 'uR', 'UR'}
for prefix in _valid_string_prefixes:
for t in _itertools.permutations(prefix):
# create a list with upper and lower versions of each
@@ -167,12 +168,13 @@
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
+ r"<>",
r"//=?", r"->",
r"[+\-*/%&@|^=<>]=?",
r"~")
Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
+Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@`]')
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
@@ -283,7 +285,7 @@
self.encoding = tokval
continue
- if toknum in (NAME, NUMBER):
+ if toknum in (NAME, NUMBER, ASYNC, AWAIT):
tokval += ' '
# Insert a space between two consecutive strings
@@ -455,7 +457,7 @@
raise
-def tokenize(readline):
+def tokenize(readline, tokeneater=None):
"""
The tokenize() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
@@ -473,7 +475,21 @@
The first token sequence will always be an ENCODING token
which tells you which encoding was used to decode the bytes stream.
+
+ The `tokeneater` argument is deprecated and intentionally undocumented.
"""
+ if tokeneater:
+ import warnings
+ warnings.warn("The `tokeneater` argument to tokenize() is deprecated. "
+ "Use `for token in tokenize(readline): tokeneater` "
+ "instead. Note: readline should return bytes.",
+ PendingDeprecationWarning)
+ try:
+ tokenize_loop(readline, tokeneater)
+ except StopTokenizing:
+ pass
+ return
+
encoding, consumed = detect_encoding(readline)
empty = _itertools.repeat(b"")
rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
@@ -487,6 +503,12 @@
contline = None
indents = [0]
+ # 'stashed' and 'async_*' are used for async/await parsing
+ stashed = None
+ async_def = False
+ async_def_indent = 0
+ async_def_nl = False
+
if encoding is not None:
if encoding == "utf-8-sig":
# BOM will already have been stripped.
@@ -540,13 +562,16 @@
if pos == max:
break
+ if stashed:
+ yield stashed
+ stashed = None
+
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
yield TokenInfo(COMMENT, comment_token,
(lnum, pos), (lnum, pos + len(comment_token)), line)
pos += len(comment_token)
-
yield TokenInfo(NL, line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
@@ -561,8 +586,18 @@
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
+ if async_def and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
+ if async_def and async_def_nl and async_def_indent >= indents[-1]:
+ async_def = False
+ async_def_nl = False
+ async_def_indent = 0
+
else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
@@ -581,13 +616,21 @@
(initial == '.' and token != '.' and token != '...')):
yield TokenInfo(NUMBER, token, spos, epos, line)
elif initial in '\r\n':
+ newline = NEWLINE
if parenlev > 0:
- yield TokenInfo(NL, token, spos, epos, line)
- else:
- yield TokenInfo(NEWLINE, token, spos, epos, line)
+ newline = NL
+ elif async_def:
+ async_def_nl = True
+ if stashed:
+ yield stashed
+ stashed = None
+ yield TokenInfo(newline, token, spos, epos, line)
elif initial == '#':
assert not token.endswith("\n")
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(COMMENT, token, spos, epos, line)
elif token in triple_quoted:
@@ -596,6 +639,9 @@
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
@@ -631,23 +677,65 @@
contline = line
break
else: # ordinary string
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(STRING, token, spos, epos, line)
-
elif initial.isidentifier(): # ordinary name
- yield TokenInfo(NAME, token, spos, epos, line)
+ if token in ('async', 'await'):
+ if async_def:
+ yield TokenInfo(ASYNC if token == 'async' else AWAIT,
+ token, spos, epos, line)
+ continue
+
+ tok = TokenInfo(NAME, token, spos, epos, line)
+ if token == 'async' and not stashed:
+ stashed = tok
+ continue
+
+ if token == 'def':
+ if (stashed
+ and stashed[0] == NAME
+ and stashed[1] == 'async'):
+
+ async_def = True
+ async_def_indent = indents[-1]
+
+ yield TokenInfo(ASYNC, stashed[1],
+ stashed[2], stashed[3],
+ stashed[4])
+ stashed = None
+
+ if stashed:
+ yield stashed
+ stashed = None
+
+ yield tok
elif initial == '\\': # continued stmt
+ # This yield is new; needed for better idempotency:
+ if stashed:
+ yield stashed
+ stashed = None
+ yield TokenInfo(NL, token, spos, (lnum, pos), line)
continued = 1
else:
if initial in '([{':
parenlev += 1
elif initial in ')]}':
parenlev -= 1
+ if stashed:
+ yield stashed
+ stashed = None
yield TokenInfo(OP, token, spos, epos, line)
else:
yield TokenInfo(ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos+1), line)
pos += 1
+ if stashed:
+ yield stashed
+ stashed = None
+
for indent in indents[1:]: # pop remaining indent levels
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
@@ -658,6 +746,22 @@
def generate_tokens(readline):
return _tokenize(readline, None)
+
+# An undocumented, backwards compatible, API for users looping over tokens
+# with a callback.
+def tokenize_loop(readline, tokeneater):
+ for token_info in generate_tokens(readline):
+ tokeneater(*token_info)
+
+
+# An undocumented, backwards compatible token eater.
+def printtoken(type, token, spos, epos, line):
+ srow, scol = spos
+ erow, ecol = epos
+ print("%d,%d-%d,%d:\t%s\t%s" % \
+ (srow, scol, erow, ecol, tok_name[type], repr(token)))
+
+
def main():
import argparse
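
For context, here is a minimal sketch (not part of the diff) of how the two tokenizers being compared are typically driven. The lib2to3 names used below (generate_tokens, the callback-style tokenize(readline, tokeneater), tok_name) exist in lib2to3.pgen2 as shipped with Python at the time of this diff; lib2to3 has since been deprecated and later removed, so treat this purely as an illustration. The sample source string is made up.

# Sketch: exercising the lib2to3 tokenizer (right-hand side of the diff).
# Its generate_tokens() takes a readline callable returning str and yields
# 5-tuples; 'async'/'await' inside an 'async def' come back as ASYNC/AWAIT.
import io
from lib2to3.pgen2 import tokenize as l2t_tokenize
from lib2to3.pgen2.token import tok_name

source = "async def fetch(url):\n    data = await get(url)\n    return data\n"

for tok_type, tok_str, start, end, _line in l2t_tokenize.generate_tokens(
        io.StringIO(source).readline):
    print(start, end, tok_name[tok_type], repr(tok_str))

# The same module also offers the callback ("token eater") style that the
# deprecation branch in the diff above keeps for backwards compatibility.
def eater(tok_type, tok_str, start, end, line):
    print("callback:", tok_name[tok_type], repr(tok_str))

l2t_tokenize.tokenize(io.StringIO(source).readline, eater)

# The stdlib counterpart (left-hand side) instead wants a *bytes* readline
# and yields TokenInfo named tuples, starting with an ENCODING token.
import tokenize as std_tokenize
for tok in std_tokenize.tokenize(io.BytesIO(source.encode("utf-8")).readline):
    print(tok)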