ambv · April 23, 2018 00:34
diff --git a/tokenize.diff b/tokenize.diff
 --- Lib/tokenize.py	2018-04-22 17:33:48.000000000 -0700
 +++ Lib/lib2to3/pgen2/tokenize.py	2018-04-22 17:32:55.000000000 -0700
 @@ -31,14 +31,15 @@
 import itertools as _itertools
 import re
 import sys
 -from token import *
 +from .token import *
 
 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 -import token
 +from . import token
 __all__ = token.__all__ + ["tokenize", "detect_encoding",
 -                           "untokenize", "TokenInfo"]
 +                           "untokenize", "TokenInfo",
 +                           "generate_tokens"]
 del token
 
 EXACT_TOKEN_TYPES = {
 @@ -114,10 +115,10 @@
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
 Name = r'\w+'
 
 -Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
 +Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+[lL]?'
 Binnumber = r'0[bB](?:_?[01])+'
 -Octnumber = r'0[oO](?:_?[0-7])+'
 -Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
 +Octnumber = r'0[oO]?(?:_?[0-7])+[lL]?'
 +Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)[lL]?'
 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
 Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
 Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
 @@ -134,7 +135,7 @@
     #  'rf'). The various permutations will be generated.
     _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
     # if we add binary f-strings, add: ['fb', 'fbr']
 -    result = {''}
 +    result = {'', 'ur', 'Ur', 'uR', 'UR'}
     for prefix in _valid_string_prefixes:
         for t in _itertools.permutations(prefix):
             # create a list with upper and lower versions of each
 @@ -167,12 +168,13 @@
 # longest operators first (e.g., if = came before ==, == would get
 # recognized as two instances of =).
 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
 +                 r"<>",
                  r"//=?", r"->",
                  r"[+\-*/%&@|^=<>]=?",
                  r"~")
 
 Bracket = '[][(){}]'
 -Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
 +Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@`]')
 Funny = group(Operator, Bracket, Special)
 
 PlainToken = group(Number, Funny, String, Name)
 @@ -283,7 +285,7 @@
                 self.encoding = tokval
                 continue
 
 -            if toknum in (NAME, NUMBER):
 +            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                 tokval += ' '
 
             # Insert a space between two consecutive strings
 @@ -455,7 +457,7 @@
         raise
 
 
 -def tokenize(readline):
 +def tokenize(readline, tokeneater=None):
     """
     The tokenize() generator requires one argument, readline, which
     must be a callable object which provides the same interface as the
 @@ -473,7 +475,21 @@
 
     The first token sequence will always be an ENCODING token
     which tells you which encoding was used to decode the bytes stream.
 +
 +    The `tokeneater` argument is deprecated and intentionally undocumented.
     """
 +    if tokeneater:
 +        import warnings
 +        warnings.warn("The `tokeneater` argument to tokenize() is deprecated. "
 +                      "Use `for token in tokenize(readline): tokeneater` "
 +                      "instead. Note: readline should return bytes.",
 +                      PendingDeprecationWarning)  # message first, then category
 +        try:
 +            tokenize_loop(readline, tokeneater)
 +        except StopTokenizing:  # ends the callback loop without an error
 +            pass
 +        return  # tokens were handed to tokeneater; nothing is returned
 +
     encoding, consumed = detect_encoding(readline)
     empty = _itertools.repeat(b"")
     rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
 @@ -487,6 +503,12 @@
     contline = None
     indents = [0]
 
 +    # 'stashed' and 'async_*' are used for async/await parsing
 +    stashed = None
 +    async_def = False
 +    async_def_indent = 0
 +    async_def_nl = False
 +
     if encoding is not None:
         if encoding == "utf-8-sig":
             # BOM will already have been stripped.
 @@ -540,13 +562,16 @@
             if pos == max:
                 break
 
 +            if stashed:
 +                yield stashed
 +                stashed = None
 +
             if line[pos] in '#\r\n':           # skip comments or blank lines
                 if line[pos] == '#':
                     comment_token = line[pos:].rstrip('\r\n')
                     yield TokenInfo(COMMENT, comment_token,
                            (lnum, pos), (lnum, pos + len(comment_token)), line)
                     pos += len(comment_token)
 -
                 yield TokenInfo(NL, line[pos:],
                            (lnum, pos), (lnum, len(line)), line)
                 continue
 @@ -561,8 +586,18 @@
                         ("<tokenize>", lnum, pos, line))
                 indents = indents[:-1]
 
 +                if async_def and async_def_indent >= indents[-1]:
 +                    async_def = False
 +                    async_def_nl = False
 +                    async_def_indent = 0
 +
                 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
 
 +            if async_def and async_def_nl and async_def_indent >= indents[-1]:
 +                async_def = False
 +                async_def_nl = False
 +                async_def_indent = 0
 +
         else:                                  # continued statement
             if not line:
                 raise TokenError("EOF in multi-line statement", (lnum, 0))
 @@ -581,13 +616,21 @@
                     (initial == '.' and token != '.' and token != '...')):
                     yield TokenInfo(NUMBER, token, spos, epos, line)
                 elif initial in '\r\n':
 +                    newline = NEWLINE
                     if parenlev > 0:
 -                        yield TokenInfo(NL, token, spos, epos, line)
 -                    else:
 -                        yield TokenInfo(NEWLINE, token, spos, epos, line)
 +                        newline = NL
 +                    elif async_def:
 +                        async_def_nl = True
 +                    if stashed:
 +                        yield stashed
 +                        stashed = None
 +                    yield TokenInfo(newline, token, spos, epos, line)
 
                 elif initial == '#':
                     assert not token.endswith("\n")
 +                    if stashed:
 +                        yield stashed
 +                        stashed = None
                     yield TokenInfo(COMMENT, token, spos, epos, line)
 
                 elif token in triple_quoted:
 @@ -596,6 +639,9 @@
                     if endmatch:                           # all on one line
                         pos = endmatch.end(0)
                         token = line[start:pos]
 +                        if stashed:
 +                            yield stashed
 +                            stashed = None
                         yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                     else:
                         strstart = (lnum, start)           # multiple lines
 @@ -631,23 +677,65 @@
                         contline = line
                         break
                     else:                                  # ordinary string
 +                        if stashed:
 +                            yield stashed
 +                            stashed = None
                         yield TokenInfo(STRING, token, spos, epos, line)
 -
                 elif initial.isidentifier():               # ordinary name
 -                    yield TokenInfo(NAME, token, spos, epos, line)
 +                    if token in ('async', 'await'):
 +                        if async_def:
 +                            yield TokenInfo(ASYNC if token == 'async' else AWAIT,
 +                                   token, spos, epos, line)
 +                            continue
 +
 +                    tok = TokenInfo(NAME, token, spos, epos, line)
 +                    if token == 'async' and not stashed:
 +                        stashed = tok
 +                        continue
 +
 +                    if token == 'def':
 +                        if (stashed
 +                                and stashed[0] == NAME
 +                                and stashed[1] == 'async'):
 +
 +                            async_def = True
 +                            async_def_indent = indents[-1]
 +
 +                            yield TokenInfo(ASYNC, stashed[1],
 +                                   stashed[2], stashed[3],
 +                                   stashed[4])
 +                            stashed = None
 +
 +                    if stashed:
 +                        yield stashed
 +                        stashed = None
 +
 +                    yield tok
                 elif initial == '\\':                      # continued stmt
 +                    # This yield is new; needed for better idempotency:
 +                    if stashed:
 +                        yield stashed
 +                        stashed = None
 +                    yield TokenInfo(NL, token, spos, (lnum, pos), line)
                     continued = 1
                 else:
                     if initial in '([{':
                         parenlev += 1
                     elif initial in ')]}':
                         parenlev -= 1
 +                    if stashed:
 +                        yield stashed
 +                        stashed = None
                     yield TokenInfo(OP, token, spos, epos, line)
             else:
                 yield TokenInfo(ERRORTOKEN, line[pos],
                            (lnum, pos), (lnum, pos+1), line)
                 pos += 1
 
 +    if stashed:
 +        yield stashed
 +        stashed = None
 +
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 @@ -658,6 +746,22 @@
 def generate_tokens(readline):
     return _tokenize(readline, None)
 
 +
 +# Undocumented, backwards-compatible callback API: feeds every token from
 +# generate_tokens(readline) to tokeneater as five positional arguments.
 +def tokenize_loop(readline, tokeneater):
 +    for kind, text, start, end, src_line in generate_tokens(readline):
 +        tokeneater(kind, text, start, end, src_line)
 +
 +
 +# Undocumented, backwards-compatible token eater: prints one token's span.
 +def printtoken(type, token, spos, epos, line):
 +    # `line` is accepted but not used by this function.
 +    (start_row, start_col), (end_row, end_col) = spos, epos
 +    span = (start_row, start_col, end_row, end_col)
 +    print("%d,%d-%d,%d:\t%s\t%s" % (span + (tok_name[type], repr(token))))
 +
 +
 def main():
     import argparse
	--- Lib/tokenize.py 2018-04-22 17:33:48.000000000 -0700
	+++ Lib/lib2to3/pgen2/tokenize.py 2018-04-22 17:32:55.000000000 -0700
	@@ -31,14 +31,15 @@
	import itertools as _itertools
	import re
	import sys
	-from token import *
	+from .token import *

	cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
	blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

	-import token
	+from . import token
	__all__ = token.__all__ + ["tokenize", "detect_encoding",
	- "untokenize", "TokenInfo"]
	+ "untokenize", "TokenInfo",
	+ "generate_tokens"]
	del token

	EXACT_TOKEN_TYPES = {
	@@ -114,10 +115,10 @@
	Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
	Name = r'\w+'

	-Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
	+Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+[lL]?'
	Binnumber = r'0[bB](?:_?[01])+'
	-Octnumber = r'0[oO](?:_?[0-7])+'
	-Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
	+Octnumber = r'0[oO]?(?:_?[0-7])+[lL]?'
	+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)[lL]?'
	Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
	Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
	Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
	@@ -134,7 +135,7 @@
	# 'rf'). The various permutations will be generated.
	_valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
	# if we add binary f-strings, add: ['fb', 'fbr']
	- result = {''}
	+ result = {'', 'ur', 'Ur', 'uR', 'UR'}
	for prefix in _valid_string_prefixes:
	for t in _itertools.permutations(prefix):
	# create a list with upper and lower versions of each
	@@ -167,12 +168,13 @@
	# longest operators first (e.g., if = came before ==, == would get
	# recognized as two instances of =).
	Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
	+ r"<>",
	r"//=?", r"->",
	r"[+\-*/%&@|^=<>]=?",
	r"~")

	Bracket = '[][(){}]'
	-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
	+Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@`]')
	Funny = group(Operator, Bracket, Special)

	PlainToken = group(Number, Funny, String, Name)
	@@ -283,7 +285,7 @@
	self.encoding = tokval
	continue

	- if toknum in (NAME, NUMBER):
	+ if toknum in (NAME, NUMBER, ASYNC, AWAIT):
	tokval += ' '

	# Insert a space between two consecutive strings
	@@ -455,7 +457,7 @@
	raise


	-def tokenize(readline):
	+def tokenize(readline, tokeneater=None):
	"""
	The tokenize() generator requires one argument, readline, which
	must be a callable object which provides the same interface as the
	@@ -473,7 +475,21 @@

	The first token sequence will always be an ENCODING token
	which tells you which encoding was used to decode the bytes stream.
	+
	+ The `tokeneater` argument is deprecated and intentionally undocumented.
	"""
	+    if tokeneater:
	+        import warnings
	+        warnings.warn("The `tokeneater` argument to tokenize() is deprecated. "
	+                      "Use `for token in tokenize(readline): tokeneater` "
	+                      "instead. Note: readline should return bytes.",
	+                      PendingDeprecationWarning)  # message first, then category
	+        try:
	+            tokenize_loop(readline, tokeneater)
	+        except StopTokenizing:  # ends the callback loop without an error
	+            pass
	+        return  # tokens were handed to tokeneater; nothing is returned
	+
	encoding, consumed = detect_encoding(readline)
	empty = _itertools.repeat(b"")
	rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
	@@ -487,6 +503,12 @@
	contline = None
	indents = [0]

	+ # 'stashed' and 'async_*' are used for async/await parsing
	+ stashed = None
	+ async_def = False
	+ async_def_indent = 0
	+ async_def_nl = False
	+
	if encoding is not None:
	if encoding == "utf-8-sig":
	# BOM will already have been stripped.
	@@ -540,13 +562,16 @@
	if pos == max:
	break

	+ if stashed:
	+ yield stashed
	+ stashed = None
	+
	if line[pos] in '#\r\n': # skip comments or blank lines
	if line[pos] == '#':
	comment_token = line[pos:].rstrip('\r\n')
	yield TokenInfo(COMMENT, comment_token,
	(lnum, pos), (lnum, pos + len(comment_token)), line)
	pos += len(comment_token)
	-
	yield TokenInfo(NL, line[pos:],
	(lnum, pos), (lnum, len(line)), line)
	continue
	@@ -561,8 +586,18 @@
	("<tokenize>", lnum, pos, line))
	indents = indents[:-1]

	+ if async_def and async_def_indent >= indents[-1]:
	+ async_def = False
	+ async_def_nl = False
	+ async_def_indent = 0
	+
	yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

	+ if async_def and async_def_nl and async_def_indent >= indents[-1]:
	+ async_def = False
	+ async_def_nl = False
	+ async_def_indent = 0
	+
	else: # continued statement
	if not line:
	raise TokenError("EOF in multi-line statement", (lnum, 0))
	@@ -581,13 +616,21 @@
	(initial == '.' and token != '.' and token != '...')):
	yield TokenInfo(NUMBER, token, spos, epos, line)
	elif initial in '\r\n':
	+ newline = NEWLINE
	if parenlev > 0:
	- yield TokenInfo(NL, token, spos, epos, line)
	- else:
	- yield TokenInfo(NEWLINE, token, spos, epos, line)
	+ newline = NL
	+ elif async_def:
	+ async_def_nl = True
	+ if stashed:
	+ yield stashed
	+ stashed = None
	+ yield TokenInfo(newline, token, spos, epos, line)

	elif initial == '#':
	assert not token.endswith("\n")
	+ if stashed:
	+ yield stashed
	+ stashed = None
	yield TokenInfo(COMMENT, token, spos, epos, line)

	elif token in triple_quoted:
	@@ -596,6 +639,9 @@
	if endmatch: # all on one line
	pos = endmatch.end(0)
	token = line[start:pos]
	+ if stashed:
	+ yield stashed
	+ stashed = None
	yield TokenInfo(STRING, token, spos, (lnum, pos), line)
	else:
	strstart = (lnum, start) # multiple lines
	@@ -631,23 +677,65 @@
	contline = line
	break
	else: # ordinary string
	+ if stashed:
	+ yield stashed
	+ stashed = None
	yield TokenInfo(STRING, token, spos, epos, line)
	-
	elif initial.isidentifier(): # ordinary name
	- yield TokenInfo(NAME, token, spos, epos, line)
	+ if token in ('async', 'await'):
	+ if async_def:
	+ yield TokenInfo(ASYNC if token == 'async' else AWAIT,
	+ token, spos, epos, line)
	+ continue
	+
	+ tok = TokenInfo(NAME, token, spos, epos, line)
	+ if token == 'async' and not stashed:
	+ stashed = tok
	+ continue
	+
	+ if token == 'def':
	+ if (stashed
	+ and stashed[0] == NAME
	+ and stashed[1] == 'async'):
	+
	+ async_def = True
	+ async_def_indent = indents[-1]
	+
	+ yield TokenInfo(ASYNC, stashed[1],
	+ stashed[2], stashed[3],
	+ stashed[4])
	+ stashed = None
	+
	+ if stashed:
	+ yield stashed
	+ stashed = None
	+
	+ yield tok
	elif initial == '\\': # continued stmt
	+ # This yield is new; needed for better idempotency:
	+ if stashed:
	+ yield stashed
	+ stashed = None
	+ yield TokenInfo(NL, token, spos, (lnum, pos), line)
	continued = 1
	else:
	if initial in '([{':
	parenlev += 1
	elif initial in ')]}':
	parenlev -= 1
	+ if stashed:
	+ yield stashed
	+ stashed = None
	yield TokenInfo(OP, token, spos, epos, line)
	else:
	yield TokenInfo(ERRORTOKEN, line[pos],
	(lnum, pos), (lnum, pos+1), line)
	pos += 1

	+ if stashed:
	+ yield stashed
	+ stashed = None
	+
	for indent in indents[1:]: # pop remaining indent levels
	yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
	yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
	@@ -658,6 +746,22 @@
	def generate_tokens(readline):
	return _tokenize(readline, None)

	+
	+# An undocumented, backwards compatible API: calls tokeneater(*token_info)
	+# for every token produced by generate_tokens(readline).
	+def tokenize_loop(readline, tokeneater):
	+    for token_info in generate_tokens(readline):
	+        tokeneater(*token_info)
	+
	+
	+# An undocumented, backwards compatible token eater; `line` is not used.
	+def printtoken(type, token, spos, epos, line):
	+    srow, scol = spos
	+    erow, ecol = epos
	+    print("%d,%d-%d,%d:\t%s\t%s" % \
	+        (srow, scol, erow, ecol, tok_name[type], repr(token)))
	+
	+
	def main():
	import argparse