import tokenize
from io import BytesIO
from typing import List, Tuple
import keyword
from codeop import compile_command
from _suggestions import _generate_suggestions
from textwrap import dedent


class StatementFinder:
    """Finds the smallest surrounding statement given a line number and Python source."""

    def __init__(self, source: str, line_number: int):
        self.source = source
        self.line_number = line_number
        self.statement_tokens = []      # tokens of the statement currently being collected
        self.all_statements = []        # every complete statement seen so far
        self.statement_brackets = []    # stack of currently open bracket characters
        self.begin_brackets = []        # brackets opened at or before the bad token
        self.end_bracket = None         # closing bracket token most recently seen
        self.bad_token = None           # meaningful token found on the target line
        self.prev_token = None          # last meaningful (non-comment) token seen
        # Tokens that should begin a statement and cannot occur within brackets
        self.should_begin_statement = [
            "assert", "async", "await", "break", "class", "continue",
            "def", "del", "elif", "except", "finally", "global",
            "import", "nonlocal", "pass", "raise", "return", "try",
            "with", "while", "yield",
        ]

    def find_statement(self) -> str:
        """Find and return the smallest complete statement containing the target line."""
        source_tokens = self._get_source_tokens()
        self._obtain_statement(source_tokens)
        return self._reconstruct_statement()

    def _get_source_tokens(self) -> List[tokenize.TokenInfo]:
        """Get all tokens from the source."""
        return list(tokenize.tokenize(BytesIO(self.source.encode('utf-8')).readline))

    def _obtain_statement(self, source_tokens: List[tokenize.TokenInfo]) -> None:
        """Find the statement containing the target line number."""
        previous_row = -1
        previous_token = None
        continuation_line = False
        last_line_to_include = self.line_number

        for token in source_tokens:
            if (token.start[0] > last_line_to_include and
                    not continuation_line and
                    not self.statement_brackets):
                break

            if token.start[0] > previous_row:
                if previous_token is not None:
                    continuation_line = previous_token.line.endswith("\\\n")
                if (token.start[0] <= last_line_to_include and
                        not self.statement_brackets):
                    if self.statement_tokens:
                        self.all_statements.append(self.statement_tokens[:])
                    self.statement_tokens = []
                    self.begin_brackets = []
                if token.start[0] > last_line_to_include and self.statement_brackets:
                    last_line_to_include = token.start[0]
                previous_row = token.start[0]

            self.statement_tokens.append(token)

            if (token.start[0] == self.line_number and
                    token.string.strip() and
                    token.type != tokenize.COMMENT):
                self.bad_token = token
            if token.string.strip() and token.type != tokenize.COMMENT:
                self.prev_token = token
            previous_token = token

            # Check if we have a statement-starting token in the middle of brackets
            if (self.bad_token and
                    self.bad_token.string in self.should_begin_statement and
                    self.bad_token != self.statement_tokens[0] and
                    self.statement_brackets):
                break

            if not token.string or token.string not in "()[]}{":
                continue

            if token.string in "([{":
                self.statement_brackets.append(token.string)
                if self.bad_token is None or self.bad_token is token:
                    self.begin_brackets.append(token)
            elif token.string in ")]}":
                self.end_bracket = token
                if not self.statement_brackets:
                    break
                open_bracket = self.statement_brackets.pop()
                if not self._matching_brackets(open_bracket, token.string):
                    self.statement_brackets.append(open_bracket)
                    break
                if self.begin_brackets and self.bad_token is None:
                    self.begin_brackets.pop()
                self.end_bracket = None

        if self.statement_tokens:
            last_line = self._untokenize(self.statement_tokens)
            if last_line.strip():
                self.all_statements.append(self.statement_tokens)
        elif self.all_statements:
            self.statement_tokens = self.all_statements[-1]

    def _matching_brackets(self, open_bracket: str, close_bracket: str) -> bool:
        """Check if brackets match."""
        return (open_bracket == '(' and close_bracket == ')') or \
               (open_bracket == '[' and close_bracket == ']') or \
               (open_bracket == '{' and close_bracket == '}')

    def _untokenize(self, tokens: List[tokenize.TokenInfo]) -> str:
        """Convert tokens back to source code."""
        return ''.join(token.string for token in tokens)

    def _reconstruct_statement(self) -> str:
        """Extract the statement directly from the source using token positions."""
        if not self.statement_tokens:
            return ""

        # Get the first and last tokens
        first_token = self.statement_tokens[0]
        last_token = self.statement_tokens[-1]

        # Split source into lines
        lines = self.source.splitlines(keepends=True)

        # Get the relevant lines
        start_line = first_token.start[0] - 1  # convert to 0-based index
        end_line = last_token.end[0] - 1

        # Extract the statement
        statement_lines = []
        for i in range(start_line, end_line + 1):
            if i < len(lines):
                line = lines[i]
                if i == start_line:
                    # For first line, start from first token's column
                    line = line[first_token.start[1]:]
                if i == end_line:
                    # For last line, end at last token's column
                    line = line[:last_token.end[1]]
                statement_lines.append(line.rstrip())
        return '\n'.join(statement_lines)
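

# Standalone usage sketch (not part of the original gist): the snippet, line
# number, and the described result below are illustrative assumptions only.
#
#     snippet = "values = [\n    1,\n    2,\n]\n"
#     finder = StatementFinder(snippet, 2)   # line 2 sits inside the open brackets
#     finder.find_statement()                # returns the whole bracketed assignment,
#                                            # spanning lines 1 through 4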


def try_keyword_substitutions(syntax_error: SyntaxError, source: str | None) -> Tuple[str, Tuple[int, int, int, int], str] | None:
    """
    Try to find a valid Python statement by substituting keywords in the problematic code.

    Args:
        syntax_error: A SyntaxError exception containing the error information
        source: Optional source code. If None, will try to read from file.

    Returns:
        A tuple of (suggestion, (start_line, start_offset, end_line, end_offset), modified_statement)
        in the original source, or None if no valid modification was found
    """
    # Get the source file and line number
    filename = syntax_error.filename
    line_number = syntax_error.lineno
    if not filename or not isinstance(filename, str):
        return None
    if not line_number or not isinstance(line_number, int):
        return None

    if source is None:
        try:
            # Read the source file
            with open(filename, 'r', encoding='utf-8') as f:
                source = f.read()
        except (IOError, OSError):
            return None

    # Find the problematic statement
    finder = StatementFinder(source, line_number)
    statement = finder.find_statement()
    if not statement:
        return None

    # Get the first token's position in the original source
    first_token = finder.statement_tokens[0]
    source_start_line = first_token.start[0]
    source_start_col = first_token.start[1]

    # Tokenize the statement
    try:
        tokens = list(tokenize.tokenize(BytesIO(statement.encode('utf-8')).readline))
    except tokenize.TokenError:
        return None

    # Get the lines of the statement
    lines = statement.splitlines(keepends=True)

    # Try substituting each NAME token with Python keywords
    for token in tokens:
        if token.type != tokenize.NAME:
            continue

        # Calculate the position in the source string
        line_idx = token.start[0] - 1
        if line_idx >= len(lines):
            continue

        # Get the line and its start/end positions
        line = lines[line_idx]
        start_pos = token.start[1]
        end_pos = token.end[1]

        suggestion = _generate_suggestions(keyword.kwlist, token.string)
        if suggestion:
            # Create modified line by replacing the token
            modified_line = line[:start_pos] + suggestion + line[end_pos:]
            modified_lines = lines[:line_idx] + [modified_line] + lines[line_idx + 1:]
            modified_statement = ''.join(modified_lines)

            # Try to compile
            try:
                code = dedent(modified_statement)
                compile_command(code, "<string>", "exec")

                # Calculate positions in original source
                original_start_line = source_start_line + line_idx
                original_start_col = source_start_col + start_pos if line_idx == 0 else start_pos
                original_end_line = source_start_line + line_idx
                original_end_col = source_start_col + end_pos if line_idx == 0 else end_pos

                return (suggestion,
                        (original_start_line, original_start_col,
                         original_end_line, original_end_col),
                        modified_statement)
            except (SyntaxError, ValueError):
                continue

    return None


# Example usage
if __name__ == "__main__":
    code = """\
from os import path
class A:
    def foo():
        ...
    def bar():
        x = 1
        y = [x for x in
            range(3)]
        with (
            open("test.txt", "w") as f,
            open("test2.txt", "r") os g,
            open("test3.txt", "r") as h,
        ):
            pass
        z = 3
    def blech():
        ...
"""

    # Example of using the new function
    try:
        # This would normally come from a SyntaxError
        compile(code, "<string>", "exec")
    except SyntaxError as e:
        result = try_keyword_substitutions(e, code)
        if result:
            kw, (start_line, start_col, end_line, end_col), modified = result
            e.msg += f" Did you mean '{kw}'?"
            e.lineno, e.end_lineno = start_line, end_line
            e.offset, e.end_offset = start_col + 1, end_col + 1
        raise e from None
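
# Note (not in the original gist): with the sample source above, the stray "os"
# in the parenthesized with-statement is the kind of typo this pass is meant to
# catch; the exact keyword proposed depends on how _generate_suggestions ranks
# the nearby candidates, and the re-raised SyntaxError carries the suggestion in
# its message along with the adjusted location.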