import tokenize
from io import BytesIO
from typing import List, Tuple
import keyword
from codeop import compile_command
from _suggestions import _generate_suggestions
from textwrap import dedent


class StatementFinder:
    """Finds the smallest surrounding statement given a line number and Python source."""

    def __init__(self, source: str, line_number: int):
        self.source = source
        self.line_number = line_number
        self.statement_tokens = []      # tokens of the statement currently being collected
        self.all_statements = []        # every complete statement seen so far
        self.statement_brackets = []    # stack of currently open bracket characters
        self.begin_brackets = []        # brackets opened at or before the bad token
        self.end_bracket = None         # closing bracket token most recently seen
        self.bad_token = None           # meaningful token found on the target line
        self.prev_token = None          # last meaningful (non-comment) token seen
        # Tokens that should begin a statement and cannot occur within brackets
        self.should_begin_statement = [
            "assert", "async", "await", "break", "class", "continue",
            "def", "del", "elif", "except", "finally", "global",
            "import", "nonlocal", "pass", "raise", "return", "try",
            "with", "while", "yield",
        ]

    def find_statement(self) -> str:
        """Find and return the smallest complete statement containing the target line."""
        source_tokens = self._get_source_tokens()
        self._obtain_statement(source_tokens)
        return self._reconstruct_statement()

    def _get_source_tokens(self) -> List[tokenize.TokenInfo]:
        """Get all tokens from the source."""
        return list(tokenize.tokenize(BytesIO(self.source.encode('utf-8')).readline))

    def _obtain_statement(self, source_tokens: List[tokenize.TokenInfo]) -> None:
        """Find the statement containing the target line number."""
        previous_row = -1
        previous_token = None
        continuation_line = False
        last_line_to_include = self.line_number

        for token in source_tokens:
            if (token.start[0] > last_line_to_include and
                    not continuation_line and
                    not self.statement_brackets):
                break

            if token.start[0] > previous_row:
                if previous_token is not None:
                    continuation_line = previous_token.line.endswith("\\\n")
                if (token.start[0] <= last_line_to_include and
                        not self.statement_brackets):
                    if self.statement_tokens:
                        self.all_statements.append(self.statement_tokens[:])
                    self.statement_tokens = []
                    self.begin_brackets = []
                if token.start[0] > last_line_to_include and self.statement_brackets:
                    last_line_to_include = token.start[0]
                previous_row = token.start[0]

            self.statement_tokens.append(token)

            if (token.start[0] == self.line_number and
                    token.string.strip() and
                    token.type != tokenize.COMMENT):
                self.bad_token = token
            if token.string.strip() and token.type != tokenize.COMMENT:
                self.prev_token = token
            previous_token = token

            # Check if we have a statement-starting token in the middle of brackets
            if (self.bad_token and
                    self.bad_token.string in self.should_begin_statement and
                    self.bad_token != self.statement_tokens[0] and
                    self.statement_brackets):
                break

            if not token.string or token.string not in "()[]}{":
                continue

            if token.string in "([{":
                self.statement_brackets.append(token.string)
                if self.bad_token is None or self.bad_token is token:
                    self.begin_brackets.append(token)
            elif token.string in ")]}":
                self.end_bracket = token
                if not self.statement_brackets:
                    break
                open_bracket = self.statement_brackets.pop()
                if not self._matching_brackets(open_bracket, token.string):
                    self.statement_brackets.append(open_bracket)
                    break
                if self.begin_brackets and self.bad_token is None:
                    self.begin_brackets.pop()
                self.end_bracket = None

        if self.statement_tokens:
            last_line = self._untokenize(self.statement_tokens)
            if last_line.strip():
                self.all_statements.append(self.statement_tokens)
        elif self.all_statements:
            self.statement_tokens = self.all_statements[-1]

    def _matching_brackets(self, open_bracket: str, close_bracket: str) -> bool:
        """Check if brackets match."""
        return (open_bracket == '(' and close_bracket == ')') or \
               (open_bracket == '[' and close_bracket == ']') or \
               (open_bracket == '{' and close_bracket == '}')

    def _untokenize(self, tokens: List[tokenize.TokenInfo]) -> str:
        """Convert tokens back to source code."""
        return ''.join(token.string for token in tokens)

    def _reconstruct_statement(self) -> str:
        """Extract the statement directly from the source using token positions."""
        if not self.statement_tokens:
            return ""

        # Get the first and last tokens
        first_token = self.statement_tokens[0]
        last_token = self.statement_tokens[-1]

        # Split source into lines
        lines = self.source.splitlines(keepends=True)

        # Get the relevant lines
        start_line = first_token.start[0] - 1  # convert to 0-based index
        end_line = last_token.end[0] - 1

        # Extract the statement
        statement_lines = []
        for i in range(start_line, end_line + 1):
            if i < len(lines):
                line = lines[i]
                if i == start_line:
                    # For first line, start from first token's column
                    line = line[first_token.start[1]:]
                if i == end_line:
                    # For last line, end at last token's column
                    line = line[:last_token.end[1]]
                statement_lines.append(line.rstrip())
        return '\n'.join(statement_lines)
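

# Standalone usage sketch (not part of the original gist): the snippet, line
# number, and the described result below are illustrative assumptions only.
#
#     snippet = "values = [\n    1,\n    2,\n]\n"
#     finder = StatementFinder(snippet, 2)   # line 2 sits inside the open brackets
#     finder.find_statement()                # returns the whole bracketed assignment,
#                                            # spanning lines 1 through 4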


def try_keyword_substitutions(syntax_error: SyntaxError, source: str | None) -> Tuple[str, Tuple[int, int, int, int], str] | None:
    """
    Try to find a valid Python statement by substituting keywords in the problematic code.

    Args:
        syntax_error: A SyntaxError exception containing the error information
        source: Optional source code. If None, will try to read from file.

    Returns:
        A tuple of (suggestion, (start_line, start_offset, end_line, end_offset), modified_statement)
        in the original source, or None if no valid modification was found
    """
    # Get the source file and line number
    filename = syntax_error.filename
    line_number = syntax_error.lineno
    if not filename or not isinstance(filename, str):
        return None
    if not line_number or not isinstance(line_number, int):
        return None

    if source is None:
        try:
            # Read the source file
            with open(filename, 'r', encoding='utf-8') as f:
                source = f.read()
        except (IOError, OSError):
            return None

    # Find the problematic statement
    finder = StatementFinder(source, line_number)
    statement = finder.find_statement()
    if not statement:
        return None

    # Get the first token's position in the original source
    first_token = finder.statement_tokens[0]
    source_start_line = first_token.start[0]
    source_start_col = first_token.start[1]

    # Tokenize the statement
    try:
        tokens = list(tokenize.tokenize(BytesIO(statement.encode('utf-8')).readline))
    except tokenize.TokenError:
        return None

    # Get the lines of the statement
    lines = statement.splitlines(keepends=True)

    # Try substituting each NAME token with Python keywords
    for token in tokens:
        if token.type != tokenize.NAME:
            continue

        # Calculate the position in the source string
        line_idx = token.start[0] - 1
        if line_idx >= len(lines):
            continue

        # Get the line and its start/end positions
        line = lines[line_idx]
        start_pos = token.start[1]
        end_pos = token.end[1]

        suggestion = _generate_suggestions(keyword.kwlist, token.string)
        if suggestion:
            # Create modified line by replacing the token
            modified_line = line[:start_pos] + suggestion + line[end_pos:]
            modified_lines = lines[:line_idx] + [modified_line] + lines[line_idx + 1:]
            modified_statement = ''.join(modified_lines)

            # Try to compile
            try:
                code = dedent(modified_statement)
                compile_command(code, "<string>", "exec")

                # Calculate positions in original source
                original_start_line = source_start_line + line_idx
                original_start_col = source_start_col + start_pos if line_idx == 0 else start_pos
                original_end_line = source_start_line + line_idx
                original_end_col = source_start_col + end_pos if line_idx == 0 else end_pos

                return (suggestion,
                        (original_start_line, original_start_col,
                         original_end_line, original_end_col),
                        modified_statement)
            except (SyntaxError, ValueError):
                continue

    return None


# Example usage
if __name__ == "__main__":
    code = """\
from os import path
class A:
    def foo():
        ...
    def bar():
        x = 1
        y = [x for x in
            range(3)]
        with (
            open("test.txt", "w") as f,
            open("test2.txt", "r") os g,
            open("test3.txt", "r") as h,
        ):
            pass
        z = 3
    def blech():
        ...
"""

    # Example of using the new function
    try:
        # This would normally come from a SyntaxError
        compile(code, "<string>", "exec")
    except SyntaxError as e:
        result = try_keyword_substitutions(e, code)
        if result:
            kw, (start_line, start_col, end_line, end_col), modified = result
            e.msg += f" Did you mean '{kw}'?"
            e.lineno, e.end_lineno = start_line, end_line
            e.offset, e.end_offset = start_col + 1, end_col + 1
        raise e from None
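
# Note (not in the original gist): with the sample source above, the stray "os"
# in the parenthesized with-statement is the kind of typo this pass is meant to
# catch; the exact keyword proposed depends on how _generate_suggestions ranks
# the nearby candidates, and the re-raised SyntaxError carries the suggestion in
# its message along with the adjusted location.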