Skip to content

Instantly share code, notes, and snippets.

@Emmanuerl
Created February 27, 2025 10:23
Show Gist options
  • Save Emmanuerl/00a462e3019f533425a436b804f6920e to your computer and use it in GitHub Desktop.
Save Emmanuerl/00a462e3019f533425a436b804f6920e to your computer and use it in GitHub Desktop.
This is a sample demonstration of a recursive XML broken attribute detector
import xml.etree.ElementTree as ET
import re
def find_broken_attributes(xml_string):
    """Find and report broken attributes in an XML string.

    Detection runs in two stages. A line-by-line regex scan looks for
    unquoted or unclosed attribute values first. Only when that scan finds
    nothing is the document parsed and every element checked recursively.

    Args:
        xml_string: The XML document text to inspect.

    Returns:
        A list of human-readable problem descriptions. The list is empty
        when neither stage found anything suspicious.
    """
    # Stage 1: textual heuristics that still work on XML the parser rejects.
    broken_attributes = find_broken_attributes_with_regex(xml_string)
    if broken_attributes:
        return broken_attributes

    # Stage 2: parse the document and walk the element tree.
    try:
        root = ET.fromstring(xml_string)
    except ET.ParseError as e:
        # The parser records the failing (line, column); prefer that over
        # scraping the number out of the exception message.
        position = getattr(e, "position", None)
        if position:
            line_number = position[0]
            lines = xml_string.split('\n')
            if 1 <= line_number <= len(lines):
                problematic_line = lines[line_number - 1].strip()
                return [f"Parsing error on line {line_number}: {problematic_line}"]
        # Never report "no problems" for a document that failed to parse,
        # even when the offending line cannot be located.
        return [f"XML parsing error: {e}"]

    return recursive_check_element(root)
def find_broken_attributes_with_regex(xml_string):
    """Find broken attributes using regular expressions.

    The text is scanned one line at a time (split on ``'\\n'``) for two
    patterns: an attribute whose value is not wrapped in quotes, and an
    attribute whose opening quote is never closed before the end of the line.

    Args:
        xml_string: The XML document text to inspect.

    Returns:
        A list of messages, each naming the 1-based line number of the
        finding. For any single line, unquoted-attribute messages come
        before the unclosed-value message.
    """
    # name=value where the value is NOT a complete "..." or '...' string;
    # the value runs up to the next whitespace or '>'.
    # NOTE: this is a purely textual heuristic, so "name=value" text inside
    # a properly quoted value (e.g. a URL query string) can also match.
    unquoted_pattern = re.compile(
        r'([a-zA-Z0-9_\-:]+)=(?!"[^"]*"|\'[^\']*\')(.*?)(?=\s|\>)'
    )
    # name= followed by an opening quote and no further quote before the
    # end of the line. The character class holds only the two quote
    # characters; a '|' inside [...] is a literal pipe, not alternation.
    unclosed_pattern = re.compile(r'([a-zA-Z0-9_\-:]+)=(["\'])([^"\']*?)$')

    broken_attributes = []
    for line_number, line in enumerate(xml_string.split('\n'), start=1):
        for match in unquoted_pattern.finditer(line):
            attr_name, attr_value = match.group(1), match.group(2)
            broken_attributes.append(
                f"Unquoted attribute at line {line_number}: {attr_name}={attr_value}"
            )
        if unclosed_pattern.search(line):
            broken_attributes.append(
                f"Unclosed attribute value at line {line_number}: {line.strip()}"
            )
    return broken_attributes
def recursive_check_element(element, path=""):
    """Walk an element tree depth-first and flag suspicious attribute values.

    An attribute is reported when its value contains ``<``, ``>`` or ``&``,
    unless the whole value starts with ``&`` and ends with ``;``.

    Args:
        element: The element to inspect; its children are visited too.
        path: Slash-separated tag path of the element's ancestors. An empty
            string means ``element`` is the starting point of the walk.

    Returns:
        A list of messages. Those for ``element`` come first, followed by
        those for each child subtree in document order.
    """
    suspicious = ('<', '>', '&')

    if path:
        location = f"{path}/{element.tag}"
    else:
        location = element.tag

    def looks_like_entity(value):
        # A value shaped like "&...;" is exempt from the report.
        return value.startswith('&') and value.endswith(';')

    findings = [
        f"Invalid characters in attribute at {location}: {name}={value}"
        for name, value in element.attrib.items()
        if any(ch in value for ch in suspicious) and not looks_like_entity(value)
    ]

    # Descend into every child, passing this element's path as the prefix.
    for sub_element in element:
        findings += recursive_check_element(sub_element, location)

    return findings
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment