Created
February 27, 2025 10:23
-
-
Save Emmanuerl/00a462e3019f533425a436b804f6920e to your computer and use it in GitHub Desktop.
A sample demonstration of a recursive detector for broken XML attributes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET | |
import re | |
def find_broken_attributes(xml_string):
    """Find and report broken attributes in an XML string.

    A line-based regex scan runs first. Only when that scan reports nothing
    is the document parsed with ElementTree and the resulting tree walked
    recursively.

    Args:
        xml_string: The XML document as text.

    Returns:
        A list of human-readable problem descriptions. The list is empty
        when neither the regex scan, the parser, nor the tree walk found
        anything to report.
    """
    # First try the simple regex approach to find common broken patterns.
    broken_attributes = find_broken_attributes_with_regex(xml_string)
    if broken_attributes:
        return broken_attributes

    # Nothing found with regex: parse and recursively check the tree.
    try:
        root = ET.fromstring(xml_string)
        broken_attributes = recursive_check_element(root)
    except Exception as e:
        # Deliberately broad: this function is best-effort and reports
        # failures as entries in the result instead of raising.
        line_match = re.search(r'line (\d+)', str(e))
        line_number = int(line_match.group(1)) if line_match else 0
        # Split on every newline convention so the index is more likely to
        # agree with the line number in the error message.
        # NOTE(review): assumes the parser counts "\r" and "\r\n" as line
        # breaks -- confirm against the expat documentation.
        lines = re.split(r'\r\n|\r|\n', xml_string)
        if 1 <= line_number <= len(lines):
            problematic_line = lines[line_number - 1].strip()
            broken_attributes.append(
                f"Parsing error on line {line_number}: {problematic_line}"
            )
        else:
            # No usable line number: still report the failure so a parse
            # error is never silently turned into an empty result.
            broken_attributes.append(f"XML parsing error: {e}")
    return broken_attributes
def find_broken_attributes_with_regex(xml_string):
    """Find broken attributes line by line using regular expressions.

    Two problems are reported for each line:

    * an attribute whose value does not start with a quote, and
    * an attribute whose opening quote is never closed before the end of
      the line.

    Properly quoted values are blanked out before scanning, so an ``=``
    inside a quoted value (for example a URL query string) is not mistaken
    for an attribute. The scan is purely line-based: a quoted value that
    continues on the next line is reported as unclosed.

    Args:
        xml_string: The XML document as text.

    Returns:
        A list of problem descriptions in line order; empty if none found.
    """
    # A complete quoted value, double- or single-quoted.
    quoted_value = re.compile(r'"[^"]*"|\'[^\']*\'')
    # name=value where the value does not start with a quote. The value runs
    # to the next whitespace, ">" or end of line.
    unquoted_attr = re.compile(
        r'([a-zA-Z0-9_\-:]+)=(?!["\'])(.*?)(?=\s|>|$)'
    )
    # name= followed by an opening quote with no matching close on the line.
    unclosed_attr = re.compile(
        r'[a-zA-Z0-9_\-:]+=(?:"[^"]*|\'[^\']*)$'
    )

    broken_attributes = []
    for line_number, line in enumerate(xml_string.split('\n'), 1):
        # Collapse each quoted value to an empty pair of the same quote
        # character so its contents cannot produce matches below.
        masked = quoted_value.sub(lambda m: m.group(0)[0] * 2, line)

        # Find attributes without quotes.
        for match in unquoted_attr.finditer(masked):
            attr_name = match.group(1)
            attr_value = match.group(2)
            broken_attributes.append(
                f"Unquoted attribute at line {line_number}: "
                f"{attr_name}={attr_value}"
            )

        # Find unclosed attribute values; report the original line text.
        if unclosed_attr.search(masked):
            broken_attributes.append(
                f"Unclosed attribute value at line {line_number}: "
                f"{line.strip()}"
            )
    return broken_attributes
def recursive_check_element(element, path=""):
    """Recursively check an element tree for suspicious attribute values.

    An attribute is reported when, after removing every well-formed entity
    or character reference (``&name;``, ``&#123;``, ``&#x1F;``), its value
    still contains ``<``, ``>`` or ``&``.

    Args:
        element: The element to inspect; it must expose ``tag``, ``attrib``
            and iteration over its children.
        path: Slash-separated tags of the ancestors of ``element``. Leave
            empty for the root.

    Returns:
        A list of problem descriptions for ``element`` and all of its
        descendants, parents before children.
    """
    # Matches one complete entity or character reference.
    entity_reference = r'&(?:#[0-9]+|#x[0-9a-fA-F]+|[A-Za-z_:][A-Za-z0-9_.:\-]*);'

    broken_attributes = []
    current_path = f"{path}/{element.tag}" if path else element.tag

    # Check attributes for this element.
    for attr_name, attr_value in element.attrib.items():
        # Only characters left over after dropping complete references are
        # suspicious; a value merely starting with "&" and ending with ";"
        # is not automatically accepted.
        remainder = re.sub(entity_reference, '', attr_value)
        if any(c in remainder for c in ('<', '>', '&')):
            broken_attributes.append(
                f"Invalid characters in attribute at {current_path}: "
                f"{attr_name}={attr_value}"
            )

    # Recursively check all child elements.
    for child in element:
        broken_attributes.extend(recursive_check_element(child, current_path))
    return broken_attributes
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment