Created
February 6, 2019 15:24
-
-
Save PonteIneptique/5ffa287420f87d8a441518b008164c64 to your computer and use it in GitHub Desktop.
Attempt at a small function, built on the lxml parser, that fixes ill-formed XML when possible
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import etree as ET | |
import re | |
def fix_xml(xml_string: str) -> str:
    """Try to repair mismatched tags in an ill-formed XML string.

    The string is parsed with lxml over and over. Each time the parser reports
    an "Opening and ending tag mismatch" error, the line that holds the
    offending closing tag is edited and the parse is retried:

    * if the element named first in the error already has as many closing
      tags as (non self-closing) opening tags in the whole string, the
      offending closing tag is considered an orphan and is removed;
    * otherwise the missing closing tag of that element is inserted right
      before the offending closing tag.

    After each repair step the current string is printed to stdout, followed
    by a ``----`` separator line.

    :param xml_string: XML that is faulty
    :return: XML that lxml parses without raising a syntax error
    :raises lxml.etree.XMLSyntaxError: when the parser error is not a tag
        mismatch, or when a repair step leaves the string unchanged
    """
    mismatch = re.compile(
        r"^Opening and ending tag mismatch: ([a-zA-Z\-_0-9]+) line ([0-9]+) and ([a-zA-Z\-_0-9]+), line ([0-9]+), column ([0-9]+)"
    )
    new_xml_string = xml_string
    while True:
        try:
            ET.fromstring(new_xml_string)
        except ET.XMLSyntaxError as error:
            found = mismatch.match(str(error))
            if found is None:
                # Not a tag mismatch: nothing this function knows how to fix.
                raise
            tag1, _line1, tag2, line2, col2 = found.groups()

            # Count exact tag names only. A plain substring count would let
            # "<ex" also match "<expan>", and would treat self-closing
            # "<tag/>" elements as openings that still need a closing tag.
            name = re.escape(tag1)
            opened = len(re.findall(
                r"<" + name + r"(?:\s[^<>]*)?(?<!/)>", new_xml_string
            ))
            closed = len(re.findall(r"</" + name + r"\s*>", new_xml_string))

            lines = new_xml_string.split("\n")
            line_index = int(line2) - 1
            line = lines[line_index]
            column = int(col2)
            # The arithmetic assumes the reported column is the 1-based
            # position just after the ">" of the offending "</tag2>".
            # NOTE(review): confirm this also holds for non-ASCII lines.
            closing_start = column - (len("</>") + len(tag2) + 1)

            if opened == closed:
                # tag1 is balanced, so "</tag2>" has no opening tag: drop it.
                line = line[:closing_start] + line[column - 1:]
            else:
                # tag1 was never closed: close it just before "</tag2>".
                line = line[:closing_start] + "</" + tag1 + ">" + line[closing_start:]
            lines[line_index] = line

            repaired = "\n".join(lines)
            if repaired == new_xml_string:
                # No progress was made; re-raise instead of looping forever.
                raise
            new_xml_string = repaired
            # Progress trace; the script-level calls in this file rely on it
            # because they discard the return value.
            print(new_xml_string)
            print("----")
        else:
            return new_xml_string
# Demo inputs: three broken fragments, each wrapped in a <fragment> root
# element and passed to fix_xml. The return values are discarded; the
# results are only visible through what fix_xml prints.
_SAMPLE_BODIES = (
    """"
<lb n="1"/><name><expan><abbr>Sex</abbr><ex>to</ex></expan>Vervicio
<lb n="2"/>Modestino</name> et <name>Verv
<lb n="4"/>
... """,
    """"
<lb n="2"/>Modestino</name> et <name>Verv
... """,
    """"
Sex</abbr><ex>to</ex></expan>Vervicio
<lb n="2"/>Modestino</name> et <name>Verv
<lb break="no" n="3"/>iciae Modestinae</name>
<lb n="4"/>
... """,
)
for _body in _SAMPLE_BODIES:
    fix_xml("<fragment>" + _body + "</fragment>")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment