Created
October 6, 2014 02:03
-
-
Save dalelane/a0514b2e283a882d9ef3 to your computer and use it in GitHub Desktop.
Comparing XML files ignoring order of attributes and elements - see http://dalelane.co.uk/blog/?p=3225 for background
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################################################## | |
# | |
# xmldiff | |
# | |
# Simple utility script to enable a diff of two XML files in a way
# that ignores the order of attributes and elements.
# | |
# Dale Lane ([email protected]) | |
# 6 Oct 2014 | |
# | |
########################################################################## | |
# | |
# Overview | |
# The approach is to sort both files by attribute and element, and | |
# then reuse an existing diff implementation on the sorted files. | |
# | |
# Arguments | |
# <diffcommand> the command that should be run to diff the sorted files | |
# <filename1> the first XML file to diff | |
# <filename2> the second XML file to diff | |
# | |
# Background | |
# http://dalelane.co.uk/blog/?p=3225 | |
# | |
########################################################################## | |
import os, sys, subprocess, platform | |
import lxml.etree as le | |
from operator import attrgetter | |
# | |
# Check required arguments | |
if len(sys.argv) != 4:
    # Exactly three arguments (diff command plus two files) are required.
    print ("Usage: python xmldiff.py <diffcommand> <filename1> <filename2>")
    # Use sys.exit() rather than quit(): quit() is a helper installed by the
    # site module for interactive sessions, and a usage error should be
    # reported to the caller with a non-zero exit status.
    sys.exit(1)
# | |
# Prepares the location of the temporary file that will be created by xmldiff | |
def createFileObj(prefix, name):
    """Describe an input file and the temporary file derived from it.

    Returns a dict with two keys:
      "filename"    - the absolute path of ``name``
      "tmpfilename" - a dot-prefixed name built from ``prefix`` and the
                      base name of ``name`` (no directory component)
    """
    basename = os.path.basename(name)
    fileobj = {}
    fileobj["filename"] = os.path.abspath(name)
    fileobj["tmpfilename"] = ".".join(["", prefix, basename])
    return fileobj
# | |
# Function to sort XML elements by id | |
# (where the elements have an 'id' attribute that can be cast to an int) | |
def sortbyid(elem):
    """Sort key: the element's 'id' attribute converted to an int.

    Returns 0 when the attribute is missing, empty, or cannot be parsed
    by int().
    """
    raw = elem.get('id')
    if not raw:
        return 0
    try:
        return int(raw)
    except ValueError:
        # non-numeric ids all share the same sort position
        return 0
# | |
# Function to sort XML elements by their text contents | |
def sortbytext(elem):
    """Sort key: the element's text, with missing or empty text mapped to ''."""
    return elem.text or ''
# | |
# Function to sort XML attributes alphabetically by key
# The original item is left unmodified, and its attributes are
# copied to the provided sorteditem
def sortAttrs(item, sorteditem):
    """Copy item's attributes onto sorteditem in alphabetical key order.

    item is only read; sorteditem receives one set() call per attribute.
    """
    names = list(item.keys())
    names.sort()
    for name in names:
        sorteditem.set(name, item.get(name))
# | |
# Function to sort XML elements | |
# The sorted elements will be added as children of the provided newroot | |
# This is a recursive function, and will be called on each of the children | |
# of items. | |
def sortElements(items, newroot):
    """Append a recursively sorted copy of items to newroot.

    items is an iterable of XML elements (the children of one parent) and
    newroot is the element that receives the sorted copies. The original
    elements are not modified. Returns None.

    The sort order is:
      1. XML element name
      2. text contents, for elements sharing a name
      3. integer value of the 'id' attribute, for elements sharing a name
         and having no text contents
    Elements that tie on all three keep their original relative order.
    """
    # Comments and processing instructions expose a callable as their tag
    # instead of a name, so they can neither be ordered against real
    # elements nor re-created with le.Element(). They carry nothing we want
    # to diff, so leave them out.
    elements = [item for item in items if not callable(item.tag)]

    def sortkey(elem):
        text = sortbytext(elem)
        # Whitespace-only text is dropped when the element is copied below,
        # so it must not influence the ordering either; otherwise the result
        # would depend on how the source file happened to be indented.
        if text.isspace():
            text = ''
        return (elem.tag, text, sortbyid(elem))

    # sorted() is stable, so one sort on a (name, text, id) tuple gives the
    # same ordering as three successive sorts applied in reverse order.
    for item in sorted(elements, key=sortkey):
        # Create a new item to represent the sorted version
        #  of the next item, and copy the tag name and contents
        newitem = le.Element(item.tag)
        if item.text and not item.text.isspace():
            newitem.text = item.text
        # Copy the attributes (sorted by key) to the new item
        sortAttrs(item, newitem)
        # Copy the children of item (sorted) to the new item
        sortElements(list(item), newitem)
        # Append this sorted item to the sorted root
        newroot.append(newitem)
# | |
# Function to sort the provided XML file | |
# fileobj.filename will be left untouched | |
# A new sorted copy of it will be created at fileobj.tmpfilename | |
def sortFile(fileobj):
    """Write a sorted copy of an XML file to its temporary location.

    fileobj is a dict as returned by createFileObj. The file at
    fileobj['filename'] is only read; the sorted copy is written to
    fileobj['tmpfilename'], replacing any existing file of that name.
    """
    # Comments and processing instructions are not part of the comparison
    # and cannot be copied like ordinary elements, so drop them up front.
    parser = le.XMLParser(remove_comments=True, remove_pis=True)
    # Read the input as bytes so the parser can honour the encoding declared
    # in the XML itself instead of the platform's default text encoding.
    with open(fileobj['filename'], 'rb') as original:
        # parse the XML file and get a pointer to the top
        xmldoc = le.parse(original, parser=parser)
    xmlroot = xmldoc.getroot()
    # create a new XML element that will be the top of
    #  the sorted copy of the XML file
    newxmlroot = le.Element(xmlroot.tag)
    # keep the root's own text (unless it is only whitespace), the same way
    #  sortElements does for every other element, so that a difference in
    #  the root's text shows up in the diff
    if xmlroot.text and not xmlroot.text.isspace():
        newxmlroot.text = xmlroot.text
    # create the sorted copy of the XML file
    sortAttrs(xmlroot, newxmlroot)
    sortElements(list(xmlroot), newxmlroot)
    # write the sorted XML file to the temp file
    newtree = le.ElementTree(newxmlroot)
    with open(fileobj['tmpfilename'], 'wb') as newfile:
        newtree.write(newfile, pretty_print=True)
# | |
# sort each of the specified files | |
filefrom = createFileObj("from", sys.argv[2])
fileto = createFileObj("to", sys.argv[3])
try:
    # sort each of the specified files
    sortFile(filefrom)
    sortFile(fileto)
    #
    # invoke the requested diff command to compare the two sorted files
    if platform.system() == "Windows":
        # NOTE(review): the temporary file names are spliced into the command
        # string unquoted here, so input file names containing spaces will
        # not work on Windows - confirm the right quoting for cmd /c.
        sp = subprocess.Popen([ "cmd", "/c", sys.argv[1] + " " + filefrom['tmpfilename'] + " " + fileto['tmpfilename'] ])
        sp.communicate()
    else:
        # The diff command is still interpreted by the shell (so it may carry
        # its own options), but the two paths are handed over as the shell's
        # positional parameters $1 and $2 rather than pasted into the command
        # text. That keeps paths containing spaces or shell metacharacters
        # intact. The word after the command string only sets $0.
        sp = subprocess.Popen([ "/bin/bash", "-i", "-c", sys.argv[1] + ' "$1" "$2"', "xmldiff", os.path.abspath(filefrom['tmpfilename']), os.path.abspath(fileto['tmpfilename']) ])
        sp.communicate()
finally:
    #
    # cleanup - delete the temporary sorted files after the diff terminates,
    # and also when sorting or launching the diff failed part-way through
    # (in which case one or both files may never have been created)
    for tmpfileobj in (filefrom, fileto):
        if os.path.exists(tmpfileobj['tmpfilename']):
            os.remove(tmpfileobj['tmpfilename'])
@dalelane what would be the license for this code? See http://choosealicense.com/ for help.
I am getting this error:
Traceback (most recent call last):
File "xmldiff.py", line 146, in <module>
sortFile(filefrom)
File "xmldiff.py", line 135, in sortFile
sortElements(list(xmlroot), newxmlroot)
File "xmldiff.py", line 105, in sortElements
newitem = le.Element(item.tag)
File "src/lxml/etree.pyx", line 2996, in lxml.etree.Element
File "src/lxml/apihelpers.pxi", line 95, in lxml.etree._makeElement
File "src/lxml/apihelpers.pxi", line 1584, in lxml.etree._getNsTag
File "src/lxml/apihelpers.pxi", line 1602, in lxml.etree.__getNsTag
File "src/lxml/apihelpers.pxi", line 1472, in lxml.etree._utf8
TypeError: Argument must be bytes or unicode, got 'cython_function_or_method'
@harryyuanfeng you may have comments in your xml.
You can ignore them by changing the "sortFile" function:
parser = le.XMLParser(remove_comments=True)
# parse the XML file and get a pointer to the top
xmldoc = le.parse(original, parser=parser)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Great code, thanks! Would you mind adding a license to it? I would love to use this, but don't want to if it's not licensed for re-use!