Last active
August 29, 2015 14:07
-
-
Save ixtli/0ea0c7d5f85c408a3955 to your computer and use it in GitHub Desktop.
Convert XML to JSON using Python 2.7.x
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import fnmatch
import json
import os
import sys
import time
import xml.etree.ElementTree as ET
def coerceStringToType(val):
    """Convert a string to an int, float or bool when it looks like one.

    Falsy input (empty string, None) yields None.  Text that parses as a
    number becomes an int when it has no fractional part and a float
    otherwise.  "true" / "false" in any letter case become booleans.
    Everything else, including "nan" and "inf", is returned unchanged.
    """
    if not val:
        return None
    try:
        asFloat = float(val)
        # int() raises ValueError for NaN and OverflowError for infinity;
        # both are treated as non-numeric text below.
        asInt = int(asFloat)
    except (ValueError, TypeError, OverflowError):
        # Only string-like values can spell a boolean.
        lowered = val.lower() if hasattr(val, "lower") else None
        if lowered == "true":
            return True
        if lowered == "false":
            return False
        return val
    if asFloat != asInt:
        return asFloat
    # Re-parse plain integer text directly: the detour through float()
    # rounds anything above 2**53 (long numeric ids, for instance).
    try:
        return int(val)
    except (ValueError, TypeError):
        # Integral value not written as a plain integer, e.g. "1.0", "1e3".
        return asInt
def coerceArrayOfStrings(parsed):
    """Coerce each string in ``parsed`` and collapse single-item results.

    Every element is stripped and passed through coerceStringToType.  As
    soon as one element stays a string, all elements (those already
    collected and those still to come) are stored as ``str`` of their
    coerced value, so the result never mixes strings with other types.

    Returns the lone value when exactly one element was produced,
    otherwise the list of values (an empty list for empty input).

    An element that coerces to None (an empty field) is reported on
    stdout and terminates the process through ``sys.exit(1)``.
    """
    ret = []
    hasString = False
    for elt in parsed:
        val = coerceStringToType(elt.strip())
        # This is the case where the string is "...,foo,,bar,..."
        if val is None:
            # Single-argument print() emits the same text on Python 2 and 3.
            print("Bad string cooersion: %s" % (parsed,))
            sys.exit(1)
        # If there are any strings, treat all elements as strings
        if isinstance(val, str) and not hasString:
            hasString = True
            ret = [str(item) for item in ret]
        ret.append(str(val) if hasString else val)
    if len(ret) == 1:
        return ret[0]
    return ret
def coerceUnknownValue(val):
    """Coerce a raw attribute or text value.

    Values that are not ``str`` are handed back untouched.  A string is
    stripped; if nothing remains the result is None, otherwise it is split
    on commas and the pieces are passed to coerceArrayOfStrings.
    """
    if isinstance(val, str):
        trimmed = val.strip()
        if trimmed:
            return coerceArrayOfStrings(trimmed.split(','))
        # Blank strings are not kept
        return None
    # Not expected to happen, but non-strings pass straight through
    return val
def coerceKeyName(name):
    """Normalise a tag or attribute name for use as an output key.

    Any casing of "id" becomes "id"; an empty name becomes ''; every other
    name only has its first character lower-cased.
    """
    lowered = name.lower()
    if lowered == "id":
        return lowered
    if not name:
        return ''
    return name[:1].lower() + name[1:]
def parseNode(root):
    """Recursively convert an ElementTree element into plain Python data.

    Child elements are grouped under their coerced tag name: a tag that
    occurs once maps to its parsed value, a tag that occurs several times
    maps to a list of parsed values in document order.  Attributes are
    written afterwards under their coerced names, so an attribute whose
    key equals a child key replaces that entry.  Non-blank element text
    becomes the whole result when there are no children or attributes,
    otherwise it is stored under the key "value".

    Returns None for an element with no children, attributes or text.
    """
    # Collect siblings per key first.  Each element is wrapped in a list
    # explicitly: calling list() on an Element would yield its children
    # instead of the element itself.
    grouped = {}
    for child in root:
        grouped.setdefault(coerceKeyName(child.tag), []).append(child)

    out = {}
    for key, children in grouped.items():
        parsed = [parseNode(child) for child in children]
        out[key] = parsed[0] if len(parsed) == 1 else parsed

    for attr, raw in root.attrib.items():
        out[coerceKeyName(attr)] = coerceUnknownValue(raw)

    if root.text is not None and root.text.strip():
        val = coerceUnknownValue(root.text)
        if not out:
            # Leaf element: the text is the value (may legitimately be
            # 0 or False, so it is returned before the emptiness check).
            return val
        out["value"] = val

    return out if out else None
def parseFile(fileName):
    """Parse the XML file at ``fileName`` and return parseNode of its root."""
    return parseNode(ET.parse(fileName).getroot())
if __name__ == "__main__":
    # Walk the current directory tree and write a .json file next to every
    # .xml file found, then report how many files were converted.

    # time.clock() no longer exists on Python 3.8+; use perf_counter where
    # available and fall back to wall-clock time on Python 2.
    timer = getattr(time, "perf_counter", time.time)
    start = timer()
    count = 0
    for root, dirs, files in os.walk('.'):
        for name in files:
            if not fnmatch.fnmatch(name, '*.xml'):
                continue
            source = os.path.join(root, name)
            out = parseFile(source)
            # splitext drops only the final extension, so "a.b.xml" becomes
            # "a.b.json" rather than colliding with "a.json".
            target = os.path.splitext(source)[0] + '.json'
            # "with" closes the file even if serialisation fails.
            with open(target, 'w') as f:
                json.dump(out, f, indent=2, sort_keys=True)
            count += 1
    end = timer()
    print("Processed %s files in %s seconds." % (count, end - start))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment