# Source: GitHub gist gbraccialli/6dacfdb16717adfe50b64c7c02e716b6
# (last active November 11, 2016 11:18).
import sys
import xml.etree.ElementTree as xml

# Usage: <script> FILE [MAX_LEVEL] [SHOW_ATTRIBUTES]
#   FILE             XML document to analyse.
#   MAX_LEVEL        Deepest level whose children are still visited is
#                    MAX_LEVEL - 1 (the root is level 0); -1 means no limit.
#   SHOW_ATTRIBUTES  Integer flag: 0 disables attribute counting, any other
#                    integer enables it.
if len(sys.argv) < 2:
    # Exit with a readable message instead of an IndexError on sys.argv[1].
    sys.exit('usage: ' + sys.argv[0] + ' FILE [MAX_LEVEL] [SHOW_ATTRIBUTES]')

file = sys.argv[1]
tree = xml.parse(file)
# NOTE: the original author left a commented-out alternative that parses with
# lxml's recovering parser instead:
#   from lxml import etree
#   tree = etree.parse(file, parser=etree.XMLParser(recover=True))
root = tree.getroot()

# Defaults: no depth limit, attributes are counted.
max_level = -1
show_attributes = True
if len(sys.argv) > 2 and sys.argv[2]:
    max_level = int(sys.argv[2])
if len(sys.argv) > 3 and sys.argv[3]:
    show_attributes = bool(int(sys.argv[3]))
def count_tree(level, tag, tags):
    """Accumulate occurrence counts for an element and its descendants.

    Reads the module-level settings ``show_attributes`` and ``max_level``.

    Args:
        level: Depth of ``tag``; the script passes 0 for the root.
        tag: Element to count. It must expose ``.tag`` and ``.attrib`` and
            yield its children when iterated.
        tags: Dict updated in place. It maps a tag name to
            ``{'count': int, 'child': {...same shape...},
            'attributes': {attribute_name: int}}``.

    Returns:
        The same ``tags`` dict that was passed in.
    """
    label = tag.tag
    # Create the per-tag record the first time this tag name is seen here.
    entry = tags.setdefault(label, {'count': 0, 'child': {}, 'attributes': {}})
    entry['count'] += 1

    if show_attributes:
        seen = entry['attributes']
        for attribute in tag.attrib:
            seen[attribute] = seen.get(attribute, 0) + 1

    # max_level == -1 means "no depth limit"; otherwise children are visited
    # only while the current level is still below max_level.
    if max_level == -1 or level < max_level:
        for child in tag:
            # The nested dict is mutated in place, so no re-assignment needed.
            count_tree(level + 1, child, entry['child'])
    return tags
def print_level(level, tags):
    """Print the counted structure to stdout as an indented outline.

    Each tag line is ``<level><dashes><tag> : <count>`` with four dashes per
    level. Each attribute line is printed one level deeper with an ``@``
    prefix on the name. Children follow recursively.

    Args:
        level: Depth of the entries in ``tags``; the script passes 0.
        tags: Dict mapping a tag name to a record with the keys ``'count'``,
            ``'attributes'`` (name -> count) and ``'child'`` (nested dict of
            the same shape).
    """
    # The prefixes depend only on the level, so build them once per call.
    tag_prefix = str(level) + '-' * 4 * level
    attr_prefix = str(level + 1) + '-' * 4 * (level + 1)
    for tag in tags:
        info = tags[tag]
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print(tag_prefix + tag + ' : ' + str(info['count']))
        for attribute in info['attributes']:
            print(attr_prefix + '@' + attribute + ' : ' + str(info['attributes'][attribute]))
        print_level(level + 1, info['child'])
def _split_clark(name):
    """Split an ElementTree ``{uri}local`` name into ``(local, ns_attr)``.

    ``ns_attr`` is the text `` ns="uri"`` (leading space, value escaped) for a
    namespaced name and the empty string for a plain one.
    """
    from xml.sax.saxutils import quoteattr

    if name[:1] == '{' and '}' in name:
        uri, local = name[1:].split('}', 1)
        return local, ' ns=' + quoteattr(uri)
    return name, ''


def write_xml(file, level, tags):
    """Write the counted structure to ``file`` as nested XML elements.

    Each tag becomes ``<tag count="N">...</tag>``. Each of its attributes
    becomes an empty child element ``<_attribute count="N"/>``. Names that
    arrive in ElementTree's ``{uri}local`` form cannot be used as element
    names directly, so the local part is written and the URI goes into an
    ``ns`` attribute.

    Args:
        file: Object with a ``write(str)`` method.
        level: Current depth. It is only passed on to recursive calls and
            does not affect the output.
        tags: Dict mapping a tag name to a record with the keys ``'count'``,
            ``'attributes'`` (name -> count) and ``'child'`` (nested dict of
            the same shape).
    """
    for tag in tags:
        info = tags[tag]
        name, ns_attr = _split_clark(tag)
        file.write('<' + name + ' count="' + str(info['count']) + '"' + ns_attr + '>')
        for attribute in info['attributes']:
            attr_name, attr_ns = _split_clark(attribute)
            file.write('<_' + attr_name + ' count="' + str(info['attributes'][attribute]) + '"' + attr_ns + '/>')
        write_xml(file, level + 1, info['child'])
        file.write('</' + name + '>')
# Count the whole document starting at the root (level 0).
structure = count_tree(0, root, {})

# Print the outline once; the original called print_level twice and so
# duplicated the entire listing.
print_level(0, structure)

# Save the same structure as XML next to the input file. "with" closes the
# file even if write_xml raises.
with open(file + '_xml_structure.xml', 'w') as outxml:
    write_xml(outxml, 0, structure)

# NOTE: the original author left commented-out sketches for two other exports
# of the same dict: json.dump(structure, fp) into 'result.json', and the
# third-party dicttoxml.dicttoxml(structure).
# End of gist.