Created
March 10, 2011 01:26
-
-
Save razamatan/863397 to your computer and use it in GitHub Desktop.
simple conversion from etree parsed xml to objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' xml utilities ''' | |
__author__ = '[email protected]' | |
from itertools import groupby | |
from operator import attrgetter | |
def split_ns(txt):
    ''' returns [ns, tag] for '{ns}tag'

    Only a leading '{' that has a matching '}' is treated as a namespace.
    Anything else (no namespace, a stray '}', or an unclosed '{') is returned
    untouched as the tag with an empty namespace.

    >>> split_ns('{http://www.w3.org/2001/XMLSchema-instance}nil')
    ['http://www.w3.org/2001/XMLSchema-instance', 'nil']
    >>> split_ns('foobar')
    ['', 'foobar']
    >>> split_ns('foo}bar')
    ['', 'foo}bar']
    '''
    if txt.startswith('{'):
        # partition on the first '}' only, so a '}' later in the tag is kept
        ns, closing, tag = txt[1:].partition('}')
        if closing:
            return [ns, tag]
    return ['', txt]
class protected(dict):
    ''' raises exception if you try to update an existing key

    Both item assignment and update() are guarded. The constructor is not:
    it goes through dict.__init__ and never calls __setitem__.

    >>> x = protected(zip('abcd', '1234'))
    >>> x['a'] = 3
    Traceback (most recent call last):
    KeyError: ('already exists', 'a')
    >>> x['e'] = 5
    >>> x.update(a=9)
    Traceback (most recent call last):
    KeyError: ('already exists', 'a')
    '''

    def __setitem__(self, k, v):
        if k in self:
            raise KeyError('already exists', k)
        dict.__setitem__(self, k, v)

    def update(self, *args, **kwargs):
        ''' like dict.update, but raises KeyError if any incoming key is
        already present. nothing is inserted when a collision is found.
        '''
        # dict.update does not call __setitem__, so the check has to be
        # repeated here; normalising through dict() accepts the same
        # argument forms as dict.update (mapping, iterable of pairs, kwargs).
        incoming = dict(*args, **kwargs)
        for k in incoming:
            if k in self:
                raise KeyError('already exists', k)
        dict.update(self, incoming)
def objectify(elm, prefix='', base=None):
    ''' converts a parsed etree element into an object.

    if prefix is specified (str), it will prefix all the keys it inserts into
    the base. useful for preventing name collisions between text, tail,
    attributes and child tags.

    when base is specified, it will create a hierarchy of bases to represent
    the parse. base needs to implement MutableMapping (e.g. dict, OrderedDict,
    etc.). if base is unspecified, it **WILL MODIFY THE ETREE ELEMENT OBJECTS**
    it traverses by adding attributes to them (it writes into elm.__dict__).
    WARNING: if it wasn't clear, this is potentially quite destructive on the
    etree elements!

    the result for an element is one of:
    - its stripped text, when it has no children, attributes or tail
    - None, when it carries a nil attribute equal to 'true'
    - a list, when it only holds several children sharing one tag name
    - otherwise the populated base (or elm itself when base is None)

    children are grouped by local tag name (namespace stripped) in order of
    first appearance, so same-named siblings are collected together even when
    other elements sit between them.

    see dictify for a wrapper that uses a collision-detecting dict as base.
    '''
    rval = elm.__dict__ if base is None else base()

    # text
    text = elm.text.strip() if elm.text else elm.text
    tail = elm.tail.strip() if elm.tail else elm.tail
    if not (len(elm) or elm.attrib or tail):
        return text
    if text:
        rval[prefix + 'text'] = text
    if tail:
        rval[prefix + 'tail'] = tail

    # attributes (namespace dropped from the attribute name)
    rval.update((prefix + split_ns(k)[1], v) for k, v in elm.attrib.items())

    # test for nil
    if rval.get(prefix + 'nil') == 'true':
        return None

    # children: collect by local name, remembering first-seen order.
    # grouping only adjacent siblings would make a later run of the same tag
    # clobber (or collide with) the earlier one.
    names = []
    grouped = {}
    for child in elm:
        name = split_ns(child.tag)[1]
        if name not in grouped:
            grouped[name] = []
            names.append(name)
        grouped[name].append(objectify(child, prefix, base))

    for name in names:
        items = grouped[name]
        # NOTE(review): this tests the raw elm.text, so whitespace-only text
        # (e.g. pretty-printed xml) prevents the list shortcut; kept as-is
        # because callers may rely on it -- confirm intent.
        if len(items) > 1 and len(names) == 1 and not (elm.attrib or elm.text):
            # many children of one type and nothing else
            return items
        rval[prefix + name] = items if len(items) > 1 else items[0]
    return elm if base is None else rval
def dictify(elm, prefix='', base=protected):
    ''' returns a dict representation of the xml

    thin wrapper around objectify that defaults base to protected, so the
    parse is built out of fresh mappings instead of being written onto the
    etree elements themselves. prefix and base are passed through unchanged.
    '''
    return objectify(elm, prefix=prefix, base=base)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment