Created
August 11, 2010 19:47
-
-
Save Wilfred/519606 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import os | |
import lxml.etree | |
def get_words_from_kap(node):
    """Return the list of words spelled out by a kap node.

    The node is first reduced to plain text by flatten_kap(). That text is
    either a single word ('foo') or several words separated by commas
    ('foo, bar').

    Returns a list of strings, each with surrounding whitespace removed.
    """
    flat_string = flatten_kap(node)
    # Split on the bare comma and strip every piece: this also separates
    # words whose comma is followed by a line break (or by nothing at all),
    # which a split on the two-character string ', ' would leave glued
    # together.
    return [word.strip() for word in flat_string.split(',')]
def flatten_kap(kap):
    """Flatten a kap node into one plain string.

    With a word root of 'ret', markup of the form
    '<tld/>ejo<fnt>Z</fnt>, <tld/>o' becomes the string 'retejo, reto'.
    Splitting that string into separate words is left to the caller.

    Handling of the direct children of kap:
      * tld -- replaced by the word root (plus the element's own text,
               if it has any)
      * fnt -- dropped (source of the word)
      * ofc -- dropped (officialness marker)
      * var -- its first child is flattened recursively
    The tail text that follows each child is always kept.

    Raises AssertionError if kap is None or if a child with any other tag
    is found.
    """
    # Raised explicitly rather than via `assert` so the checks survive -O.
    if kap is None:
        raise AssertionError('flatten_kap() needs a kap node, got None')
    root = get_word_root(kap)

    pieces = []
    if kap.text is not None:
        pieces.append(kap.text)
    # The xml structure is irregular (offenders: nuks.xml), so each child
    # type needs its own treatment.
    for child in kap:
        tag = child.tag
        if tag == 'tld':
            pieces.append(root)
            if child.text is not None:
                pieces.append(child.text)
        elif tag == 'var':
            # The first child of var is flattened the same way as kap.
            pieces.append(flatten_kap(child[0]))
        elif tag not in ('fnt', 'ofc'):
            # fnt and ofc are skipped on purpose; anything else is a
            # structure this code was not written for.
            raise AssertionError('unexpected %r element inside kap' % (tag,))
        if child.tail is not None:
            pieces.append(child.tail)
    return ''.join(pieces).strip()
def get_word_root(arbitrary_node):
    """Return the word root (without its ending) for a node's document.

    The root is the text of the first 'rad' element found when iterating
    over the whole tree that arbitrary_node belongs to.

    Raises:
        AssertionError: if arbitrary_node is None.
        IndexError: if the tree contains no 'rad' element.
    """
    # Raised explicitly rather than via `assert` so the check survives -O.
    if arbitrary_node is None:
        raise AssertionError('get_word_root() needs a node, got None')
    tree = arbitrary_node.getroottree()
    # Only the first match is needed, so stop iterating as soon as it is
    # found instead of collecting every 'rad' element into a list.
    first_rad = next(tree.iter('rad'), None)
    if first_rad is None:
        raise IndexError("no 'rad' element found in the document")
    return first_rad.text
def get_tree(xml_file):
    """Parse xml_file with lxml and return the parsed tree.

    The parser is created with load_dtd=True.
    NOTE(review): presumably needed so that entities declared in the
    documents' DTD resolve -- confirm against the data files.
    """
    return lxml.etree.parse(xml_file, lxml.etree.XMLParser(load_dtd=True))
if __name__ == '__main__':
    import sys

    # Print every word found in every file of the dictionary directory,
    # one UTF-8 encoded word per line.
    path = '/home/wilfred/languages/esperanto/reta_vortaro/xml'
    # Write encoded bytes straight to the byte-level stream: Python 3
    # exposes it as sys.stdout.buffer, while on Python 2 sys.stdout itself
    # accepts bytes. The output matches the old `print` statement.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    for file_name in os.listdir(path):
        tree = get_tree(os.path.join(path, file_name))
        # each word is a drv
        for drv_node in tree.iter('drv'):
            for word in get_words_from_kap(drv_node.find('kap')):
                out.write(word.encode('utf8') + b'\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment