Created
June 30, 2010 15:37
-
-
Save Wilfred/458809 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import os | |
import lxml.etree | |
def tld_to_word(node, hyphenated=False):
    """Expand a node of the form 'ret<tld/>ejo' into a plain word.

    The result is built from three pieces: the node's own text (what
    precedes the ``tld`` child), the word root returned by
    ``get_word_root``, and the tail of the ``tld`` child (what follows it).

    node -- element containing exactly one ``tld`` child (asserted).
    hyphenated -- when true, non-empty prefix/suffix are separated from
        the root with '-'; otherwise the pieces are simply concatenated.

    Raises AssertionError if ``node`` is None or does not contain exactly
    one ``tld`` child.
    """
    assert node is not None

    # beginning: text in front of the <tld/> child
    start = node.text if node.text is not None else ''

    # middle: the root replaces the single <tld/> child
    root = get_word_root(node)
    tld_nodes = node.findall('tld')
    assert len(tld_nodes) == 1
    tld_node = tld_nodes[0]

    # end: the text after <tld/> is stored as the tld child's tail, so the
    # None check has to be made on that same attribute (not on node.tail,
    # which is the text following the whole node).
    ending = tld_node.tail if tld_node.tail is not None else ''

    if not hyphenated:
        return start + root + ending

    # hyphenated form: only put a '-' next to pieces that are non-empty
    word = start + '-' if start != '' else ''
    word += root
    if ending != '':
        word += '-' + ending
    return word
def get_word_root(arbitrary_node):
    """Return the text of the first ``rad`` element in the node's document.

    arbitrary_node -- any element of the document; only used to reach the
        enclosing tree via ``getroottree()``.

    Raises AssertionError if ``arbitrary_node`` is None and IndexError if
    the document contains no ``rad`` element.
    """
    assert arbitrary_node is not None
    document = arbitrary_node.getroottree()
    rad_nodes = list(document.iter('rad'))
    # the root is given without its ending by the first <rad> element
    first_rad = rad_nodes[0]
    return first_rad.text
def get_word(drv_node):
    """Return the plain (unhyphenated) word for a ``drv`` node.

    The word is built by ``tld_to_word`` from the node's ``kap`` child.
    ``tld_to_word`` looks the word root up itself, so no separate root
    lookup (a scan of the whole document) is done here.

    Raises AssertionError if ``drv_node`` is None or has no ``kap`` child.
    """
    assert drv_node is not None
    return tld_to_word(drv_node.find('kap'))
def get_hypenated_word(drv_node):
    """Return the hyphenated form of the word for a ``drv`` node.

    The word is built by ``tld_to_word(..., hyphenated=True)`` from the
    node's ``kap`` child.  ``tld_to_word`` looks the word root up itself,
    so no separate root lookup (a scan of the whole document) is done here.

    Raises AssertionError if ``drv_node`` is None or has no ``kap`` child.
    """
    assert drv_node is not None
    return tld_to_word(drv_node.find('kap'), hyphenated=True)
def get_definition(dif_node):
    """Return the definition text of a ``dif`` node, without its examples.

    Handles text of the form 'x<ref>foo<tld/>bar</ref>baz'.  The text
    pieces inside ``dif_node`` are collected in document order up to the
    first ``ekz`` element, each piece is normalised with ``clean_text``
    and the pieces are joined with single spaces.  Text following the
    closing tag of ``dif_node`` itself (its tail) is not part of the
    definition and is ignored.  ``tld`` elements contribute no text of
    their own here.

    A trailing ':' is replaced with '.'; a definition without closing
    punctuation gets a '.' appended instead of losing its last character.
    """
    definition_parts = []
    _append_text_until_example(dif_node, definition_parts)
    definition = ' '.join(definition_parts)

    if definition.endswith(':'):
        # replace : at end with .
        definition = definition[:-1] + '.'
    elif not definition.endswith(('.', '!', '?')):
        definition += '.'
    return definition


def _append_text_until_example(node, parts):
    """Append the cleaned text under ``node`` to ``parts`` in document order.

    Returns False as soon as an ``ekz`` element is reached (nothing from
    that point on is collected), True if the whole subtree was consumed.
    """
    if node.text is not None and node.text.strip() != '':
        parts.append(clean_text(node.text))
    for child in node:
        if child.tag == 'ekz':
            return False
        if not _append_text_until_example(child, parts):
            return False
        # a child's tail comes after everything inside the child
        if child.tail is not None and child.tail.strip() != '':
            parts.append(clean_text(child.tail))
    return True
def get_definitions(drv_node):
    """Return the list of definition strings for a ``drv`` node.

    Every ``snc`` child is a sense.  A sense that contains ``subsnc``
    children is flattened: each subsense yields one definition and the
    sense itself yields none.  Both senses and subsenses are resolved the
    same way, so a subsense that only references another word no longer
    fails.

    Raises AssertionError if a sense or subsense has neither a ``dif`` nor
    a ``ref`` child.
    """
    definitions = []
    for sense in drv_node.findall('snc'):
        subsenses = sense.findall('subsnc')
        # converting subsenses all to just definitions
        for entry in (subsenses or [sense]):
            definitions.append(_entry_definition(entry))
    return definitions


def _entry_definition(entry):
    """Return the definition string for a single sense or subsense."""
    dif_node = entry.find('dif')
    if dif_node is not None:
        # word has normal definition
        return get_definition(dif_node)

    # word definition references another
    ref_node = entry.find('ref')
    assert ref_node is not None
    if ref_node.find('tld') is None:
        # simple, just contains text (an empty <ref/> has text None)
        referent = (ref_node.text or '').strip()
    else:
        referent = tld_to_word(ref_node)
    return 'Sama kiel ' + referent + '.'
def clean_text(raw_text):
    r"""Collapse multi-line text into a single line.

    Fixes text of the form '\n Foo \n bar ': every line is stripped of
    surrounding whitespace, empty lines are dropped and the remaining
    lines are joined with single spaces.  Whitespace inside a line is
    left untouched.

    Raises AssertionError if ``raw_text`` is not exactly a str/unicode.
    """
    assert type(raw_text) == str or type(raw_text) == unicode
    stripped_lines = [raw_line.strip() for raw_line in raw_text.splitlines()]
    return ' '.join(line for line in stripped_lines if line != '')
# Driver: parse every file found in ../xml/ and print, for each <drv>
# element (one per word), the word, its hyphenated form and its definitions.
parser = lxml.etree.XMLParser(load_dtd=True)
for xml_name in os.listdir('../xml/'):
    # to debug a single file, iterate over e.g. ['primit.xml'] instead
    tree = lxml.etree.parse('../xml/' + xml_name, parser)
    print(xml_name)
    # each word is a drv
    for drv_node in tree.iter('drv'):
        word = get_word(drv_node)
        hyphenated_word = get_hypenated_word(drv_node)
        definitions = get_definitions(drv_node)
        summary = word + ': ' + hyphenated_word + ' ' + str(definitions)
        print(summary)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment