Skip to content

Instantly share code, notes, and snippets.

@Wilfred
Created June 30, 2010 15:37
Show Gist options
  • Save Wilfred/458809 to your computer and use it in GitHub Desktop.
Save Wilfred/458809 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
import os
import lxml.etree
def tld_to_word(node, hyphenated=False):
    """Rebuild a full word from a node whose content has the form 'ret<tld/>ejo'.

    The <tld> child is a placeholder for the word root, which is looked up
    with get_word_root().  The text before the placeholder (node.text) and
    the text directly after it (the placeholder's tail) are put around the
    root.

    Arguments:
    node -- element that contains exactly one <tld> child
    hyphenated -- when true, join the parts with '-' instead of simply
                  concatenating them; an empty beginning or ending adds
                  no hyphen

    Returns the assembled word.
    Fails an assertion if node is None or does not hold exactly one <tld>.
    """
    assert node is not None

    # beginning: text in front of the placeholder
    start = node.text if node.text is not None else ''

    # middle, assumes exactly one tld node
    root = get_word_root(node)
    tld_nodes = node.findall('tld')
    assert len(tld_nodes) == 1
    tld_node = tld_nodes[0]

    # end: the text following the placeholder is the *placeholder's* tail,
    # so that is the attribute that has to be checked for None (the tail of
    # `node` itself is whatever follows the whole element and is unrelated)
    ending = tld_node.tail if tld_node.tail is not None else ''

    if not hyphenated:
        return start + root + ending

    # the root is always present; beginning and ending only when non-empty
    parts = []
    if start != '':
        parts.append(start)
    parts.append(root)
    if ending != '':
        parts.append(ending)
    return '-'.join(parts)
def get_word_root(arbitrary_node):
    """Return the text of the first <rad> element of the node's document.

    arbitrary_node may be any element of the tree: the lookup starts from
    the tree returned by its getroottree(), not from the node itself.  The
    result is the word root without any ending.

    Fails an assertion if arbitrary_node is None and raises IndexError if
    the document holds no <rad> element.
    """
    assert arbitrary_node is not None
    rad_nodes = list(arbitrary_node.getroottree().iter('rad'))
    first_rad = rad_nodes[0]
    return first_rad.text
def get_word(drv_node):
    """Return the plain (unhyphenated) word of a <drv> node.

    The word is assembled by tld_to_word() from the node's <kap> child;
    tld_to_word() looks up the word root itself, so no separate root
    lookup is needed here.
    """
    return tld_to_word(drv_node.find('kap'))
def get_hypenated_word(drv_node):
    """Return the word of a <drv> node with '-' between its parts.

    Same as get_word(), but tld_to_word() is asked for the hyphenated form.
    tld_to_word() looks up the word root itself, so no separate root lookup
    is needed here.
    """
    return tld_to_word(drv_node.find('kap'), hyphenated=True)
def get_definition(dif_node):
    """Return the definition text of a <dif> node, without the examples after.

    The node content has the form 'x<ref>foo<tld/>bar</ref>baz'.  Every
    non-blank text and tail fragment is cleaned with clean_text() and the
    fragments are joined with single spaces.  Collection stops at the first
    <ekz> descendant, so examples are left out.  Only text is collected;
    child elements such as <tld/> contribute nothing but their tail.

    A trailing ':' is replaced with '.', and a definition that does not
    already end in '.' gets one appended.
    """
    definition_parts = []

    def add_text(raw_text):
        # keep a fragment only when it holds more than whitespace
        if raw_text is not None and raw_text.strip() != '':
            definition_parts.append(clean_text(raw_text))

    # dif_node may also contain content
    add_text(dif_node.text)
    # NOTE(review): the tail is the text that follows the closing tag of
    # dif_node, yet it is placed before the text of the descendants --
    # confirm that this is intended.
    add_text(dif_node.tail)

    for node in dif_node.iterdescendants():
        if node.tag == 'ekz':
            break
        add_text(node.text)
        add_text(node.tail)

    definition = ' '.join(definition_parts)

    # replace : at end with . -- but only strip a character when it really
    # is the ':'; any other last character belongs to the definition
    if definition.endswith(':'):
        definition = definition[:-1] + '.'
    elif not definition.endswith('.'):
        definition = definition + '.'
    return definition
def _definition_for(sense_node):
    # Build the definition string of one <snc> or <subsnc> node.
    dif_node = sense_node.find('dif')
    if dif_node is not None:
        # word has normal definition
        return get_definition(dif_node)

    # word definition references another
    ref_node = sense_node.find('ref')
    assert ref_node is not None
    if ref_node.find('tld') is None:
        # simple, just contains text
        referent = ref_node.text.strip()
    else:
        referent = tld_to_word(ref_node)
    # 'Sama kiel' is Esperanto for 'Same as'
    return 'Sama kiel ' + referent + '.'


def get_definitions(drv_node):
    """Return the list of definition strings for the word in a <drv> node.

    Each <snc> child is a sense.  A sense that is split into <subsnc>
    children contributes one definition per subsense (subsenses are
    flattened into plain definitions); any other sense contributes one.

    A sense or subsense is defined either by its <dif> child or, when it
    has none, by a <ref> to another word, which is rendered as
    'Sama kiel <word>.'.  Fails an assertion if it has neither.
    """
    definitions = []
    for sense in drv_node.findall('snc'):
        subsenses = sense.findall('subsnc')
        # when subsenses exist they replace the sense itself
        for defined_node in (subsenses or [sense]):
            definitions.append(_definition_for(defined_node))
    return definitions
def clean_text(raw_text):
    """Collapse padded, multi-line text onto a single line.

    Fixes text of the form '\\n Foo \\n bar ': every line is stripped of
    surrounding whitespace, blank lines are dropped and the remaining
    lines are joined with single spaces.  Whitespace inside a line is
    left untouched.

    Fails an assertion if raw_text is not a string.
    """
    try:
        string_types = (str, unicode)  # Python 2: byte and unicode strings
    except NameError:
        string_types = (str,)  # Python 3 has no separate unicode type
    # isinstance also accepts subclasses of the string types
    assert isinstance(raw_text, string_types)

    stripped_lines = (raw_line.strip() for raw_line in raw_text.splitlines())
    return ' '.join(line for line in stripped_lines if line != '')
# Directory that holds the XML files to process.
XML_DIR = '../xml/'

# load_dtd=True makes lxml load the DTD while parsing
# (presumably so that entities declared there resolve -- confirm).
parser = lxml.etree.XMLParser(load_dtd=True)

# To debug a single file, iterate over ['primit.xml'] instead of the listing.
for filename in os.listdir(XML_DIR):
    tree = lxml.etree.parse(os.path.join(XML_DIR, filename), parser)
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print(filename)

    # each word is a drv
    for drv_node in tree.iter('drv'):
        word = get_word(drv_node)
        hyphenated_word = get_hypenated_word(drv_node)
        definitions = get_definitions(drv_node)
        print(word + ': ' + hyphenated_word + ' ' + str(definitions))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment