Skip to content

Instantly share code, notes, and snippets.

@nournia
Last active April 13, 2021 08:57
Show Gist options
  • Save nournia/5865646 to your computer and use it in GitHub Desktop.
Save nournia/5865646 to your computer and use it in GitHub Desktop.
Convert a dependency tree into chunks
try:
    # ``Connection`` was removed in PyMongo 3.0; ``MongoClient`` is its
    # replacement and has been available since PyMongo 2.4.
    from pymongo import MongoClient
except ImportError:
    # Very old PyMongo releases only ship the legacy ``Connection`` class.
    from pymongo import Connection as MongoClient

# No arguments are passed, so the driver's default host and port are used.
connection = MongoClient()
# The ``dependencies`` collection of the ``dadegan`` database; the script
# below reads the ``tree`` field of each document it returns.
dependencies = connection.dadegan.dependencies
def traverse(parent, node, chunk, force=False, registry=None):
    """Tag ``node`` and every node below it with a chunk label.

    Each visited node is stored in ``registry`` under ``int(node['index'])``
    and receives ``node['chunk'] = (tag, label)``, where ``tag`` is ``'B'``
    (first word of a chunk), ``'I'`` (continuation) or ``'O'`` (outside any
    chunk, with an empty label).

    Args:
        parent: The parent node dict, or ``None`` for the root.
        node: Node dict with an ``'index'`` entry and, optionally, a
            ``'childs'`` dict mapping an edge label to a child node dict.
            Every child must carry a ``'POS'`` entry.
        chunk: Label of the chunk ``node`` belongs to; an empty value puts
            the node outside any chunk.
        force: When true, all descendants inherit ``chunk`` unchanged.
        registry: Dict that collects the visited nodes.  Defaults to the
            module-level ``words`` dict, which the script resets for every
            sentence.
    """
    if registry is None:
        registry = words
    registry[int(node['index'])] = node

    if not chunk:
        node['chunk'] = ('O', '')
    elif parent and chunk == parent['chunk'][1]:
        if parent['chunk'][0] == 'B' and int(node['index']) < int(parent['index']):
            # The child shares the parent's chunk but has a smaller index, so
            # the child becomes the chunk start and the parent a continuation.
            parent['chunk'] = ('I', parent['chunk'][1])
            node['chunk'] = ('B', chunk)
        else:
            node['chunk'] = ('I', chunk)
    else:
        node['chunk'] = ('B', chunk)

    if 'childs' in node:
        for edge, child in node['childs'].items():
            pos = child['POS']
            # NOTE(review): NVE / VPRT are presumably the non-verbal element
            # and the particle of a compound verb -- confirm against the
            # treebank documentation.
            verb_part = edge == 'NVE' or edge == 'VPRT'

            if pos == 'V' or verb_part:
                subtree = 'VP'
            elif chunk != 'VP':
                # Below a non-verbal chunk everything stays in that chunk.
                subtree = chunk
            elif pos == 'N' or pos == 'PR' or pos == 'POSTP':
                subtree = 'NP'
            elif pos == 'PREP':
                subtree = 'PP'
            elif pos == 'ADV':
                subtree = 'AdvP'
            elif pos == 'ADJ' or pos == 'AdjP':
                # 'AdjP' is still accepted as a POS value for backward
                # compatibility with the earlier lookup.
                subtree = 'AdjP'
            else:
                subtree = ''

            if force or verb_part:
                # Keep the whole subtree inside the current chunk.
                traverse(node, child, chunk, True, registry)
            else:
                traverse(node, child, subtree, False, registry)
def format_chunks(words):
    """Render chunk-tagged words as a bracketed string.

    Args:
        words: Dict mapping a word's position to its node dict.  Every node
            needs a ``'surface'`` string and a ``'chunk'`` tuple
            ``(tag, label)`` with ``tag`` one of ``'B'``, ``'I'``, ``'O'``.

    Returns:
        The words in position order, separated by single spaces, with each
        chunk written as ``[w1 w2]LABEL`` and outside-chunk words left bare.

    Side effects:
        An ``'I'`` tag whose label differs from the previous word's label is
        rewritten to ``'B'`` in the node's ``'chunk'`` entry.
    """
    pieces = []   # finished chunks and outside-chunk tokens, in order
    members = []  # surfaces of the chunk currently being collected
    label = ''    # chunk label of the previous word ('' if it was outside)

    # Sort by position: the dict's own order reflects how it was filled,
    # not the order of the words in the sentence.
    for position in sorted(words):
        word = words[position]
        tag, name = word['chunk']

        # A continuation that does not continue the previous word's chunk
        # actually starts a new one.
        if tag == 'I' and name != label:
            tag = 'B'
            word['chunk'] = (tag, name)

        if tag == 'I':
            members.append(word['surface'])
        else:
            if members:
                pieces.append('[%s]%s' % (' '.join(members), label))
                members = []
            if tag == 'O':
                pieces.append(word['surface'])
            else:
                members = [word['surface']]
        label = name

    # Close the chunk that is still open when the sentence ends.
    if members:
        pieces.append('[%s]%s' % (' '.join(members), label))
    return ' '.join(pieces)


if __name__ == '__main__':
    import sys

    # Write UTF-8 bytes directly: Python 3 exposes the byte stream as
    # ``sys.stdout.buffer``, while Python 2's ``sys.stdout`` accepts bytes.
    output = getattr(sys.stdout, 'buffer', sys.stdout)

    for sentence in dependencies.find().limit(100):
        # traverse() fills the module-level ``words`` dict by default, so it
        # has to be reset here, at module scope, for every sentence.
        words = {}
        traverse(None, sentence['tree'], 'VP')

        # One sentence per line, followed by a blank line.
        output.write(format_chunks(words).encode('utf8') + b'\n\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment