Skip to content

Instantly share code, notes, and snippets.

@dimitrovs
Last active August 29, 2015 14:23
Show Gist options
  • Save dimitrovs/27556c0c1598e229cda2 to your computer and use it in GitHub Desktop.
Save dimitrovs/27556c0c1598e229cda2 to your computer and use it in GitHub Desktop.
Greedy matching of complex names in CoreNLP XML
# This script matches names like "Mr. Bennet" (and not just "Bennet")
# in the XML output of CoreNLP, which has one word per token
import io
import xml.etree.ElementTree as ET
# Root element of the CoreNLP XML document to be scanned.
e = ET.parse('test.xml').getroot()
# Words of the tokens processed so far, in document order; filled by the
# token scan further down and used there to look backwards in the text.
word_list = []
# Trie of character names, keyed by name part and stored LAST part first
# ("Mr. Bennet" lives at character_dict['Bennet']['Mr.']), so a name can be
# recognised by walking backwards from its final token. A node that ends a
# complete name also carries the bookkeeping keys 'character_name' and
# 'mentions'.
# NOTE: a name part literally equal to 'character_name' or 'mentions' would
# collide with those bookkeeping keys.
character_dict = dict()
# Decode once at the I/O boundary and keep everything as text, so trie keys
# compare equal to the words ElementTree returns. io.open accepts `encoding`
# on both Python 2 and Python 3; the `with` block closes the file.
with io.open('characters.tsv', 'r', encoding='utf-8') as characters_file:
    for line in characters_file:
        # Every tab-separated cell on a line is registered as its own name.
        for character in line.split('\t'):
            character_name = character.strip()
            character_parts = character_name.split()
            if not character_parts:
                # Blank line or empty cell: without this guard the root node
                # itself would be marked as a (nameless) character and every
                # unmatched token would be counted as a mention of it.
                continue
            element = character_dict
            for part in reversed(character_parts):
                element = element.setdefault(part, dict())
            element['character_name'] = character_name
            element['mentions'] = []
# Scan the tokens in document order. For each token, walk backwards through
# the text for as long as the words keep following a path in the
# reversed-name trie, and credit the longest complete name ending at this
# token. The recorded index is that of the name's LAST token.
for index, token_tag in enumerate(e.iter('token')):
    # The first child of <token> is taken to be its <word> element
    # (assumed from CoreNLP's XML layout -- TODO confirm for other versions).
    current_word = token_tag[0].text
    node = character_dict
    longest_match = None
    word = current_word
    steps_back = 0  # how many tokens before `index` the walk has consumed
    # Token text equal to a bookkeeping key must never be followed as if it
    # were a child node of the trie.
    while word not in ('character_name', 'mentions') and word in node:
        node = node[word]
        if 'mentions' in node:
            # Remember the deepest complete name so that a failed attempt at
            # an even longer name still falls back to this one.
            longest_match = node
        steps_back += 1
        if steps_back > index:
            break  # reached the first token of the document
        # Step one token further back each time round the loop.
        word = word_list[index - steps_back]
    if longest_match is not None:
        longest_match['mentions'].append(index)
    # Keep the word as text (no re-encoding) so later lookups compare like
    # with like.
    word_list.append(current_word)
def get_mentions(element):
    """Print every character stored under a trie node with its mentions.

    ``element`` is a trie node: a dict whose ordinary keys are name parts
    mapping to child nodes. A node that ends a complete name additionally
    holds the bookkeeping keys ``'character_name'`` and ``'mentions'``.

    One line is printed per such node, in the form ``<name> <mentions>``.
    All descendants of a node are printed before the node itself.
    Returns None.
    """
    for key in element:
        # Only real name parts lead to child nodes; skip the bookkeeping keys.
        if key != 'mentions' and key != 'character_name':
            get_mentions(element[key])
    if 'character_name' in element:
        # A single pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (element['character_name'], element['mentions']))
# Print each character name together with the token indices recorded for it.
get_mentions(character_dict)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment