Skip to content

Instantly share code, notes, and snippets.

@millawell
Created May 10, 2022 18:47
Show Gist options
  • Save millawell/4752f0130852e57a16112bec72caa34b to your computer and use it in GitHub Desktop.
Save millawell/4752f0130852e57a16112bec72caa34b to your computer and use it in GitHub Desktop.
from lxml import etree
from pathlib import Path
from standoffconverter import Standoff, View
from tqdm import tqdm
def get_namespaces():
    """Return the prefix-to-URI namespace table for XPath queries on the XMI file.

    A fresh dictionary is built on every call, so callers may modify the
    result without affecting later calls.
    """
    prefix_to_uri = dict(
        pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore",
        tcas="http:///uima/tcas.ecore",
        xmi="http://www.omg.org/XMI",
        cas="http:///uima/cas.ecore",
        tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore",
        morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore",
        dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore",
        type5="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore",
        type8="http:///de/tudarmstadt/ukp/dkpro/core/api/transform/type.ecore",
        type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore",
        type2="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore",
        type9="http:///org/dkpro/core/api/xml/type.ecore",
        type3="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore",
        type4="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore",
        type="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore",
        type6="http:///de/tudarmstadt/ukp/dkpro/core/api/structure/type.ecore",
        constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore",
        chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore",
    )
    return prefix_to_uri
def get_tag_mapping():
    """Return the lookup from named-entity labels to output tag names.

    Every ``*deriv`` label maps to the same tag name as its base label.
    A fresh dictionary is built on every call.
    """
    label_to_tag = dict(
        LOC="placeName",
        PER="persName",
        ORG="orgName",
        LOCderiv="placeName",
        PERderiv="persName",
        ORGderiv="orgName",
    )
    return label_to_tag
def load_xmi(xmi_input):
    """
    Parse the XMI file at ``xmi_input`` and return its root element.

    The file is read in binary mode and the raw bytes are handed to
    ``etree.fromstring``. Only the parsed root element is returned; the
    namespace mapping is not part of the result.
    """
    with open(xmi_input, "rb") as xmi_file:
        return etree.fromstring(xmi_file.read())
def get_lemma(token_el, tree, ns):
    """Return the lemma string referenced by a token element, if any.

    Args:
        token_el: Token element; its ``lemma`` attribute holds the ``xmi:id``
            of the lemma annotation to look up.
        tree: Element whose ``xpath`` method is used to search the document.
        ns: Prefix-to-URI mapping; must define the ``type4`` and ``xmi``
            prefixes.

    Returns:
        The ``value`` attribute of the first matching ``type4:Lemma``
        element, or ``None`` when the token has no ``lemma`` attribute, no
        element matches, or the match has no ``value`` attribute.
    """
    if (lemma_id := token_el.get("lemma")) is None:
        return None
    # The id is passed as an XPath variable instead of being formatted into
    # the expression, so quotes inside the id cannot break or alter the query.
    matches = tree.xpath(
        "//type4:Lemma[@xmi:id=$target_id]",
        namespaces=ns,
        target_id=lemma_id,
    )
    if not matches:
        return None
    return matches[0].attrib.get("value")
def get_pos(token_el, tree, ns):
    """Return the coarse part-of-speech value referenced by a token element.

    Args:
        token_el: Token element; its ``pos`` attribute holds the ``xmi:id``
            of the part-of-speech annotation to look up.
        tree: Element whose ``xpath`` method is used to search the document.
        ns: Prefix-to-URI mapping; must define the ``pos`` and ``xmi``
            prefixes.

    Returns:
        The ``coarseValue`` attribute of the first matching ``pos:POS``
        element, or ``None`` when the token has no ``pos`` attribute, no
        element matches, or the match has no ``coarseValue`` attribute.
    """
    if (pos_id := token_el.get("pos")) is None:
        return None
    # The id is passed as an XPath variable instead of being formatted into
    # the expression, so quotes inside the id cannot break or alter the query.
    matches = tree.xpath(
        "//pos:POS[@xmi:id=$target_id]",
        namespaces=ns,
        target_id=pos_id,
    )
    if not matches:
        return None
    return matches[0].attrib.get("coarseValue")
def get_morph(token_el, tree, ns):
    """Return the morphological feature string referenced by a token element.

    Args:
        token_el: Token element; its ``morph`` attribute holds the ``xmi:id``
            of the morphological-features annotation to look up.
        tree: Element whose ``xpath`` method is used to search the document.
        ns: Prefix-to-URI mapping; must define the ``morph`` and ``xmi``
            prefixes.

    Returns:
        The ``value`` attribute of the first matching
        ``morph:MorphologicalFeatures`` element, or ``None`` when the token
        has no ``morph`` attribute, no element matches, or the match has no
        ``value`` attribute.
    """
    if (morph_id := token_el.get("morph")) is None:
        return None
    # The id is passed as an XPath variable instead of being formatted into
    # the expression, so quotes inside the id cannot break or alter the query.
    matches = tree.xpath(
        "//morph:MorphologicalFeatures[@xmi:id=$target_id]",
        namespaces=ns,
        target_id=morph_id,
    )
    if not matches:
        return None
    return matches[0].attrib.get("value")
def parse_tokens(tree, ns):
    """Collect one annotation record per ``type4:Token`` child of ``tree``.

    Args:
        tree: Element queried with the relative XPath ``type4:Token``.
        ns: Prefix-to-URI mapping passed to every XPath query.

    Returns:
        A list of dicts with the keys ``start_char`` and ``end_char`` (the
        token's ``begin``/``end`` attributes as ints), ``tag`` (always
        ``"token"``) and ``attrib``. ``attrib`` holds ``lemma``, ``pos`` and
        ``morph`` entries only for the lookups that returned a value.
    """
    token_nodes = tree.xpath("type4:Token", namespaces=ns)

    records = []
    for node in tqdm(token_nodes, desc="parse tokens"):
        span_start = int(node.attrib["begin"])
        span_end = int(node.attrib["end"])

        # Resolve the referenced annotations; each lookup yields None when
        # the token carries no such reference or nothing matches.
        looked_up = (
            ("lemma", get_lemma(node, tree, ns)),
            ("pos", get_pos(node, tree, ns)),
            ("morph", get_morph(node, tree, ns)),
        )

        records.append({
            "start_char": span_start,
            "end_char": span_end,
            "tag": "token",
            "attrib": {
                key: found for key, found in looked_up if found is not None
            },
        })
    return records
def parse_entities(tree, ns):
    """Collect one annotation record per usable ``type3:NamedEntity`` element.

    Elements lacking any of the ``begin``, ``end`` or ``value`` attributes
    are skipped.

    Args:
        tree: Element queried with the relative XPath ``type3:NamedEntity``.
        ns: Prefix-to-URI mapping passed to the XPath query.

    Returns:
        A list of dicts with the keys ``start_char`` and ``end_char`` (ints),
        ``tag`` (the entity's ``value`` translated through
        ``get_tag_mapping()``) and ``attrib``. ``attrib`` is ``{"ref": ...}``
        when the element has an ``identifier`` attribute, otherwise empty.

    Raises:
        KeyError: If an entity's ``value`` is not a key of the tag mapping.
    """
    entity_nodes = tree.xpath("type3:NamedEntity", namespaces=ns)
    label_to_tag = get_tag_mapping()
    required_keys = ("begin", "end", "value")

    records = []
    for node in tqdm(entity_nodes, desc="parse entities"):
        attrs = node.attrib
        if not all(key in attrs for key in required_keys):
            continue

        record = {
            "start_char": int(attrs["begin"]),
            "end_char": int(attrs["end"]),
            "tag": label_to_tag[attrs["value"]],
            "attrib": {},
        }
        if "identifier" in attrs:
            record["attrib"] = {"ref": attrs["identifier"]}
        records.append(record)
    return records
def add_annotations(so, view, new_elems):
    """Insert every record in ``new_elems`` into ``so`` as an inline element.

    Args:
        so: Object providing ``add_inline``; receives the new elements.
        view: Object providing ``get_table_pos``, used to translate the
            records' character offsets (presumably into positions of the
            standoff table -- confirm against the standoffconverter docs).
        new_elems: Iterable of dicts with the keys ``start_char``,
            ``end_char``, ``tag`` and ``attrib``.
    """
    for record in tqdm(new_elems, desc="add annotations"):
        first_pos = view.get_table_pos(record["start_char"])
        # ``end_char`` is treated as exclusive: translate the last covered
        # character and step one past it afterwards.
        last_pos = view.get_table_pos(record["end_char"] - 1)
        so.add_inline(
            begin=first_pos,
            end=last_pos + 1,
            tag=record["tag"],
            depth=None,
            attrib=record["attrib"],
        )
def main():
    """Add the named entities from the XMI export to the raw TEI file.

    Reads ``in_data/Melanges.xmi`` and ``in_data/Melanges_raw.xml`` and writes
    the annotated document to ``out_data/Melanges_entities.tei.xml``. The
    output directory is created when it does not exist.
    """
    # load information from the xmi format
    xmi_tree = load_xmi(Path("in_data/Melanges.xmi"))
    xmi_namespaces = get_namespaces()
    # NOTE: tokens are parsed but only inserted when the call further down
    # is re-enabled.
    tokens = parse_tokens(xmi_tree, xmi_namespaces)
    entities = parse_entities(xmi_tree, xmi_namespaces)

    # load tei raw file
    xml_raw = Path("in_data/Melanges_raw.xml")
    tei_namespaces = {
        "tei": "http://www.tei-c.org/ns/1.0"
    }
    with open(xml_raw, "rb") as fin:
        raw_tree = etree.fromstring(fin.read())

    # create standoff data structure
    so = Standoff(raw_tree, namespaces=tei_namespaces)
    view = View(so)

    # add tokens and entities to the standoff
    add_annotations(so, view, entities)
    # add_annotations(so, view, tokens)

    # write standoff to file
    out_str = etree.tostring(so.tree, encoding="unicode")
    out_path = Path("out_data/Melanges_entities.tei.xml")
    # Create the target directory up front so the run does not fail at the
    # very last step.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # The serialized string carries no XML declaration, so readers assume
    # UTF-8; write it as UTF-8 explicitly rather than with the platform
    # default encoding.
    with open(out_path, "w", encoding="utf-8") as fout:
        fout.write(out_str)
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment