Created
May 10, 2022 18:47
-
-
Save millawell/4752f0130852e57a16112bec72caa34b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import etree | |
from pathlib import Path | |
from standoffconverter import Standoff, View | |
from tqdm import tqdm | |
def get_namespaces():
    """Return the prefix -> namespace-URI table for XPath queries on the XMI file.

    The URIs are the UIMA/DKPro ``.ecore`` namespaces used by the XMI
    export. A new dict is built on every call, so callers may modify the
    result without affecting later calls.
    """
    prefix_to_uri = dict(
        pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore",
        tcas="http:///uima/tcas.ecore",
        xmi="http://www.omg.org/XMI",
        cas="http:///uima/cas.ecore",
        tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore",
        morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore",
        dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore",
        type5="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore",
        type8="http:///de/tudarmstadt/ukp/dkpro/core/api/transform/type.ecore",
        type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore",
        type2="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore",
        type9="http:///org/dkpro/core/api/xml/type.ecore",
        type3="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore",
        type4="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore",
        type="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore",
        type6="http:///de/tudarmstadt/ukp/dkpro/core/api/structure/type.ecore",
        constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore",
        chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore",
    )
    return prefix_to_uri
def get_tag_mapping():
    """Return the mapping from named-entity labels to the inline tag names.

    Each ``*deriv`` label maps to the same tag as its base label
    (``LOC``/``PER``/``ORG``). A new dict is returned on every call.
    """
    label_tag_pairs = (
        ("LOC", "placeName"),
        ("PER", "persName"),
        ("ORG", "orgName"),
        ("LOCderiv", "placeName"),
        ("PERderiv", "persName"),
        ("ORGderiv", "orgName"),
    )
    return dict(label_tag_pairs)
def load_xmi(xmi_input):
    """Parse the XMI file at *xmi_input* and return its root element.

    The file is read in binary mode and the raw bytes are handed to
    ``etree.fromstring``. Only the parsed root is returned; the namespace
    table comes from ``get_namespaces``.
    """
    with open(xmi_input, "rb") as xmi_file:
        return etree.fromstring(xmi_file.read())
def get_lemma(token_el, tree, ns):
    """Return the lemma string referenced by a token element, or ``None``.

    Args:
        token_el: Token element; its ``lemma`` attribute holds the
            ``xmi:id`` of the lemma annotation.
        tree: Element on which the document-wide XPath lookup is run.
        ns: Prefix -> URI mapping; must define ``type4`` and ``xmi``.

    Returns:
        The ``value`` attribute of the first matching ``type4:Lemma``
        element, or ``None`` when the token has no ``lemma`` attribute, no
        element matches, or the match has no ``value`` attribute.
    """
    if (lemma_id := token_el.get("lemma")) is None:
        return None
    # Pass the id as an XPath variable instead of formatting it into the
    # expression, so a quote inside the id cannot break or alter the query.
    matches = tree.xpath(
        "//type4:Lemma[@xmi:id=$lemma_id]",
        namespaces=ns,
        lemma_id=lemma_id,
    )
    if not matches:
        return None
    return matches[0].attrib.get("value")
def get_pos(token_el, tree, ns):
    """Return the coarse part-of-speech value referenced by a token, or ``None``.

    Args:
        token_el: Token element; its ``pos`` attribute holds the ``xmi:id``
            of the POS annotation.
        tree: Element on which the document-wide XPath lookup is run.
        ns: Prefix -> URI mapping; must define ``pos`` and ``xmi``.

    Returns:
        The ``coarseValue`` attribute of the first matching ``pos:POS``
        element, or ``None`` when the token has no ``pos`` attribute, no
        element matches, or the match has no ``coarseValue`` attribute.
    """
    if (pos_id := token_el.get("pos")) is None:
        return None
    # Pass the id as an XPath variable instead of formatting it into the
    # expression, so a quote inside the id cannot break or alter the query.
    matches = tree.xpath(
        "//pos:POS[@xmi:id=$pos_id]",
        namespaces=ns,
        pos_id=pos_id,
    )
    if not matches:
        return None
    return matches[0].attrib.get("coarseValue")
def get_morph(token_el, tree, ns):
    """Return the morphological feature string referenced by a token, or ``None``.

    Args:
        token_el: Token element; its ``morph`` attribute holds the
            ``xmi:id`` of the morphological-features annotation.
        tree: Element on which the document-wide XPath lookup is run.
        ns: Prefix -> URI mapping; must define ``morph`` and ``xmi``.

    Returns:
        The ``value`` attribute of the first matching
        ``morph:MorphologicalFeatures`` element, or ``None`` when the token
        has no ``morph`` attribute, no element matches, or the match has no
        ``value`` attribute.
    """
    if (morph_id := token_el.get("morph")) is None:
        return None
    # Pass the id as an XPath variable instead of formatting it into the
    # expression, so a quote inside the id cannot break or alter the query.
    matches = tree.xpath(
        "//morph:MorphologicalFeatures[@xmi:id=$morph_id]",
        namespaces=ns,
        morph_id=morph_id,
    )
    if not matches:
        return None
    return matches[0].attrib.get("value")
def parse_tokens(tree, ns):
    """Build one annotation dict per ``type4:Token`` element matched from *tree*.

    Each dict contains:

    * ``start_char`` / ``end_char`` -- the token's ``begin`` / ``end``
      attributes converted to ``int``;
    * ``tag`` -- always ``"token"``;
    * ``attrib`` -- the subset of ``lemma``, ``pos`` and ``morph`` for which
      the corresponding ``get_*`` helper returned a value.

    Returns the list of dicts in the order the tokens were matched.
    """
    # NOTE(review): every get_* helper issues its own ``//`` XPath query, so
    # each token triggers three document-wide lookups; on large files an
    # id -> element index built once would likely be much faster.
    parsed = []
    matched_tokens = tree.xpath("type4:Token", namespaces=ns)
    for token_el in tqdm(matched_tokens, desc="parse tokens"):
        first_char = int(token_el.attrib["begin"])
        last_char = int(token_el.attrib["end"])
        resolved = {
            "lemma": get_lemma(token_el, tree, ns),
            "pos": get_pos(token_el, tree, ns),
            "morph": get_morph(token_el, tree, ns),
        }
        parsed.append({
            "start_char": first_char,
            "end_char": last_char,
            "tag": "token",
            # Keep only the features that were actually found.
            "attrib": {
                name: value
                for name, value in resolved.items()
                if value is not None
            },
        })
    return parsed
def parse_entities(tree, ns):
    """Build one annotation dict per usable ``type3:NamedEntity`` element.

    Elements lacking any of the ``begin``, ``end`` or ``value`` attributes
    are skipped. For the others the dict contains ``start_char`` /
    ``end_char`` (``begin`` / ``end`` as ``int``), ``tag`` (the entity's
    ``value`` translated through ``get_tag_mapping``) and ``attrib``, which
    is ``{"ref": <identifier>}`` when the element has an ``identifier``
    attribute and ``{}`` otherwise.

    Raises:
        KeyError: if an entity's ``value`` is not a key of the tag mapping.
    """
    label_to_tag = get_tag_mapping()
    required = ("begin", "end", "value")
    found = []
    matched_entities = tree.xpath("type3:NamedEntity", namespaces=ns)
    for entity in tqdm(matched_entities, desc="parse entities"):
        attrs = entity.attrib
        if not all(name in attrs for name in required):
            continue
        record = {
            "start_char": int(attrs["begin"]),
            "end_char": int(attrs["end"]),
            "tag": label_to_tag[attrs["value"]],
            "attrib": {},
        }
        if "identifier" in attrs:
            record["attrib"] = {"ref": attrs["identifier"]}
        found.append(record)
    return found
def add_annotations(so, view, new_elems):
    """Insert every annotation in *new_elems* into *so* as an inline element.

    Args:
        so: Object providing ``add_inline`` (the stand-off document).
        view: Object providing ``get_table_pos`` to translate a character
            offset into a table position.
        new_elems: Iterable of dicts with the keys ``start_char``,
            ``end_char``, ``tag`` and ``attrib``.
    """
    for annotation in tqdm(new_elems, desc="add annotations"):
        begin_pos = view.get_table_pos(annotation["start_char"])
        # Look up the position of the last covered character and step one
        # past it (presumably because ``end_char`` is exclusive -- confirm).
        last_char_pos = view.get_table_pos(annotation["end_char"] - 1)
        so.add_inline(
            begin=begin_pos,
            end=last_char_pos + 1,
            tag=annotation["tag"],
            depth=None,
            attrib=annotation["attrib"],
        )
def main(
    xmi_path=Path("in_data/Melanges.xmi"),
    tei_path=Path("in_data/Melanges_raw.xml"),
    out_path=Path("out_data/Melanges_entities.tei.xml"),
    *,
    include_tokens=False,
):
    """Merge the annotations of an XMI file into a raw TEI file.

    Args:
        xmi_path: XMI file holding the named-entity (and token) annotations.
        tei_path: Raw TEI XML file the annotations are added to.
        out_path: Destination of the annotated TEI document.
        include_tokens: When true, token annotations are parsed and added
            after the entities. Off by default: the tokens were never
            written to the output, so parsing them was wasted work.

    The defaults reproduce the original hard-coded paths, so calling
    ``main()`` without arguments still converts the Melanges files.
    """
    # load information from the xmi format
    xmi_tree = load_xmi(xmi_path)
    xmi_namespaces = get_namespaces()
    tokens = parse_tokens(xmi_tree, xmi_namespaces) if include_tokens else []
    entities = parse_entities(xmi_tree, xmi_namespaces)

    # load tei raw file
    tei_namespaces = {
        "tei": "http://www.tei-c.org/ns/1.0",
    }
    with open(tei_path, "rb") as fin:
        raw_tree = etree.fromstring(fin.read())

    # create standoff data structure
    so = Standoff(raw_tree, namespaces=tei_namespaces)
    view = View(so)

    # add entities (and optionally tokens) to the standoff
    add_annotations(so, view, entities)
    if include_tokens:
        add_annotations(so, view, tokens)

    # write standoff to file
    out_str = etree.tostring(so.tree, encoding="unicode")
    # The serialised string carries no XML declaration, so readers will
    # assume UTF-8; write it as UTF-8 explicitly instead of relying on the
    # platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as fout:
        fout.write(out_str)
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment