millawell · May 10, 2022 18:47
diff --git a/convert xmi to tei.py b/convert xmi to tei.py
 from lxml import etree
 from pathlib import Path
 from standoffconverter import  Standoff, View
 from tqdm import tqdm

 def get_namespaces():
    """Return the prefix-to-URI namespace map handed to the XPath queries on the XMI tree."""
    namespaces = {
        "pos": "http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore",
        "tcas": "http:///uima/tcas.ecore",
        "xmi": "http://www.omg.org/XMI",
        "cas": "http:///uima/cas.ecore",
        "tweet": "http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore",
        "morph": "http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore",
        "dependency": "http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore",
        "type5": "http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore",
        "type8": "http:///de/tudarmstadt/ukp/dkpro/core/api/transform/type.ecore",
        "type7": "http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore",
        "type2": "http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore",
        "type9": "http:///org/dkpro/core/api/xml/type.ecore",
        "type3": "http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore",
        "type4": "http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore",
        "type": "http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore",
        "type6": "http:///de/tudarmstadt/ukp/dkpro/core/api/structure/type.ecore",
        "constituent": "http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore",
        "chunk": "http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore",
    }
    return namespaces


 def get_tag_mapping():
    """
    Return the mapping from named-entity labels to output element names.

    A label and its ``deriv`` variant map to the same element name.
    """
    label_to_element = (
        ("LOC", "placeName"),
        ("PER", "persName"),
        ("ORG", "orgName"),
        ("LOCderiv", "placeName"),
        ("PERderiv", "persName"),
        ("ORGderiv", "orgName"),
    )
    return dict(label_to_element)


 def load_xmi(xmi_input):
    """
    Parse the XMI file at ``xmi_input`` and return its root element.

    The file is read in binary mode and the raw bytes are handed to
    ``etree.fromstring``. Only the parsed element is returned.
    """
    with open(xmi_input, "rb") as xmi_file:
        raw_bytes = xmi_file.read()

    root = etree.fromstring(raw_bytes)
    return root

 def get_lemma(token_el, tree, ns):
    """
    Return the lemma string of a token, or ``None`` if it cannot be resolved.

    The ``lemma`` attribute of ``token_el`` holds the ``xmi:id`` of a
    ``type4:Lemma`` element; that element is searched for in the whole
    document and its ``value`` attribute is returned.

    Parameters:
        token_el: token element; only its ``get`` method is used.
        tree: lxml element the XPath query is evaluated on.
        ns: prefix-to-URI map declaring the ``type4`` and ``xmi`` prefixes.

    Returns:
        The ``value`` attribute of the first match. ``None`` when the token
        has no ``lemma`` attribute, nothing matches the id, or the match has
        no ``value`` attribute.
    """
    if (lemma_id := token_el.get("lemma")) is None:
        return None

    # The id is passed as an XPath variable instead of being formatted into
    # the expression, so an id containing a quote cannot break the query.
    matches = tree.xpath(
        "//type4:Lemma[@xmi:id=$lemma_id]",
        namespaces=ns,
        lemma_id=lemma_id,
    )
    if not matches:
        return None

    return matches[0].attrib.get("value")


 def get_pos(token_el, tree, ns):
    """
    Return the coarse part-of-speech value of a token, or ``None``.

    The ``pos`` attribute of ``token_el`` holds the ``xmi:id`` of a
    ``pos:POS`` element; that element is searched for in the whole document
    and its ``coarseValue`` attribute is returned.

    Parameters:
        token_el: token element; only its ``get`` method is used.
        tree: lxml element the XPath query is evaluated on.
        ns: prefix-to-URI map declaring the ``pos`` and ``xmi`` prefixes.

    Returns:
        The ``coarseValue`` attribute of the first match. ``None`` when the
        token has no ``pos`` attribute, nothing matches the id, or the match
        has no ``coarseValue`` attribute.
    """
    if (pos_id := token_el.get("pos")) is None:
        return None

    # The id is passed as an XPath variable instead of being formatted into
    # the expression, so an id containing a quote cannot break the query.
    matches = tree.xpath(
        "//pos:POS[@xmi:id=$pos_id]",
        namespaces=ns,
        pos_id=pos_id,
    )
    if not matches:
        return None

    return matches[0].attrib.get("coarseValue")


 def get_morph(token_el, tree, ns):
    """
    Return the morphological feature string of a token, or ``None``.

    The ``morph`` attribute of ``token_el`` holds the ``xmi:id`` of a
    ``morph:MorphologicalFeatures`` element; that element is searched for in
    the whole document and its ``value`` attribute is returned.

    Parameters:
        token_el: token element; only its ``get`` method is used.
        tree: lxml element the XPath query is evaluated on.
        ns: prefix-to-URI map declaring the ``morph`` and ``xmi`` prefixes.

    Returns:
        The ``value`` attribute of the first match. ``None`` when the token
        has no ``morph`` attribute, nothing matches the id, or the match has
        no ``value`` attribute.
    """
    if (morph_id := token_el.get("morph")) is None:
        return None

    # The id is passed as an XPath variable instead of being formatted into
    # the expression, so an id containing a quote cannot break the query.
    matches = tree.xpath(
        "//morph:MorphologicalFeatures[@xmi:id=$morph_id]",
        namespaces=ns,
        morph_id=morph_id,
    )
    if not matches:
        return None

    return matches[0].attrib.get("value")


 def parse_tokens(tree, ns):
    """
    Collect the ``type4:Token`` children of ``tree`` as annotation dicts.

    Every dict carries ``start_char`` and ``end_char`` (the token's ``begin``
    and ``end`` attributes converted to int), the tag ``"token"`` and an
    ``attrib`` dict holding whichever of ``lemma``, ``pos`` and ``morph``
    could be resolved for that token.
    """
    lookups = (
        ("lemma", get_lemma),
        ("pos", get_pos),
        ("morph", get_morph),
    )

    tokens = []
    for token_el in tqdm(tree.xpath('type4:Token', namespaces=ns), desc="parse tokens"):
        record = {
            "start_char": int(token_el.attrib['begin']),
            "end_char": int(token_el.attrib['end']),
            "tag": "token",
            "attrib": {},
        }
        tokens.append(record)

        # Only annotations that actually resolve end up in the attributes.
        for key, lookup in lookups:
            value = lookup(token_el, tree, ns)
            if value is not None:
                record["attrib"][key] = value

    return tokens

 def parse_entities(tree, ns):
    """
    Collect the ``type3:NamedEntity`` children of ``tree`` as annotation dicts.

    Parameters:
        tree: lxml element the XPath query is evaluated on.
        ns: prefix-to-URI map declaring the ``type3`` prefix.

    Returns:
        A list of dicts with ``start_char`` and ``end_char`` (the entity's
        ``begin`` and ``end`` attributes as int), ``tag`` (the element name
        from ``get_tag_mapping()``) and ``attrib``, which holds the entity's
        ``identifier`` under ``ref`` when present and is empty otherwise.

    Entities lacking ``begin``, ``end`` or ``value`` are skipped, and so are
    entities whose ``value`` has no entry in ``get_tag_mapping()``.
    """
    entity_elements = tree.xpath(
        "type3:NamedEntity",
        namespaces=ns
    )

    tag_map = get_tag_mapping()

    entities = []
    for entity in tqdm(entity_elements, desc="parse entities"):
        attrs = entity.attrib
        if 'begin' not in attrs or 'end' not in attrs or 'value' not in attrs:
            continue

        # A label without a mapping used to raise KeyError and abort the
        # whole run; such entities are now left out like incomplete ones.
        tag = tag_map.get(attrs['value'])
        if tag is None:
            continue

        entities.append({
            "start_char": int(attrs['begin']),
            "end_char": int(attrs['end']),
            "tag": tag,
            "attrib": {"ref": attrs['identifier']} if 'identifier' in attrs else {},
        })

    return entities


 def add_annotations(so, view, new_elems):
    """
    Insert every annotation dict in ``new_elems`` into ``so`` as an inline element.

    Each dict supplies ``start_char``, ``end_char``, ``tag`` and ``attrib``.
    The character offsets are translated with ``view.get_table_pos``
    (presumably plain-text offset to standoff table position; confirm
    against standoffconverter).
    """
    for annotation in tqdm(new_elems, desc="add annotations"):
        first_pos = view.get_table_pos(annotation["start_char"])
        # The position of the last covered character is looked up and then
        # incremented, rather than looking up ``end_char`` itself.
        last_pos = view.get_table_pos(annotation["end_char"]-1)
        stop_pos = last_pos+1

        so.add_inline(
            begin=first_pos,
            end=stop_pos,
            tag=annotation["tag"],
            depth=None,
            attrib=annotation["attrib"],
        )


 def main():
    """
    Add the named entities of ``in_data/Melanges.xmi`` to the raw TEI file.

    Reads ``in_data/Melanges.xmi`` and ``in_data/Melanges_raw.xml``, inserts
    one inline element per parsed entity and writes the result to
    ``out_data/Melanges_entities.tei.xml``.
    """
    # load information from the xmi format
    xmi_tree = load_xmi(Path("in_data/Melanges.xmi"))
    xmi_namespaces = get_namespaces()

    tokens = parse_tokens(xmi_tree, xmi_namespaces)
    entities = parse_entities(xmi_tree, xmi_namespaces)

    # load tei raw file
    xml_raw = Path("in_data/Melanges_raw.xml")
    tei_namespaces = {
        "tei": "http://www.tei-c.org/ns/1.0"
    }
    with open(xml_raw, "rb") as fin:
        raw_tree = etree.fromstring(fin.read())
    # create standoff data structure
    so = Standoff(raw_tree, namespaces=tei_namespaces)
    view = View(so)

    # add entities to the standoff; tokens are parsed above but only
    # written when the second call is enabled
    add_annotations(so, view, entities)
    # add_annotations(so, view, tokens)

    # write standoff to file
    out_str = etree.tostring(so.tree, encoding="unicode")
    # out_str is text, so the file encoding is set explicitly instead of
    # depending on the platform default, which may not cover every character.
    with open(Path("out_data/Melanges_entities.tei.xml"), "w", encoding="utf-8") as fout:
        fout.write(out_str)


 if __name__ == "__main__":
    main()
	from lxml import etree
	from pathlib import Path
	from standoffconverter import Standoff, View
	from tqdm import tqdm

	def get_namespaces():
	    """Return the prefix-to-URI namespace map handed to the XPath queries on the XMI tree."""
	    return {
	        "pos": "http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore",
	        "tcas": "http:///uima/tcas.ecore",
	        "xmi": "http://www.omg.org/XMI",
	        "cas": "http:///uima/cas.ecore",
	        "tweet": "http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore",
	        "morph": "http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore",
	        "dependency": "http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore",
	        "type5": "http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore",
	        "type8": "http:///de/tudarmstadt/ukp/dkpro/core/api/transform/type.ecore",
	        "type7": "http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore",
	        "type2": "http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore",
	        "type9": "http:///org/dkpro/core/api/xml/type.ecore",
	        "type3": "http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore",
	        "type4": "http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore",
	        "type": "http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore",
	        "type6": "http:///de/tudarmstadt/ukp/dkpro/core/api/structure/type.ecore",
	        "constituent": "http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore",
	        "chunk": "http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore",
	    }


	def get_tag_mapping():
	    """
	    Return the mapping from named-entity labels to output element names.

	    A label and its ``deriv`` variant map to the same element name.
	    """
	    return {
	        "LOC": "placeName",
	        "PER": "persName",
	        "ORG": "orgName",
	        "LOCderiv": "placeName",
	        "PERderiv": "persName",
	        "ORGderiv": "orgName",
	    }


	def load_xmi(xmi_input):
	    """
	    Parse the XMI file at ``xmi_input`` and return its root element.

	    The file is read in binary mode and the raw bytes are handed to
	    ``etree.fromstring``. Only the parsed element is returned.
	    """
	    with open(xmi_input, "rb") as fin:
	        xml_str = fin.read()

	    tree = etree.fromstring(xml_str)

	    return tree

	def get_lemma(token_el, tree, ns):
	    """
	    Return the lemma string of a token, or ``None`` if it cannot be resolved.

	    The ``lemma`` attribute of ``token_el`` holds the ``xmi:id`` of a
	    ``type4:Lemma`` element; that element is searched for in the whole
	    document and its ``value`` attribute is returned.

	    Parameters:
	        token_el: token element; only its ``get`` method is used.
	        tree: lxml element the XPath query is evaluated on.
	        ns: prefix-to-URI map declaring the ``type4`` and ``xmi`` prefixes.

	    Returns:
	        The ``value`` attribute of the first match. ``None`` when the token
	        has no ``lemma`` attribute, nothing matches the id, or the match has
	        no ``value`` attribute.
	    """
	    if (lemma_id := token_el.get("lemma")) is None:
	        return None

	    # The id is passed as an XPath variable instead of being formatted into
	    # the expression, so an id containing a quote cannot break the query.
	    matches = tree.xpath(
	        "//type4:Lemma[@xmi:id=$lemma_id]",
	        namespaces=ns,
	        lemma_id=lemma_id,
	    )
	    if not matches:
	        return None

	    return matches[0].attrib.get("value")


	def get_pos(token_el, tree, ns):
	    """
	    Return the coarse part-of-speech value of a token, or ``None``.

	    The ``pos`` attribute of ``token_el`` holds the ``xmi:id`` of a
	    ``pos:POS`` element; that element is searched for in the whole document
	    and its ``coarseValue`` attribute is returned.

	    Parameters:
	        token_el: token element; only its ``get`` method is used.
	        tree: lxml element the XPath query is evaluated on.
	        ns: prefix-to-URI map declaring the ``pos`` and ``xmi`` prefixes.

	    Returns:
	        The ``coarseValue`` attribute of the first match. ``None`` when the
	        token has no ``pos`` attribute, nothing matches the id, or the match
	        has no ``coarseValue`` attribute.
	    """
	    if (pos_id := token_el.get("pos")) is None:
	        return None

	    # The id is passed as an XPath variable instead of being formatted into
	    # the expression, so an id containing a quote cannot break the query.
	    matches = tree.xpath(
	        "//pos:POS[@xmi:id=$pos_id]",
	        namespaces=ns,
	        pos_id=pos_id,
	    )
	    if not matches:
	        return None

	    return matches[0].attrib.get("coarseValue")


	def get_morph(token_el, tree, ns):
	    """
	    Return the morphological feature string of a token, or ``None``.

	    The ``morph`` attribute of ``token_el`` holds the ``xmi:id`` of a
	    ``morph:MorphologicalFeatures`` element; that element is searched for in
	    the whole document and its ``value`` attribute is returned.

	    Parameters:
	        token_el: token element; only its ``get`` method is used.
	        tree: lxml element the XPath query is evaluated on.
	        ns: prefix-to-URI map declaring the ``morph`` and ``xmi`` prefixes.

	    Returns:
	        The ``value`` attribute of the first match. ``None`` when the token
	        has no ``morph`` attribute, nothing matches the id, or the match has
	        no ``value`` attribute.
	    """
	    if (morph_id := token_el.get("morph")) is None:
	        return None

	    # The id is passed as an XPath variable instead of being formatted into
	    # the expression, so an id containing a quote cannot break the query.
	    matches = tree.xpath(
	        "//morph:MorphologicalFeatures[@xmi:id=$morph_id]",
	        namespaces=ns,
	        morph_id=morph_id,
	    )
	    if not matches:
	        return None

	    return matches[0].attrib.get("value")


	def parse_tokens(tree, ns):
	    """
	    Collect the ``type4:Token`` children of ``tree`` as annotation dicts.

	    Parameters:
	        tree: lxml element the XPath query is evaluated on.
	        ns: prefix-to-URI map declaring the prefixes used by the queries.

	    Returns:
	        A list of dicts with ``start_char`` and ``end_char`` (the token's
	        ``begin`` and ``end`` attributes as int), the tag ``"token"`` and an
	        ``attrib`` dict holding whichever of ``lemma``, ``pos`` and ``morph``
	        could be resolved for that token.
	    """
	    token_elements = tree.xpath(
	        'type4:Token',
	        namespaces=ns
	    )

	    tokens = []
	    for token_el in tqdm(token_elements, desc="parse tokens"):
	        begin_ind = int(token_el.attrib['begin'])
	        end_ind = int(token_el.attrib['end'])

	        attrib = {}
	        tokens.append({
	            "start_char": begin_ind,
	            "end_char": end_ind,
	            "tag": "token",
	            "attrib": attrib,
	        })

	        lemma = get_lemma(token_el, tree, ns)
	        pos = get_pos(token_el, tree, ns)
	        morph = get_morph(token_el, tree, ns)

	        # Only annotations that actually resolve end up in the attributes.
	        if lemma is not None:
	            attrib["lemma"] = lemma
	        if pos is not None:
	            attrib["pos"] = pos
	        if morph is not None:
	            attrib["morph"] = morph

	    return tokens

	def parse_entities(tree, ns):
	    """
	    Collect the ``type3:NamedEntity`` children of ``tree`` as annotation dicts.

	    Parameters:
	        tree: lxml element the XPath query is evaluated on.
	        ns: prefix-to-URI map declaring the ``type3`` prefix.

	    Returns:
	        A list of dicts with ``start_char`` and ``end_char`` (the entity's
	        ``begin`` and ``end`` attributes as int), ``tag`` (the element name
	        from ``get_tag_mapping()``) and ``attrib``, which holds the entity's
	        ``identifier`` under ``ref`` when present and is empty otherwise.

	    Entities lacking ``begin``, ``end`` or ``value`` are skipped, and so are
	    entities whose ``value`` has no entry in ``get_tag_mapping()``.
	    """
	    entity_elements = tree.xpath(
	        "type3:NamedEntity",
	        namespaces=ns
	    )

	    tag_map = get_tag_mapping()

	    entities = []
	    for entity in tqdm(entity_elements, desc="parse entities"):
	        attrs = entity.attrib
	        if 'begin' not in attrs or 'end' not in attrs or 'value' not in attrs:
	            continue

	        # A label without a mapping would raise KeyError on direct indexing
	        # and abort the whole run; such entities are left out instead.
	        tag = tag_map.get(attrs['value'])
	        if tag is None:
	            continue

	        entities.append({
	            "start_char": int(attrs['begin']),
	            "end_char": int(attrs['end']),
	            "tag": tag,
	            "attrib": {"ref": attrs['identifier']} if 'identifier' in attrs else {},
	        })

	    return entities


	def add_annotations(so, view, new_elems):
	    """
	    Insert every annotation dict in ``new_elems`` into ``so`` as an inline element.

	    Each dict supplies ``start_char``, ``end_char``, ``tag`` and ``attrib``.
	    The character offsets are translated with ``view.get_table_pos``
	    (presumably plain-text offset to standoff table position; confirm
	    against standoffconverter).
	    """
	    for elem in tqdm(new_elems, desc="add annotations"):
	        start_ind = view.get_table_pos(elem["start_char"])
	        # The position of the last covered character is looked up and then
	        # incremented, rather than looking up ``end_char`` itself.
	        end_ind = view.get_table_pos(elem["end_char"]-1)+1

	        so.add_inline(
	            begin=start_ind,
	            end=end_ind,
	            tag=elem["tag"],
	            depth=None,
	            attrib=elem["attrib"],
	        )


	def main():
	    """
	    Add the named entities of ``in_data/Melanges.xmi`` to the raw TEI file.

	    Reads ``in_data/Melanges.xmi`` and ``in_data/Melanges_raw.xml``, inserts
	    one inline element per parsed entity and writes the result to
	    ``out_data/Melanges_entities.tei.xml``.
	    """
	    # load information from the xmi format
	    xmi_tree = load_xmi(Path("in_data/Melanges.xmi"))
	    xmi_namespaces = get_namespaces()

	    tokens = parse_tokens(xmi_tree, xmi_namespaces)
	    entities = parse_entities(xmi_tree, xmi_namespaces)

	    # load tei raw file
	    xml_raw = Path("in_data/Melanges_raw.xml")
	    tei_namespaces = {
	        "tei": "http://www.tei-c.org/ns/1.0"
	    }
	    with open(xml_raw, "rb") as fin:
	        raw_tree = etree.fromstring(fin.read())
	    # create standoff data structure
	    so = Standoff(raw_tree, namespaces=tei_namespaces)
	    view = View(so)

	    # add entities to the standoff; tokens are parsed above but only
	    # written when the second call is enabled
	    add_annotations(so, view, entities)
	    # add_annotations(so, view, tokens)

	    # write standoff to file
	    out_str = etree.tostring(so.tree, encoding="unicode")
	    # out_str is text, so the file encoding is set explicitly instead of
	    # depending on the platform default, which may not cover every character.
	    with open(Path("out_data/Melanges_entities.tei.xml"), "w", encoding="utf-8") as fout:
	        fout.write(out_str)


	if __name__ == "__main__":
	    main()