Created
December 18, 2014 18:03
-
-
Save frederik-elwert/6abd6b5b07d210b040bd to your computer and use it in GitHub Desktop.
TEI to TCF conversion using TCFlib
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import os | |
import re | |
from glob import glob | |
import logging | |
from lxml import etree | |
from tcflib.service import ImportingWorker, run_as_cli | |
from tcflib.tcf import (TextCorpus, Text, Tokens, Token, TextStructure, TextSpan, | |
Lemmas, POStags, NamedEntities, NamedEntity, References, Reference, | |
Entity, Wsd, Sentences, Sentence) | |
# XML namespace prefixes used in every XPath query of this module.
NSS = {
    'tei': 'http://www.tei-c.org/ns/1.0',
    'dcr': 'http://www.isocat.org/ns/dcr',
}

# Local names of the TEI elements whose child words are treated as
# named-entity candidates.
NAME_TAGS = ('persName', 'addName', 'roleName', 'placeName', 'name')

# Named-entity class keyed by the naming element's
# (tag, @type, @subtype); ``None`` stands for an absent attribute.
# NOTE(review): 'name' is listed in NAME_TAGS but has no entry here, and
# neither does any type/subtype combination not spelled out below.
NAME2TYPE = {
    ('persName', None, None): 'Person',
    ('persName', 'royal', None): 'Royal',
    ('persName', 'deity', None): 'Deity',
    ('addName', 'epithet', 'royal'): 'EpithetRoyal',
    ('addName', 'epithet', 'deity'): 'EpithetDeity',
    ('roleName', None, None): 'Title',
    ('placeName', None, None): 'Place',
}
class TEIImporter(ImportingWorker):
    """Importing worker that converts a TEI document into a ``TextCorpus``.

    ``setup`` parses the input and collects the pointer rewrite patterns,
    ``import_`` creates the corpus layers and ``add_tei`` fills them from
    the ``tei:w`` elements that carry a ``lemmaRef`` attribute.
    """

    # Option defaults. ``basedir`` is joined in front of support-file names
    # in ``resolve_pointer`` (read there as ``self.options.basedir``).
    __options__ = {
        'basedir': '',
    }

    def resolve_pointer(self, pointer):
        """Return the element that *pointer* refers to.

        The first ``cRefPattern`` rule (collected in ``setup``) that matches
        the start of *pointer* is applied with ``re.sub``. The result must
        have the form ``[file]#id``. An empty file part is looked up in the
        input document; otherwise the file is parsed once relative to the
        ``basedir`` option and its ID dictionary is cached.

        Raises:
            ValueError: if the pointer does not split into exactly one file
                part and one ID part at ``#``.
            IndexError: if the ID is not found in the input document.
            KeyError: if the ID is not found in the support file.
        """
        for pattern, repl in self.patterns.items():
            if re.match(pattern, pointer):
                pointer = re.sub(pattern, repl, pointer)
                break
        try:
            file_, id_ = pointer.split('#')
        except ValueError:
            raise ValueError('pointer "{}" contains no ID reference'.format(
                pointer))
        if not file_:
            # Same-document reference: resolve through the XPath id() function.
            return self.input_tree.xpath('id($id)', id=id_)[0]
        if file_ not in self.support_files:
            logging.debug('Loading support file %s.', file_)
            filepath = os.path.join(self.options.basedir, file_)
            # parseid() returns (tree, id-dict); only the dict is kept.
            _, self.support_files[file_] = etree.parseid(filepath,
                                                         self.parser)
        return self.support_files[file_][id_]

    def setup(self, input_data):
        """Parse *input_data* and collect the pointer rewrite patterns.

        Sets ``self.support_files`` (empty cache), ``self.parser``,
        ``self.input_tree`` and ``self.patterns``. The latter maps each
        ``matchPattern`` of the first ``refsDecl`` to its
        ``replacementPattern``.
        """
        self.support_files = {}
        self.parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
        input_root = etree.fromstring(input_data, parser=self.parser)
        self.input_tree = etree.ElementTree(input_root)
        self.patterns = {}
        for pattern in self.input_tree.xpath(
                '//tei:refsDecl[1]/tei:cRefPattern', namespaces=NSS):
            match_pattern = pattern.get('matchPattern')
            replacement = pattern.get('replacementPattern')
            if match_pattern is None or replacement is None:
                # An incomplete rule cannot be applied; skip it instead of
                # failing on ``None``.
                logging.warning('Ignoring incomplete cRefPattern '
                                '(matchPattern=%r, replacementPattern=%r).',
                                match_pattern, replacement)
                continue
            # Every ``$`` becomes a backslash, presumably to turn ``$1``
            # group references into the ``\1`` form that ``re.sub`` expects.
            self.patterns[match_pattern] = replacement.replace('$', '\\')

    def import_(self):
        """Build and return the ``TextCorpus`` for the parsed input.

        Raises:
            NotImplementedError: if the root element is a TEI corpus.
        """
        self.corpus = TextCorpus()
        self.corpus.add_layer(Text(''))
        self.corpus.add_layer(Tokens())
        self.corpus.add_layer(Sentences())
        self.corpus.add_layer(Lemmas())
        self.corpus.add_layer(NamedEntities('TLA'))
        self.corpus.add_layer(Wsd('TLA'))
        self.corpus.add_layer(POStags('DC-1345'))
        self.corpus.add_layer(TextStructure())
        # Process text line by line
        self.text = []
        root = self.input_tree.getroot()
        # TEI P5 names the corpus root ``teiCorpus``; the ``TEICorpus``
        # spelling is still accepted for backward compatibility.
        corpus_tags = (etree.QName(NSS['tei'], 'teiCorpus').text,
                       etree.QName(NSS['tei'], 'TEICorpus').text)
        if root.tag == etree.QName(NSS['tei'], 'TEI').text:
            # Single TEI
            logging.debug('Found single TEI document.')
            self.add_tei(root)
        elif root.tag in corpus_tags:
            # TEI corpus
            raise NotImplementedError(
                'TEI corpus documents are not supported yet')
        else:
            # Still returns an (empty) corpus, but no longer silently.
            logging.warning('Unsupported root element %s; the corpus will '
                            'be empty.', root.tag)
        self.corpus.text.text = '\n'.join(self.text)
        return self.corpus

    def add_tei(self, teiroot):
        """Add all ``tei:text[@type="text"]`` elements below *teiroot*.

        For every ``tei:s`` one line of plain text is appended to
        ``self.text`` and one ``Sentence`` to the corpus. Only ``tei:w``
        elements with a ``lemmaRef`` attribute become tokens.
        """
        for text in teiroot.xpath('.//tei:text[@type="text"]',
                                  namespaces=NSS):
            span_text = TextSpan(type='text')
            for sentence in text.xpath('.//tei:s', namespaces=NSS):
                # The plain-text line is built from the sentence's direct
                # children, independent of which words become tokens.
                self.text.append(' '.join(
                    [child.xpath('normalize-space(.)')
                     for child in sentence]))
                span_sent = Sentence()
                for word in sentence.xpath('.//tei:w[@lemmaRef]',
                                           namespaces=NSS):
                    token = self._make_token(word)
                    # Named Entity
                    class_ = self._name_class(word)
                    if class_ is not None:
                        self.corpus.namedentities.append(
                            NamedEntity(class_=class_, tokens=[token]))
                    # Add token to text spans and corpus
                    span_sent.tokens.append(token)
                    span_text.tokens.append(token)
                    self.corpus.tokens.append(token)
                self.corpus.sentences.append(span_sent)
            self.corpus.textstructure.append(span_text)

    def _make_token(self, word):
        """Create the ``Token`` for *word* from its dictionary entry."""
        # Get corresponding dictionary entry
        entry = self.resolve_pointer(word.get('lemmaRef'))
        token = Token(word.xpath('normalize-space(.)'))
        # Lemma
        token.lemma = entry.xpath('normalize-space(tei:form/tei:orth)',
                                  namespaces=NSS)
        # Wordsenses
        token.wordsenses = [entry.xpath('@xml:id')[0]]
        # POS: looked up in the dictionary entry.
        # NOTE: an earlier approach resolved the pointers in the word's
        # ``tei:fs/@feats`` and used ``tei:symbol/@value`` of the feature
        # whose ``dcr:datcat`` is http://www.isocat.org/datcat/DC-1345;
        # it failed for words without POS features.
        token.tag = entry.xpath('normalize-space(tei:gramGrp/tei:pos)',
                                namespaces=NSS)
        return token

    def _name_class(self, word):
        """Return the named-entity class for *word*, or ``None``.

        The class is determined by the tag, ``type`` and ``subtype`` of the
        word's parent element. Combinations without a ``NAME2TYPE`` entry
        are reported and skipped instead of aborting the import.
        """
        parent = word.getparent()
        parent_tag = etree.QName(parent).localname
        if parent_tag not in NAME_TAGS:
            return None
        key = (parent_tag, parent.get('type'), parent.get('subtype'))
        class_ = NAME2TYPE.get(key)
        if class_ is None:
            logging.warning('No named entity class for %r; skipping.', key)
        return class_
if __name__ == '__main__':
    # Executed as a script: pass the worker class to tcflib's CLI runner.
    run_as_cli(TEIImporter)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, thanks for your code! Do you know if a TCF-to-TEI version of your script exists?
Thanks in advance,
Djamé