Skip to content

Instantly share code, notes, and snippets.

@kba
Created October 2, 2020 12:05
Show Gist options
  • Save kba/78783ea23e1e28989fa91b643a2fd15d to your computer and use it in GitHub Desktop.
Save kba/78783ea23e1e28989fa91b643a2fd15d to your computer and use it in GitHub Desktop.
# tei2textpages.py
import lxml.etree as ET
import sys
import re
# Sentinel substituted for every <pb> element so that page boundaries
# survive the markup-stripping text extraction.
PAGE_BREAK_MARKER = '!!FORMFEED!!'


def mark_page_breaks(tei_xml):
    """Return *tei_xml* prepared for text extraction.

    Three textual substitutions are applied, in order:

    1. The leading XML declaration is removed (lxml rejects ``str``
       input that still carries an encoding declaration).
    2. Every ``<teiHeader>...</teiHeader>`` section is removed so that
       header metadata does not end up in the page text.
    3. Every ``<pb ...>`` tag is replaced by ``PAGE_BREAK_MARKER``.

    The ``facs``/``n`` attributes of ``<pb>`` are discarded; pages are
    identified only by their order in the document.
    """
    # Require whitespace after "xml" so processing instructions such as
    # <?xml-stylesheet ...?> are left alone.
    tei_xml = re.sub(r'<\?xml\s[^?]*\?>', '', tei_xml, count=1)
    # DOTALL: headers normally span many lines. Non-greedy: stop at the
    # first closing tag in case the document holds several headers.
    tei_xml = re.sub(
        r'<teiHeader\b[^>]*>.*?</teiHeader>', '', tei_xml, flags=re.DOTALL
    )
    # \b keeps elements whose name merely starts with "pb" untouched.
    return re.sub(r'<pb\b[^>]*>', PAGE_BREAK_MARKER, tei_xml)


def tei_to_pages(tei_xml):
    """Split the text content of a TEI document into per-page strings.

    Returns a list of strings: element 0 is whatever text precedes the
    first page break, each following element is the text after one
    ``<pb>``.
    """
    marked = mark_page_breaks(tei_xml)
    text = ''.join(ET.fromstring(marked).itertext())
    return text.split(PAGE_BREAK_MARKER)


def main(argv=None):
    """Write one ``<tei_file>_NNNN.txt`` file per page of a TEI file.

    *argv* is the list of command-line arguments without the program
    name; it defaults to ``sys.argv[1:]``. Exactly one argument, the
    path of the TEI file, is expected. Input is read and output is
    written as UTF-8.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 1:
        sys.exit('Usage: tei2textpages.py TEI_FILE')
    tei_file = args[0]
    with open(tei_file, 'r', encoding='utf-8') as f:
        tei_xml = f.read()
    for n, chunk in enumerate(tei_to_pages(tei_xml)):
        with open('%s_%04d.txt' % (tei_file, n), 'w', encoding='utf-8') as outf:
            outf.write(chunk)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment