Created
April 19, 2013 23:02
-
-
Save JGVerdugo/5423836 to your computer and use it in GitHub Desktop.
Extracting a TEI bilingual dictionary with BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extracts a TEI bilingual vocabulary to a term-tab-term plain-text structure""" | |
"""Note: this script uses the BeautifulSoup library for TEI parsing.""" | |
"""See http://www.crummy.com/software/BeautifulSoup/bs4 for details.""" | |
import codecs
import sys

from bs4 import BeautifulSoup
# Name of the output file; it is created in the current working directory.
OUTPUT_PATH = "dump"


def parse_entries(document, parser="html.parser"):
    """Return every ``entry`` element found in *document*.

    Args:
        document: Raw content of the TEI file. Passing the undecoded bytes
            lets BeautifulSoup work out the character encoding itself.
        parser: Name of the BeautifulSoup tree builder to use. It is named
            explicitly so the result does not depend on which optional
            parser libraries happen to be installed, and so bs4 does not
            warn about having to guess one.

    Returns:
        The list of ``entry`` elements, in document order.
    """
    soup = BeautifulSoup(document, parser)
    return soup.find_all('entry')


def pair_terms(entries):
    """Pair the headword of each entry with each of its translations.

    Args:
        entries: Iterable of parsed ``entry`` elements. Each one is searched
            for ``orth`` (source term) and ``quote`` (target term) children.

    Returns:
        A list of ``(source_text, target_text)`` tuples, one per ``quote``
        element, in document order. Duplicates are kept.

    Raises:
        ValueError: If an entry contains more than one ``orth`` element.
    """
    pairs = []
    for entry in entries:
        sources = entry.find_all('orth')
        targets = entry.find_all('quote')
        # Check if more than 1 source (there are none, but just in case).
        if len(sources) > 1:
            raise ValueError(
                sources[0].text
                + " <<<<<<< This entry has more than 1 source!!"
            )
        if not sources:
            # An entry without a headword has nothing to pair its
            # translations with; report it and move on instead of failing
            # with an IndexError on sources[0].
            if targets:
                sys.stderr.write(
                    "Skipping an entry that has translations but no "
                    "source term.\n"
                )
            continue
        headword = sources[0].text
        # Match each target with its parent source.
        pairs.extend((headword, target.text) for target in targets)
    return pairs


def format_lines(pairs):
    """Return the output lines for *pairs*, sorted and without duplicates.

    Args:
        pairs: Iterable of ``(source_text, target_text)`` tuples.

    Returns:
        A list of ``"source<TAB>target<NEWLINE>"`` strings, one per distinct
        pair, ordered by the sorted order of the tuples.
    """
    # Do not process duplicates (there are many!).
    return [
        source + "\t" + target + "\n"
        for source, target in sorted(set(pairs))
    ]


def main(argv=None):
    """Run the extraction for the file named on the command line.

    Args:
        argv: Argument list in the shape of ``sys.argv`` (program name
            first). Defaults to ``sys.argv``.

    Raises:
        SystemExit: With an explanatory message (printed to stderr, exit
            status 1) when no filename is given, the file cannot be opened,
            or an entry has more than one source term.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("\n Usage: extract-tei.py filename\n")
    filename = argv[1]

    # Basic error control. The file is read as bytes so that encoding
    # detection is left to BeautifulSoup, and the handle is always closed.
    try:
        with open(filename, "rb") as handle:
            document = handle.read()
    except IOError:
        sys.exit("\nSorry, I can't find that file. Exiting...")

    # Parse the file, get a list of entries, and pair up their terms.
    try:
        pairs = pair_terms(parse_entries(document))
    except ValueError as error:
        sys.exit(str(error))

    # Dump into a text file.
    with codecs.open(OUTPUT_PATH, "w", "UTF-8") as dump_file:
        dump_file.writelines(format_lines(pairs))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment