JGVerdugo · April 19, 2013 23:02
diff --git a/extract-tei.py b/extract-tei.py
 """Extracts a TEI bilingual vocabulary to a term-tab-term plain-text structure"""

 """Note: this script uses the BeautifulSoup library for TEI parsing."""
 """See http://www.crummy.com/software/BeautifulSoup/bs4 for details."""

 from bs4 import BeautifulSoup
 import sys
 import codecs

 # Command-line handling: one positional argument (the TEI file) is required.
 if len(sys.argv) < 2:
     print("\n    Usage: extract-tei.py filename\n")
     sys.exit(1)  # non-zero status so a calling shell can detect the misuse

 filename = sys.argv[1]

 # Basic error control

 try:
     # Read the raw bytes and let the parser work out the encoding; the
     # `with` block guarantees the handle is closed even if read() fails.
     with open(filename, "rb") as inputFile:
         document = inputFile.read()
 except IOError:
     print("\nSorry, I can't find that file. Exiting...")
     sys.exit(1)

 # Parse the file, get a list of entries
 # NOTE(review): no parser is named here, so bs4 picks one on its own;
 # confirm the chosen parser handles this TEI input as intended.

 soup = BeautifulSoup(document)
 listOfEntries = soup.find_all('entry')

 # Processing entries: collect (source term, target term) tuples
 pairsOfTerms = []

 for element in listOfEntries:

     source = element.find_all('orth')
     targets = element.find_all('quote')

     # An entry without any <orth> has no source term to pair with, and
     # indexing source[0] below would raise IndexError; skip such entries.
     if not source:
         continue

     # Check if more than 1 source (there are none, but just in case)
     if len(source) > 1:
         print(source[0].text + " <<<<<<< This entry has more than 1 source!!")
         sys.exit(1)

     # Match each target with its parent source
     sourceTerm = source[0].text
     pairsOfTerms.extend((sourceTerm, target.text) for target in targets)

 # Dump into a text file, one "source<TAB>target" pair per line

 with codecs.open("dump", "w", "UTF-8") as dumpFile:
     # set() drops duplicate pairs (there are many!); sorted() makes the
     # output order deterministic.
     for pair in sorted(set(pairsOfTerms)):
         dumpFile.write(pair[0] + "\t" + pair[1] + "\n")
	"""Extracts a TEI bilingual vocabulary to a term-tab-term plain-text structure"""

	"""Note: this script uses the BeautifulSoup library for TEI parsing."""
	"""See http://www.crummy.com/software/BeautifulSoup/bs4 for details."""

	from bs4 import BeautifulSoup
	import sys
	import codecs

	# Command-line handling: one positional argument (the TEI file) is required.
	if len(sys.argv) < 2:
	    print("\n Usage: extract-tei.py filename\n")
	    sys.exit(1)  # non-zero status so a calling shell can detect the misuse

	filename = sys.argv[1]

	# Basic error control

	try:
	    # Read the raw bytes and let the parser work out the encoding; the
	    # `with` block guarantees the handle is closed even if read() fails.
	    with open(filename, "rb") as inputFile:
	        document = inputFile.read()
	except IOError:
	    print("\nSorry, I can't find that file. Exiting...")
	    sys.exit(1)

	# Parse the file, get a list of entries
	# NOTE(review): no parser is named here, so bs4 picks one on its own;
	# confirm the chosen parser handles this TEI input as intended.

	soup = BeautifulSoup(document)
	listOfEntries = soup.find_all('entry')

	# Processing entries: collect (source term, target term) tuples
	pairsOfTerms = []

	for element in listOfEntries:

	    source = element.find_all('orth')
	    targets = element.find_all('quote')

	    # An entry without any <orth> has no source term to pair with, and
	    # indexing source[0] below would raise IndexError; skip such entries.
	    if not source:
	        continue

	    # Check if more than 1 source (there are none, but just in case)
	    if len(source) > 1:
	        print(source[0].text + " <<<<<<< This entry has more than 1 source!!")
	        sys.exit(1)

	    # Match each target with its parent source
	    sourceTerm = source[0].text
	    pairsOfTerms.extend((sourceTerm, target.text) for target in targets)

	# Dump into a text file, one "source<TAB>target" pair per line

	with codecs.open("dump", "w", "UTF-8") as dumpFile:
	    # set() drops duplicate pairs (there are many!); sorted() makes the
	    # output order deterministic.
	    for pair in sorted(set(pairsOfTerms)):
	        dumpFile.write(pair[0] + "\t" + pair[1] + "\n")