Last active
August 16, 2022 19:17
-
-
Save mayhewsw/944907b968ead28f8e67 to your computer and use it in GitHub Desktop.
Data Preparer for Bible Corpus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Run makeplaintext.py on every gzipped XML file in the current directory,
# pairing each one with English.xml.gz.
for f in *.xml.gz; do
    # When nothing matches, the shell leaves the pattern as a literal string;
    # skip it instead of handing a non-existent file to Python.
    [ -e "$f" ] || continue
    # English is the fixed second argument, so never pair it with itself.
    # POSIX `[` only defines `=` for string equality (`==` is a bashism).
    if [ "$f" = "English.xml.gz" ]; then
        continue
    fi
    python makeplaintext.py "$f" English.xml.gz
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Prepare parallel plain-text files from two Bible corpus XML files.

This is for reading and processing files from Christos Christodoulopoulos'
Bible corpus: http://christos-c.com/bible/

The intent of this script is to prepare files to be aligned by Giza++
(https://github.com/moses-smt/giza-pp). Only verses that exist in both
versions of the Bible are written, one verse per line, so that line N of
one output file corresponds to line N of the other.

Usage: python makeplaintext.py L1.xml.gz L2.xml.gz

Output goes to the directory "<L1>_<L2>" as "<L1>.txt" and "<L2>.txt".
"""
import xml.etree.ElementTree as ET
import gzip
import os
import sys


def read_verses(path):
    """Read every <seg> element of a gzipped XML file.

    Args:
        path: path of the gzipped XML file.

    Returns:
        A tuple (ids, texts). ids lists the "id" attribute of each segment
        in document order; texts maps each id to its stripped text.
        Segments without text are left out of both.
    """
    with gzip.open(path, 'rb') as f:
        root = ET.fromstring(f.read())
    print(root)

    ids = []
    texts = {}
    for seg in root.iter('seg'):
        # An empty <seg/> has text None; skipping it here keeps it out of
        # the output, because only ids present in both files are written.
        if seg.text is None:
            continue
        verse_id = seg.attrib["id"]
        text = seg.text.strip()
        print("%s %s" % (seg.attrib, text))
        texts[verse_id] = text
        ids.append(verse_id)
    return ids, texts


def main(argv=None):
    """Write the verses shared by two corpus files as aligned text files.

    Args:
        argv: argument list laid out like sys.argv (program name first);
            defaults to sys.argv.

    Returns:
        0 on success, 1 when fewer than two input files were given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print("Usage: python makeplaintext.py L1.xml.gz L2.xml.gz")
        return 1

    a = argv[1]
    b = argv[2]

    # Derive the names from the file name only, so inputs given with a
    # directory part (e.g. "./French.xml.gz") still yield "French".
    aname = os.path.basename(a).split(".")[0]
    bname = os.path.basename(b).split(".")[0]

    _, adict = read_verses(a)
    bverses, bdict = read_verses(b)

    # mkdir aname_bname (tolerating a directory left by an earlier run)
    outdir = aname + "_" + bname
    if not os.path.isdir(outdir):
        os.mkdir(outdir)

    with open(os.path.join(outdir, aname + ".txt"), "wb") as aout:
        with open(os.path.join(outdir, bname + ".txt"), "wb") as bout:
            # Follow the verse order of the second file and keep only the
            # verses the first file also has.
            for bv in bverses:
                if bv in adict:
                    print(bv)
                    aout.write((adict[bv] + "\n").encode("UTF-8"))
                    bout.write((bdict[bv] + "\n").encode("UTF-8"))
    return 0


if __name__ == "__main__":
    sys.exit(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Run GIZA++'s plain2snt on every directory that holds exactly two files."""
import os
from subprocess import call

exe = "/path/to/giza-pp/GIZA++-v2/plain2snt.out"


def main():
    """Walk the tree under "." and call `exe` on each two-file directory."""
    for root, dirs, files in os.walk("."):
        print("%s %s %s" % (root, dirs, files))
        if len(files) == 2:
            print(root)
            # Sort so the argument order does not depend on listing order.
            first, second = sorted(files)
            # Run the tool inside the directory via cwd= rather than
            # os.chdir(): os.walk(".") yields relative paths, so changing
            # the process's working directory mid-walk breaks the
            # traversal for any directory not exactly one level deep.
            call([exe, first, second], cwd=root)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment