Skip to content

Instantly share code, notes, and snippets.

@mayhewsw
Last active August 16, 2022 19:17
Show Gist options
  • Save mayhewsw/944907b968ead28f8e67 to your computer and use it in GitHub Desktop.
Save mayhewsw/944907b968ead28f8e67 to your computer and use it in GitHub Desktop.
Data Preparer for Bible Corpus
#!/bin/sh
# Run makeplaintext.py on every gzipped XML file in the current directory,
# pairing each one with English.xml.gz.
for f in *.xml.gz; do
    # When nothing matches, the shell leaves the pattern as a literal string;
    # skip it instead of handing a non-existent file to Python.
    [ -e "$f" ] || continue
    # English is always the second argument, so do not pair it with itself.
    # POSIX `[` compares strings with `=`; `==` is not portable under /bin/sh.
    if [ "$f" = "English.xml.gz" ]; then
        continue
    fi
    # Quote "$f" so file names containing spaces or glob characters survive.
    python makeplaintext.py "$f" English.xml.gz
done
#!/usr/bin/python
import xml.etree.ElementTree as ET
import gzip
import os
import sys
# This is for reading and processing files from Christos Christodoulopoulos' Bible corpus: http://christos-c.com/bible/
# The intent of this script is to prepare files to be aligned by Giza++ (https://github.com/moses-smt/giza-pp)
# This will intelligently select only those verses that exist in both versions of the Bible.
def _read_verses(path):
    """Parse a gzipped XML file and collect the text of its ``seg`` elements.

    Returns a ``(texts, ids)`` pair: ``texts`` maps each ``seg`` element's
    ``id`` attribute to its whitespace-stripped text, and ``ids`` lists those
    ids in document order.  ``seg`` elements without text are skipped, so an
    empty verse never reaches ``.strip()`` on ``None``.
    """
    with gzip.open(path, 'rb') as f:
        root = ET.fromstring(f.read())
    print(root)

    texts = {}
    ids = []
    for n in root.iter('seg'):
        if n.text is None:
            continue
        verse = n.text.strip()
        print("%s %s" % (n.attrib, verse))
        texts[n.attrib["id"]] = verse
        ids.append(n.attrib["id"])
    return texts, ids


def main(argv):
    """Write two line-aligned plain-text files from two gzipped XML Bibles.

    ``argv`` has the shape of ``sys.argv``: ``argv[1]`` and ``argv[2]`` are
    the paths of the two ``.xml.gz`` files.  A directory named
    ``<first>_<second>`` is created in the current working directory (an
    existing one is reused) holding ``<first>.txt`` and ``<second>.txt``.
    Line *i* of each file is the same verse id; only ids that have text in
    both inputs are written, in the order they appear in the second input.

    Returns 0 on success, 1 when too few arguments were given.
    """
    if len(argv) < 3:
        print("Usage: python makeplaintext.py L1.xml.gz L2.xml.gz")
        return 1

    a = argv[1]
    b = argv[2]

    # Take the name from the file's base name so that inputs such as
    # "./French.xml.gz" or "corpus/French.xml.gz" still yield "French".
    aname = os.path.basename(a).split(".")[0]
    bname = os.path.basename(b).split(".")[0]

    # mkdir aname_bname (reuse it when the script is run again)
    outdir = aname + "_" + bname
    if not os.path.isdir(outdir):
        os.mkdir(outdir)

    adict, _ = _read_verses(a)
    bdict, bverses = _read_verses(b)

    with open(os.path.join(outdir, aname + ".txt"), "wb") as aout:
        with open(os.path.join(outdir, bname + ".txt"), "wb") as bout:
            for bv in bverses:
                if bv in adict:
                    print(bv)
                    aout.write((adict[bv] + "\n").encode("UTF-8"))
                    bout.write((bdict[bv] + "\n").encode("UTF-8"))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
import os
from subprocess import call
# Location of GIZA++'s plain2snt binary; edit to match the local install.
exe="/path/to/giza-pp/GIZA++-v2/plain2snt.out"


def find_corpus_dirs(top="."):
    """Yield ``(directory, [file1, file2])`` for directories with two files.

    Walks *top* recursively and yields every directory that contains exactly
    two regular files.  The two names are sorted because os.walk reports
    them in arbitrary order, which would otherwise make the argument order
    passed to the executable vary between runs.
    """
    for root, dirs, files in os.walk(top):
        print("%s %s %s" % (root, dirs, files))
        if len(files) == 2:
            yield root, sorted(files)


def main():
    """Run ``exe`` on the two files of every matching directory under "."."""
    for root, files in find_corpus_dirs("."):
        print(root)
        # Run inside the directory via cwd= rather than os.chdir(root) and
        # os.chdir(".."): changing this process's working directory mid-walk
        # invalidates the relative paths os.walk yields, and ".." only
        # returns to the start when root is exactly one level deep.
        call([exe, files[0], files[1]], cwd=root)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment