Skip to content

Instantly share code, notes, and snippets.

@vchahun
Created October 18, 2011 16:56
Show Gist options
  • Save vchahun/1295938 to your computer and use it in GitHub Desktop.
Save vchahun/1295938 to your computer and use it in GitHub Desktop.
Parallel batch tokenization
#!/usr/bin/env bash
# Parallel batch tokenization.
#
# Usage: <script> myfine.fst inText outToks tempDir [--text]
#
# Splits inText into batches of $batchSize lines inside a fresh subdirectory
# of tempDir and defines the command strings that process() chains together
# to turn each batch into tokens.
set -e
set -o pipefail
myDir=$(dirname "$0")
if [[ $# != 4 && $# != 5 ]]; then
  echo >&2 "Usage: $0 myfine.fst inText outToks tempDir [--text]"
  echo >&2 " --text is for use with recitation lowercaser"
  exit 1
fi
fst=$1
inText=$2
outToks=$3
tmp=$4/batch.temporary.$RANDOM$RANDOM$RANDOM # let's minimize the probability of a collision
# Any non-empty fifth argument switches text mode on; the value itself is
# not inspected. ${5:-} keeps the test well-formed when the argument is absent.
if [[ -z ${5:-} ]]; then
  textMode=0
else
  textMode=1
fi
# Enabled only after optional-argument handling so that reading a missing $5
# above can never abort the script.
set -u
# Any exit before the trap is cleared at the end of the script is reported as
# an error.
trap "echo 'ERROR (see log above)'; exit 1" EXIT
batchSize=5000
mkdir -p "$tmp"
"$myDir/ascii-syms.py" > "$tmp/ascii.syms"
# NOTE: the strings below are expanded UNQUOTED by process() so that each one
# word-splits into a command plus its options. Consequently the script
# directory and tempDir must not contain whitespace or glob characters.
COMPILE="fstcompile --isymbols=$tmp/ascii.syms --osymbols=$tmp/ascii.syms"
ARCSORT_I="fstarcsort --sort_type=ilabel"
ARCSORT_O="fstarcsort --sort_type=olabel"
SHORTEST="fstshortestpath"
COMPOSE="fstcompose --connect=false"
TRIM="fstconnect"
TOPSORT="fsttopsort"
PRINT="fstprint --osymbols=$tmp/ascii.syms"
TXT2FST="$myDir/text2txtfst.py"
# Pick the final pipeline stage according to the output mode.
if [[ $textMode == 1 ]]; then
  FST2TOKS="$myDir/txtfst2text.py"
else
  FST2TOKS="$myDir/outtxtfst2tokens.py"
fi
# Produces $tmp/txt.aa, $tmp/txt.ab, ... with at most $batchSize lines each.
split -l "$batchSize" "$inText" "$tmp/txt."
#######################################
# Run one batch of text through the tokenization pipeline.
# Globals (read): TXT2FST, COMPILE, ARCSORT_O, COMPOSE, fst, TRIM, SHORTEST,
#                 TOPSORT, PRINT, FST2TOKS
# Arguments: $1 - path of the input batch file
#            $2 - path of the output file to (over)write
# Outputs:   a progress line on stderr; pipeline result written to $2
# Returns:   exit status of the pipeline
#######################################
process() {
  local input=$1
  local output=$2
  echo >&2 "Processing batch $input -> $output"
  # The stage variables are intentionally unquoted: each holds a command
  # followed by its options and must word-split. The file paths are quoted so
  # that names containing whitespace or glob characters survive intact.
  $TXT2FST < "$input" \
    | $COMPILE \
    | $ARCSORT_O \
    | $COMPOSE - "$fst" \
    | $TRIM \
    | $SHORTEST \
    | $TOPSORT \
    | $PRINT \
    | $FST2TOKS > "$output"
}
# Tokenize every batch in parallel, then concatenate the per-batch results in
# batch order into $outToks.
rm -f -- "$outToks"
: > "$outToks" # make sure the output exists even when the input was empty
pids=()
inputs=()
for file in "$tmp"/txt.*; do
  # With empty input split creates no files and the glob stays literal.
  [[ -e $file ]] || continue
  # Derive out.<suffix> from the basename only; rewriting the first "txt" in
  # the whole path would corrupt it when the temp directory contains "txt".
  process "$file" "$tmp/out.${file##*/txt.}" &
  pids+=("$!")
  inputs+=("$file")
done
# A bare `wait` always returns 0, which would hide failed batches; wait for
# each job individually so that a failure aborts the run.
failed=0
for ((i = 0; i < ${#pids[@]}; i++)); do
  if ! wait "${pids[i]}"; then
    echo >&2 "Batch ${inputs[i]} failed"
    failed=1
  fi
done
if [[ $failed != 0 ]]; then
  exit 1 # the EXIT trap is still armed and reports the error
fi
for file in "$tmp"/txt.*; do
  [[ -e $file ]] || continue
  cat -- "$tmp/out.${file##*/txt.}" >> "$outToks"
done
#rm -rf $tmp
# Success: disarm the error-reporting EXIT trap.
trap - EXIT
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment