Created
January 15, 2016 20:38
-
-
Save bartvm/4fcc6229594a35d728c0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prepare the IWSLT15 $SOURCE-$TARGET parallel corpus with the Moses scripts:
# unpack the archive, drop XML tag lines, tokenize, truecase, clean, shuffle
# and build one vocabulary file per language.
#
# Requirements:
#   - a Moses checkout at ./mosesdecoder (relative to the starting directory)
#   - IWSLT15/$SOURCE-$TARGET.tgz already present (the wget below is disabled)
#   - GNU coreutils (nproc, shuf, dd with status=none)

# Stop at the first failing command or unset variable instead of carrying on
# with missing or truncated intermediate files.
set -eu
# pipefail is not available in every sh; enable it only where supported.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

SOURCE=en
TARGET=fr
# Absolute path, resolved before any cd below.
SCRIPTS="$(pwd)/mosesdecoder/scripts"
pair="$SOURCE-$TARGET"
archive="$pair.tgz"
# Common prefix of every generated file (train.<src>-<tgt>.<stage>.<lang>).
base="train.$pair"

# Create the data directory
mkdir -p IWSLT15
cd IWSLT15

# Download and unpack the data
# wget "https://wit3.fbk.eu/archive/2015-01/texts/$SOURCE/$TARGET/$SOURCE-$TARGET.tgz"
if [ ! -f "$archive" ]; then
  printf 'error: %s/%s not found; download it first\n' "$(pwd)" "$archive" >&2
  exit 1
fi
tar xvfz "$archive"
cd "$pair"

for lang in "$SOURCE" "$TARGET"; do
  # Strip the XML tags (drops every line that starts with "<")
  grep -v "^<" "train.tags.$pair.$lang" > "$base.$lang"
  # Tokenize
  "$SCRIPTS/tokenizer/tokenizer.perl" -threads "$(nproc)" -l "$lang" \
    < "$base.$lang" > "$base.tok.$lang"
  # Truecase: train a model on the tokenized text, then apply it
  "$SCRIPTS/recaser/train-truecaser.perl" \
    --model "truecase-model.$lang" --corpus "$base.tok.$lang"
  "$SCRIPTS/recaser/truecase.perl" --model "truecase-model.$lang" \
    < "$base.tok.$lang" > "$base.true.$lang"
done

# Clean (the trailing "1 80" are the length limits given to clean-corpus-n.perl)
"$SCRIPTS/training/clean-corpus-n.perl" \
  "$base.true" "$SOURCE" "$TARGET" "$base.clean" 1 80

# Shuffle
# Both shuf calls read the same random bytes; this is meant to apply the same
# permutation to both sides so sentence pairs stay aligned, which relies on
# the two cleaned files having the same number of lines.
# NOTE(review): count=1024 with dd's default 512-byte blocks gives 512 KiB of
# random data; shuf reports an error if that is not enough, so raise count
# for larger corpora.
rand_file=$(mktemp)
trap 'rm -f -- "$rand_file"' EXIT
dd if=/dev/urandom of="$rand_file" count=1024 status=none
shuf --random-source="$rand_file" "$base.clean.$SOURCE" > "$base.shuff.$SOURCE"
shuf --random-source="$rand_file" "$base.clean.$TARGET" > "$base.shuff.$TARGET"

# Create vocabularies (to output counts, add $1 to awk)
# One token per line, counted, then ordered by descending frequency.
# LC_ALL=C makes sort/uniq compare raw bytes, so the result does not depend on
# the caller's locale.
for lang in "$SOURCE" "$TARGET"; do
  tr "[:blank:]" "\n" < "$base.clean.$lang" \
    | LC_ALL=C sort \
    | LC_ALL=C uniq -c \
    | LC_ALL=C sort -k 1nr \
    | awk '{print $2}' > "$base.vocab.$lang"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment