Last active
November 16, 2020 14:51
-
-
Save ikegami-yukino/6e16bcd404e6005cddc6 to your computer and use it in GitHub Desktop.
Tutorial on Machine Translation for Mac OS X Mountain Lion
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Tutorial script: builds Moses and MGIZA under ~/smt, prepares the Tanaka
# example corpus, trains a 5-gram Japanese language model with KenLM's lmplz
# and then trains a translation model (--f en, --e ja) with train-model.perl.
#
# Optional environment overrides:
#   BOOST_ROOT        Boost install prefix (default: /usr/local/Cellar/boost/1.57.0)
#   BOOST_BUILD_PATH  boost-build location (default: /usr/local/share/boost-build)
#   JOBS              parallel build jobs and MGIZA CPUs (default: 3)
#
# Requires on PATH: wget, unzip, cmake, make, 2to3, python, mecab, gzip.

# Stop at the first failing step; every later step depends on the earlier ones.
set -euo pipefail

readonly SMT_DIR="$HOME/smt"
# The GitHub archive unpacks to "mosesdecoder-master"; it is renamed to this
# path below so that every later reference to the Moses tree resolves.
readonly MOSES_DIR="$SMT_DIR/mosesdecoder"
readonly MGIZA_DIR="$SMT_DIR/mgiza-master"
readonly CORPUS_DIR="$SMT_DIR/corpus"
readonly JOBS="${JOBS:-3}" # number of CPU cores to use

mkdir -p "$SMT_DIR"
cd "$SMT_DIR"

# Install Moses
export BOOST_ROOT="${BOOST_ROOT:-/usr/local/Cellar/boost/1.57.0}"
export BOOST_BUILD_PATH="${BOOST_BUILD_PATH:-/usr/local/share/boost-build}"

# Expose the multi-threaded Boost.Thread libraries under their unsuffixed
# names, and provide a lib64 alias for the lib directory.
# -f and the existence guard keep these steps safe to re-run.
ln -f "$BOOST_ROOT/lib/libboost_thread-mt.a" "$BOOST_ROOT/lib/libboost_thread.a"
ln -f "$BOOST_ROOT/lib/libboost_thread-mt.dylib" "$BOOST_ROOT/lib/libboost_thread.dylib"
[[ -e "$BOOST_ROOT/lib64" ]] || ln -s "$BOOST_ROOT/lib" "$BOOST_ROOT/lib64"

if [[ ! -d "$MOSES_DIR" ]]; then
  # Use a distinct archive name: the MGIZA archive is also called master.zip.
  wget -O moses-master.zip https://github.com/moses-smt/mosesdecoder/archive/master.zip
  unzip moses-master.zip
  rm moses-master.zip
  mv mosesdecoder-master "$MOSES_DIR"
fi

cd "$MOSES_DIR"
./bjam --libdir=/usr/local/lib --link=shared -j "$JOBS"

# Build the Python bindings in a subshell so the working directory of the
# rest of the script is not affected.
(
  cd contrib/python
  2to3 -w setup.py
  python setup.py build_ext -i --moses-lib=/usr/local/lib
)

# Build MGiza
# Go back to the top-level directory first, so MGIZA is not unpacked inside
# the Moses source tree.
cd "$SMT_DIR"
if [[ ! -d "$MGIZA_DIR" ]]; then
  wget -O mgiza-master.zip https://github.com/moses-smt/mgiza/archive/master.zip
  unzip mgiza-master.zip
  rm mgiza-master.zip
fi
cd "$MGIZA_DIR/mgizapp"
cmake .
make

# Copy to moses dir.
# NOTE(review): the binaries go to training-tools/mgizapp while
# --external-bin-dir below points at training-tools; confirm that
# train-model.perl finds the mgiza executables with this layout.
mkdir -p "$MOSES_DIR/training-tools/mgizapp"
cp bin/* "$MOSES_DIR/training-tools/mgizapp"
cp scripts/merge_alignment.py "$MOSES_DIR/training-tools/"

# Utilize corpus
mkdir -p "$CORPUS_DIR"
cd "$CORPUS_DIR"
wget -O examples.utf.gz ftp://ftp.monash.edu.au/pub/nihongo/examples.utf.gz

# Lines starting with "A:" hold the sentence pairs: field 1 is the Japanese
# side (segmented into words with MeCab), field 2 is the English side with
# the trailing "#..." annotation stripped.
gzip -dc examples.utf.gz \
  | grep '^A:' \
  | cut -f1 \
  | sed 's/^A: //' \
  | mecab -Owakati >tanaka.ja
gzip -dc examples.utf.gz \
  | grep '^A:' \
  | cut -f2 \
  | sed 's/#.*$//' >tanaka.en

# Generate language model by KenLM
"$MOSES_DIR/bin/lmplz" -o 5 -S 80% -T /tmp <tanaka.ja >tanaka.ja.arpa

# Generate translation model
# Absolute paths are used so the command does not depend on the current
# directory (the script is still inside the corpus directory at this point).
"$MOSES_DIR/scripts/training/train-model.perl" \
  --root-dir "$SMT_DIR" \
  --corpus "$CORPUS_DIR/tanaka" \
  --f en \
  --e ja \
  --lm "0:5:$CORPUS_DIR/tanaka.ja.arpa" \
  --external-bin-dir "$MOSES_DIR/training-tools" \
  -mgiza \
  -mgiza-cpus "$JOBS"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment