# Build the Finnish and Russian Wikipedia corpora, train a 100-dimensional
# fastText skipgram model on each, and normalise the vectors with vecmap.
#
# Working directories: the dump is fetched and extracted inside wiki/
# (WikiExtractor is referenced as ../wikiextractor and the training steps read
# wiki/wiki_*_clean.txt), while fastText and vecmap are run from the project
# root. Each download/extract step therefore runs in a subshell that cd's
# into wiki/, so the caller's working directory is left untouched.
#
# Cleaning pipeline, per language:
#   1. concatenate every extracted file (find -exec copes with any file name)
#   2. strip <...> markup and the characters " « » ( )
#   3. pad every punctuation character with spaces and lowercase the text;
#      lowercasing is done in perl because GNU tr only handles single-byte
#      characters and would leave Cyrillic and Ä/Ö/Å in upper case
#   4. drop empty lines
# The corpus is written with '>' so that a re-run replaces it instead of
# appending a second copy.
mkdir -p wiki data/norm

# --- Finnish ---
(
  cd wiki || exit 1
  wget https://dumps.wikimedia.org/fiwiki/20170701/fiwiki-20170701-pages-articles-multistream.xml.bz2
  bzip2 -d fiwiki-20170701-pages-articles-multistream.xml.bz2
  python ../wikiextractor/WikiExtractor.py fiwiki-20170701-pages-articles-multistream.xml -o fiwiki-20170701
  find ./fiwiki-20170701 -type f -exec cat {} + \
    | sed 's/<.*>//g' \
    | sed 's/["«»()]//g' \
    | perl -CSAD -pe 's/ ?(\p{P}) ?/ $1 /g; $_ = lc' \
    | sed '/^$/d' \
    > wiki_fin_clean.txt
)
fastText/fasttext skipgram -input wiki/wiki_fin_clean.txt -dim 100 -output data/model_fin_wiki
python3 vecmap/normalize_embeddings.py unit center -i data/model_fin_wiki.vec -o data/norm/model_fin_100_norm.vec

# --- Russian ---
(
  cd wiki || exit 1
  wget https://dumps.wikimedia.org/ruwiki/20170701/ruwiki-20170701-pages-articles-multistream.xml.bz2
  bzip2 -d ruwiki-20170701-pages-articles-multistream.xml.bz2
  python ../wikiextractor/WikiExtractor.py ruwiki-20170701-pages-articles-multistream.xml -o ruwiki-20170701
  find ./ruwiki-20170701 -type f -exec cat {} + \
    | sed 's/<.*>//g' \
    | sed 's/["«»()]//g' \
    | perl -CSAD -pe 's/ ?(\p{P}) ?/ $1 /g; $_ = lc' \
    | sed '/^$/d' \
    > wiki_rus_clean.txt
)
fastText/fasttext skipgram -input wiki/wiki_rus_clean.txt -dim 100 -output data/model_rus_wiki
python3 vecmap/normalize_embeddings.py unit center -i data/model_rus_wiki.vec -o data/norm/model_rus_100_norm.vec
# Build the Komi (kpv) training corpus from kpv-lit/txt/*txt and train fastText
# skipgram models on it: a 300-dimensional one (data/model_kpv) and a
# 100-dimensional one (data/model_kpv_100). Only the 100-dimensional vectors
# are normalised here.
# NOTE(review): later steps read data/norm/model_kpv_norm.vec, which is not
# produced by any command in this section — confirm where it is created.
#
# Lines removed from the raw text (one sed process, same patterns as before):
#   /^<.*$/          lines starting with "<"
#   /^#$/            lines consisting of a single "#"
#   /^[A-Za-z ]*$/   lines made up only of Latin letters and spaces
#                    (this also removes empty lines)
#   /\.\.\.\./       lines containing four consecutive dots
#   /…\./            lines containing an ellipsis character followed by a dot
#   /lab/            lines containing "lab"
#
# Punctuation is then padded with spaces and the text is lowercased. The
# lowercasing is done in perl rather than tr: GNU tr only handles single-byte
# characters, so it would leave the Cyrillic text in upper case.
mkdir -p data/norm

cat kpv-lit/txt/*txt \
  | sed -e '/^<.*$/d' \
        -e '/^#$/d' \
        -e '/^[A-Za-z ]*$/d' \
        -e '/^.*\.\.\.\..*$/d' \
        -e '/^.*…\..*$/d' \
        -e '/^.*lab.*$/d' \
  | perl -CSAD -pe 's/ ?(\p{P}) ?/ $1 /g; $_ = lc' \
  > data/kpv-training-corpus.txt

fastText/fasttext skipgram -input data/kpv-training-corpus.txt -dim 300 -output data/model_kpv
fastText/fasttext skipgram -input data/kpv-training-corpus.txt -dim 100 -output data/model_kpv_100
python3 vecmap/normalize_embeddings.py unit center -i data/model_kpv_100.vec -o data/norm/model_kpv_100_norm.vec
At this point we have three comparable models: model_kpv_100_norm.vec, model_fin_100_norm.vec and model_rus_100_norm.vec. As these are all 100-dimensional, they are small enough that it is possible to combine them into one larger model. We use two modified versions of the project_embeddings.py script: the first variant writes both the Russian and the Komi embeddings into one file, whereas the second variant writes only the Finnish source embeddings, skipping the Komi target, which is already present from the first run.
# Project the normalised 100-dimensional embeddings with the two modified
# vecmap scripts. Both runs pass the Komi model as the second positional
# argument; -d names a .tsv file (presumably the bilingual dictionary — confirm
# against the scripts) and -o the output .vec file.

# Russian + Komi -> data/multimodel/rus_kpv_mm.vec
python3 vecmap/project_embeddings_rus_kpv.py --orthogonal \
  data/norm/model_rus_100_norm.vec \
  data/norm/model_kpv_100_norm.vec \
  -d data/multimodel/rus_kpv_mm.tsv \
  -o data/multimodel/rus_kpv_mm.vec

# Finnish -> data/multimodel/fin_mm.vec
python3 vecmap/project_embeddings_fin_kpv.py --orthogonal \
  data/norm/model_fin_100_norm.vec \
  data/norm/model_kpv_100_norm.vec \
  -d data/multimodel/fin_kpv_mm.tsv \
  -o data/multimodel/fin_mm.vec
At this point the models can already be concatenated.
# Concatenate the two projected models into data/multimodel/multimodel.vec.
# The inputs are listed explicitly (in the order the *.vec glob produced on a
# first run) and the output is written with '>': with the glob and '>>', a
# re-run would match multimodel.vec itself and append a second copy of the
# data to it.
cat data/multimodel/fin_mm.vec data/multimodel/rus_kpv_mm.vec > data/multimodel/multimodel.vec
The model now has three separate headers within the file, so these have to be corrected.
# Repair the header of the concatenated 100-dimensional model.
# A header line has the form "<rows> <dim>" (the form written back below), so
# in this file the stray headers end in " 100"; the previous pattern " 300"
# could not match them.
vec=data/multimodel/multimodel.vec

# Drop every header line except the very first line of the file.
sed -i '1!{/ 100$/d;}' "$vec"

# The number of vectors is the line count minus the one remaining header.
# It is computed instead of being copied by hand from the wc output; the
# original run reported 2017885 lines, i.e. 2017884 vectors.
rows=$(( $(wc -l < "$vec") - 1 ))
sed -i "1s/.*/${rows} 100/" "$vec"
# Same projection as for the 100-dimensional models, now writing into
# data/multimodel-large/. The .tsv files passed with -d are the ones from
# data/multimodel/.
# NOTE(review): the data/norm/model_{rus,fin,kpv}_norm.vec inputs are not
# created by any command visible in this section — confirm where they come from.

# Russian + Komi -> data/multimodel-large/rus_kpv_large.vec
python3.6 vecmap/project_embeddings_rus_kpv.py --orthogonal \
  data/norm/model_rus_norm.vec \
  data/norm/model_kpv_norm.vec \
  -d data/multimodel/rus_kpv_mm.tsv \
  -o data/multimodel-large/rus_kpv_large.vec

# Finnish -> data/multimodel-large/fin_large.vec
python3.6 vecmap/project_embeddings_fin_kpv.py --orthogonal \
  data/norm/model_fin_norm.vec \
  data/norm/model_kpv_norm.vec \
  -d data/multimodel/fin_kpv_mm.tsv \
  -o data/multimodel-large/fin_large.vec
# Concatenate the two large projected models into multimodel-large.vec.
# Inputs are named explicitly (same order the *.vec glob gave on a first run)
# and the output is truncated with '>': the glob plus '>>' would pick up
# multimodel-large.vec itself on a re-run and append duplicate data to it.
cat data/multimodel-large/fin_large.vec data/multimodel-large/rus_kpv_large.vec > data/multimodel-large/multimodel-large.vec
# Repair the header of the concatenated 300-dimensional model.
vec=data/multimodel-large/multimodel-large.vec

# Drop every "<rows> 300" header line except the very first line of the file.
sed -i '1!{/ 300$/d;}' "$vec"

# The number of vectors is the line count minus the one remaining header.
# It is computed instead of being copied by hand from the wc output; the
# original run reported 2635250 lines, i.e. 2635249 vectors.
rows=$(( $(wc -l < "$vec") - 1 ))
sed -i "1s/.*/${rows} 300/" "$vec"
# Run the project_embeddings_fin_kpv_new.py variant on the Finnish and Komi
# models, reusing the Finnish–Komi .tsv file from data/multimodel/, and write
# the result to data/fin_kpv_new.vec.
python3.6 vecmap/project_embeddings_fin_kpv_new.py --orthogonal \
  data/norm/model_fin_norm.vec \
  data/norm/model_kpv_norm.vec \
  -d data/multimodel/fin_kpv_mm.tsv \
  -o data/fin_kpv_new.vec
# Repair the header of data/fin_kpv_new.vec.
vec=data/fin_kpv_new.vec

# Drop every "<rows> 300" header line except the very first line of the file.
sed -i '1!{/ 300$/d;}' "$vec"

# The number of vectors is the line count minus the one remaining header.
# It is computed instead of being copied by hand from the wc output; the
# original run reported 746827 lines, i.e. 746826 vectors. (That output is
# kept only in this comment: as a bare "> 746827 ..." line it would be
# executed as a redirection.)
rows=$(( $(wc -l < "$vec") - 1 ))
sed -i "1s/.*/${rows} 300/" "$vec"