for file in */*bleu.json
do
  echo "$file:"
  sed -n '/^\s*$/!{p;q}' "$file"
  echo "------"
done
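For the same check in Python, here is a minimal sketch; it just pretty-prints whatever each JSON file holds, since the key names inside the `*bleu.json` files aren't pinned down here:

```python
import glob
import json

# Print the contents of every */*bleu.json, mirroring the bash loop above.
for path in sorted(glob.glob("*/*bleu.json")):
    with open(path) as f:
        print(f"{path}: {json.load(f)}")
    print("------")
```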
EN-RO test BLEU (distil-mBART unless otherwise specified, before post-processing).
python finetune.py \
    --task summarization \
    --learning_rate=3e-4 \
    --do_train \
    --do_predict \
    --val_check_interval 0.25 --n_val 1000 \
    --data_dir xsum \
    --max_source_length 512 --max_target_length=56 \
    --freeze_embeds \
    --model_name_or_path google/pegasus-large
Based on this table: https://github.com/Helsinki-NLP/Tatoeba-Challenge/blob/master/models/released-models.txt and ISO mappings.
{'hf_name': 'zlw-zlw',
'source_languages': 'zlw',
'target_languages': 'zlw',
'opus_readme_url': 'https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/zlw-zlw/README.md',
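For reference, a throwaway sketch (the function name and pattern are mine, inferred from the record above) of how `opus_readme_url` follows mechanically from `hf_name`:

```python
def opus_readme_url(hf_name: str) -> str:
    # Purely illustrative: mirrors the URL pattern visible in the record above.
    return ("https://github.com/Helsinki-NLP/Tatoeba-Challenge/"
            f"tree/master/models/{hf_name}/README.md")

assert opus_readme_url("zlw-zlw").endswith("models/zlw-zlw/README.md")
```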
# three letter code -> (group/language name, {constituents...})
# If this language is on the target side, the constituents can be used as target language codes.
# If the language is on the source side, they are supported natively without special codes.
{'aav': ('Austro-Asiatic languages',
         {'hoc', 'hoc_Latn', 'kha', 'khm', 'khm_Latn', 'mnw', 'vie', 'vie_Hani'}),
 'afa': ('Afro-Asiatic languages',
         {'acm', 'afb', 'amh', 'apc', 'ara', 'arq', 'ary', 'arz', 'hau_Latn', 'heb', 'kab', 'mlt', 'rif_Latn', 'shy_Latn', 'som', 'thv', 'tir'}),
 'afr': ('Afrikaans', {'afr'}),
 'alv': ('Atlantic-Congo languages',
         {'ewe', 'fuc', 'fuv', 'ibo', 'kin', 'lin', 'lug', 'nya', 'run', 'sag', 'sna', 'swh', 'toi_Latn', 'tso', 'umb', 'wol', 'xho', 'yor', 'zul'}),
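A hedged sketch of how this mapping gets used (the name `GROUP_TO_MEMBERS` is mine; the real registry code may differ): given a group code on the target side, look up which constituent codes are valid target language codes.

```python
# Hypothetical name for the mapping excerpted above: group code -> (name, constituents).
GROUP_TO_MEMBERS = {
    'aav': ('Austro-Asiatic languages',
            {'hoc', 'hoc_Latn', 'kha', 'khm', 'khm_Latn', 'mnw', 'vie', 'vie_Hani'}),
    'afr': ('Afrikaans', {'afr'}),
}

def target_codes(group_code: str) -> set:
    """Constituents usable as target-side language codes (per the comment above)."""
    _name, members = GROUP_TO_MEMBERS[group_code]
    return members

print('vie' in target_codes('aav'))  # -> True
```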
(first, wget fairseq_wmt_enro.tgz from s3)
During training, fairseq passes mBART dynamically sized batches (up to 128 tokens) in a dict called `sample`, with the following relevant keys:
- `labels`: no bos, ends with `[2, tgt_lang_code]`
- `input_ids`: ends with `[2, 250004]`
- `decoder_input_ids`: starts with `250020`, ends with `2`. This is the `shift_tokens_right` version of the target (`labels`).
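A sketch of that shift (it matches the mBART-style `shift_tokens_right` used in transformers around this time; treat it as illustrative): the final non-pad token, the language code, wraps around to position 0.

```python
import torch

def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    """Wrap the last non-pad token (the language code) to the front:
    labels            = [..., 2, 250020]
    decoder_input_ids = [250020, ..., 2]
    """
    prev_output_tokens = input_ids.clone()
    # Index of the last non-pad token in each row (the lang code).
    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens
```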
Here are the logs from my breakpoint:
tar -czvf wmt16_en_ru.tgz wmt16_en_ru
# wmt16_en_ru/
# wmt16_en_ru/train.source
# wmt16_en_ru/train.target
# wmt16_en_ru/test.target
# wmt16_en_ru/test.source
# wmt16_en_ru/val.source
# wmt16_en_ru/val.target
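A quick sanity check (my own sketch) that the unpacked archive has the `{split}.source` / `{split}.target` layout with aligned line counts:

```python
from pathlib import Path

data_dir = Path("wmt16_en_ru")
for split in ("train", "val", "test"):
    src = (data_dir / f"{split}.source").read_text().splitlines()
    tgt = (data_dir / f"{split}.target").read_text().splitlines()
    # Parallel data: every source line must have a matching target line.
    assert len(src) == len(tgt), f"{split}: {len(src)} vs {len(tgt)} lines"
    print(split, len(src))
```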
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" | |
" Filename: .vimrc " | |
" Maintainer: Sam Shleifer <[email protected]> " | |
" URL: http://github.com/sshlefier/dotfiles " | |
" " | |
" " | |
" Sections: " | |
" 01. Plugins ................. using vundle " | |
" 02. python .................. General autocmd events " | |
" 03. Vim options ............ Colors, fonts, etc. " |
export langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN
export CC25=/Users/shleifer/cc25_pretrain
export outfile=pred_en_ro.txt
export PRETRAIN=$CC25/model.pt
fairseq-generate tmp/ --path $PRETRAIN \
    --task translation_from_pretrained_bart -t en_XX -s ro_RO --bpe 'sentencepiece' \
    --sentencepiece-vocab $CC25/sentence.bpe.model --sacrebleu --remove-bpe 'sentencepiece' \
    --max-sentences 32 --langs $langs --beam 5 > $outfile
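To rescore $outfile offline, a sketch assuming the usual fairseq-generate output format (hypotheses on tab-separated `H-<id>` lines) and a hypothetical reference file `tmp/test.ro`:

```python
import sacrebleu

# Collect hypotheses from fairseq-generate output: lines look like "H-<id>\t<score>\t<text>".
hyps = {}
with open("pred_en_ro.txt") as f:
    for line in f:
        if line.startswith("H-"):
            idx, _score, text = line.rstrip("\n").split("\t", 2)
            hyps[int(idx[2:])] = text

hypotheses = [hyps[i] for i in sorted(hyps)]
refs = [l.strip() for l in open("tmp/test.ro")]  # hypothetical reference path
print(sacrebleu.corpus_bleu(hypotheses, [refs]).score)
```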