# Tokenize text with Stanford CoreNLP's PTBTokenizer.
#   $1: input file — a list of files to tokenize (consumed via -ioFileList)
#   $2: output file for the tokenized result
ptb_tokenize () {
    # Quote both arguments so paths containing spaces survive word splitting,
    # and redirect stdin directly instead of the useless `cat "$1" | ...`.
    java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines < "$1" > "$2"
}
# Install a Java runtime — required to run the CoreNLP tokenizer class above.
sudo apt install openjdk-8-jre-headless
# Ant build tool — presumably only needed if rebuilding CoreNLP from source.
sudo apt-get install ant
# Download and unpack Stanford CoreNLP (provides edu.stanford.nlp.process.PTBTokenizer).
wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
unzip stanford-corenlp-full-2018-10-05.zip
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# by stas00 and sshleifer
# Select a WMT19 translation dataset for the ru->en language pair using the
# HuggingFace `nlp` library. (Trailing "| |" table-extraction artifacts that
# made this snippet invalid Python have been removed.)
import nlp
from tqdm import tqdm

dataset = 'wmt19'    # WMT shared-task dataset name
s = 'ru'             # source language code
t = 'en'             # target language code
pair = f'{s}-{t}'    # language-pair identifier, e.g. 'ru-en'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| model | batch_size | sequence_length | MB |
|---|---|---|---|
| t5-large | 1 | 128 | 6558 |
| t5-large | 1 | 512 | 9568 |
| t5-large | 1 | 1024 | 23124 |
| facebook/bart-large | 1 | 128 | 3758 |
| facebook/bart-large | 1 | 512 | 4670 |
| facebook/bart-large | 1 | 1024 | 8888 |
| t5-base | 1 | 128 | 2242 |
| t5-base | 1 | 512 | 3776 |
| t5-base | 1 | 1024 | 9056 |
Workflow:
- git: pre-commit hook to check style

Cleanup:
- gpt and gpt2 attention are very similar besides caching
  - fine to just add caching to GPT

Infra:
- test coverage for t5 causal mask
- add test to ModelTesterMixin that loss doesn't change if pad tokens are introduced
- save_pretrained: should mkdir if save_path doesn't exist
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import BartConfig, BartForConditionalGeneration, BartTokenizer | |
from torch import nn | |
from typing import List | |
layers_to_copy = { # maps # layers in student -> which teacher layers to copy | |
6: [0, 2, 4, 7, 9, 11], | |
1: [11], | |
3: [0, 6, 11], | |
2: [0, 11], | |
4: [0, 4, 8, 11], | |
9: [0, 1, 2, 4, 5, 7, 9, 10, 11], |
- everything must go under bert besides datasets
- put random models under your own namespace, like `sshleifer/bart-tiny-random`
- use the `--dry-run` command-line arg
- [FIXME] You can log in to the portal and do things manually at this URL with your kibana creds (??)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
- for mention in tweet.grep('@'): assert twitter.get(mention) == expected_person
- assert photo has tags
- if thread: numbers make sense or down emoji
- all links work
- read it over once more
See Model List, Docs
en_tweet = ["Today we are releasing 1,008 machine translation models, covering combinations of 140 different languages.",
"They were trained by @jorgtiedemann with @marian, and converted by @sam_shleifer.",
"Find your language here"]
en-da: I dag frigiver vi 1.008 maskinoversættelsesmodeller, der dækker kombinationer af 140 forskellige sprog. De blev uddannet af @jorgtiedemann med @marian, og konverteret af @sam_shleifer. Find dit sprog her:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
GROUP_MEMBERS = { | |
'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'], | |
'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', | |
'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', | |
'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'], | |
'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'], | |
'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'], | |
'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'], | |
'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'], | |
'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv'] |