Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save bfarzin/b8adf1f2f0e85a4f898c51b5e68f1ada to your computer and use it in GitHub Desktop.
Save bfarzin/b8adf1f2f0e85a4f898c51b5e68f1ada to your computer and use it in GitHub Desktop.
IMDB Data Example SpaCy and SentencePiece
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from fastai import *\n",
"from fastai.text import *\n",
"import re\n",
"import sentencepiece as spm #https://github.com/google/sentencepiece\n",
"\n",
"np.random.seed(20180311)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sentencepiece tokenizer & LM Build"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data processing to DataBunch"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get IMDB Data and build DF"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/home/farzin/.fastai/data/imdb')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMDB_PATH = untar_data(URLs.IMDB)\n",
"IMDB_PATH"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(75000, 25000)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## raw data does not have the unsup in the train directory. Just move it there\n",
"# ~/.fastai/data/imdb$ mv unsup/ ./train/\n",
"CLASSES = ['neg', 'pos', 'unsup']\n",
"\n",
"def get_texts(path):\n",
"    '''Collect the text of every file under `path/<label>` for each label in CLASSES.\n",
"\n",
"    Returns `(texts, labels)` as numpy arrays; every label is `[0]` because the\n",
"    texts are only used for the (unsupervised) language-model step.\n",
"    '''\n",
"    texts,labels = [],[]\n",
"    for label in CLASSES:\n",
"        for fname in (path/label).glob('*.*'):\n",
"            # read_text opens and closes the file, so no handle is left open\n",
"            texts.append(fname.read_text(encoding='utf8'))\n",
"            labels.append([0]) ## all unsupervised case\n",
"\n",
"    return np.array(texts),np.array(labels)\n",
"\n",
"train_texts,train_labels = get_texts(IMDB_PATH/'train')\n",
"valid_texts,valid_labels = get_texts(IMDB_PATH/'test')\n",
"\n",
"len(train_texts),len(valid_texts)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## we can combine the \"train\" and \"valid\" for the LM step\n",
"all_texts = np.concatenate([train_texts,valid_texts])\n",
"all_texts_df = pd.DataFrame({'text':all_texts, 'labels':[0]*len(all_texts)}, columns=['labels','text'])"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"# start from the raw review text column\n",
"raw_text = all_texts_df.loc[:,'text']\n",
"\n",
"print(\"Default Rules:\\n\",[x.__name__ for x in defaults.text_pre_rules],\"\\n\\n\")\n",
"\n",
"# apply each of fastai's default pre-rules in turn to every text (cast to str first)\n",
"for rule in defaults.text_pre_rules:\n",
" print(rule)\n",
" raw_text = raw_text.apply(lambda x: rule(str(x)))"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## SentencePiece expects <s> ... </s> so we add that here:\n",
"all_texts_df['new_text'] = '<s>' + raw_text + '</s>'\n",
"all_texts_df['new_text'].head()"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## save as a file for SP\n",
"# the `new_text` column is written without header or index; this file is the --input of the SP trainer\n",
"# NOTE(review): quotechar=' ' looks like a trick to stop to_csv from wrapping texts in double quotes -- confirm the written lines are what SentencePiece should see\n",
"formatted_text_file = 'tmp_IMDB_SP_example'\n",
"all_texts_df['new_text'].to_frame().to_csv(formatted_text_file, header=False,index=False,quotechar=' ')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SP Tokenizer SWIG wrapper"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"# Train SentencePiece on `formatted_text_file`;\n",
"# {model_prefix}.model and {model_prefix}.vocab are created in the pwd\n",
"# NOTE(review): the prefix says 'wk103m' although the corpus here is IMDB -- confirm the name is intended\n",
"vocab_size = 500\n",
"model_prefix = 'wk103m_example'\n",
"\n",
"spm.SentencePieceTrainer.Train(f'--input={formatted_text_file}'\\\n",
" f' --model_prefix={model_prefix}'\\\n",
" f' --vocab_size={vocab_size}')\n",
"# unused alternative flags for the special pieces:\n",
"# f'--unk_piece={UNK} --bos_piece={BOS} --eos_id=-1 --pad_piece={PAD}')"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## Vocab in order of frequency\n",
"!head -n50 {model_prefix}.vocab | nl"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## load up the Processor\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load(f'{model_prefix}.model')\n",
"\n",
"## itos from m.vocab file: each line is `<piece>\\t<score>`; key = 0-based line number\n",
"# the vocab holds non-ASCII pieces (e.g. '▁'), so decode it as UTF-8 explicitly\n",
"# instead of relying on the platform default encoding\n",
"with open(f'{model_prefix}.vocab','r', encoding='utf8') as f:\n",
"    itos = {line_num: line.split(\"\\t\")[0] for line_num,line in enumerate(f)}\n",
"\n",
"stoi = {v:k for k,v in itos.items()}"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"#Based on Tokenizer() class: https://github.com/fastai/fastai/blob/master/fastai/text/transform.py#L87\n",
"class CustomTokenizer():\n",
"    '''Wrapper for SentencePiece tokenizer to fit into Fast.ai V1.\n",
"\n",
"    `sp_processor` is the object whose `EncodeAsIds` turns a text into ids.\n",
"    `pre_rules` are applied to each text before encoding (falls back to\n",
"    `defaults.text_pre_rules` via `ifnone`). `post_rules` is accepted but not used.\n",
"    '''\n",
"    def __init__(self,sp_processor,pre_rules:ListRules=None,post_rules:ListRules=None):\n",
"        self.sp = sp_processor\n",
"        self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules )\n",
"\n",
"    def __repr__(self) -> str:\n",
"        return \"Custom Tokenizer\"\n",
"\n",
"    def process_text(self, t:str) -> List[int]:\n",
"        \"Process one text `t`: apply the pre-rules, then encode it to ids.\"\n",
"        for rule in self.pre_rules: t = rule(t)\n",
"        # encode with the processor passed to __init__, not the notebook-global `sp`\n",
"        return self.sp.EncodeAsIds(t)\n",
"\n",
"    def _process_all_1(self,texts:Collection[str]) -> List[List[int]]:\n",
"        'Process a list of `texts` in one process'\n",
"        return [self.process_text(t) for t in texts]\n",
"\n",
"    def process_all(self, texts:Collection[str]) -> List[List[int]]:\n",
"        \"Process a list of `texts`.\"\n",
"        return self._process_all_1(texts)\n",
" \n",
"mycust_tok = CustomTokenizer(sp)\n",
"#setup Vocab object for use in LM \n",
"# Vocab source: https://github.com/fastai/fastai/blob/master/fastai/text/transform.py#L122\n",
"sp_vocab = Vocab(itos)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>labels</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Detective Frank Keller(Al Pacino, in a perfect...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>This is a VERY good movie. I give it a 10.&lt;br ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>I can't say I enjoyed this as much as \"The Big...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>I sat last night to see this film being played...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>First of all, I agree that the plot left somet...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" labels text\n",
"0 0 Detective Frank Keller(Al Pacino, in a perfect...\n",
"1 0 This is a VERY good movie. I give it a 10.<br ...\n",
"2 0 I can't say I enjoyed this as much as \"The Big...\n",
"3 0 I sat last night to see this film being played...\n",
"4 0 First of all, I agree that the plot left somet..."
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# shuffle all row positions once; the first 10% of them form the validation set\n",
"idx = np.random.permutation(len(all_texts))\n",
"cut = int(0.1 * len(idx))\n",
"LM_valid_df = pd.DataFrame({'labels':[0] * cut, 'text':all_texts[idx[:cut]]}, columns=['labels','text'])\n",
"LM_train_df = pd.DataFrame({'labels':[0] * (len(idx)-cut), 'text':all_texts[idx[cut:]]}, columns=['labels','text'])\n",
"LM_train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"PATH = Path(\"./tmp_IMDB_example\")\n",
"\n",
"data = TextLMDataBunch.from_df(PATH, LM_train_df, LM_valid_df, \n",
"# tokenizer=mycust_tok, vocab=sp_vocab, \n",
" text_cols='text', label_cols='labels')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"60003"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data.vocab.itos)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['xxunk',\n",
" 'xxpad',\n",
" 'xxbos',\n",
" 'xxfld',\n",
" 'xxmaj',\n",
" 'xxup',\n",
" 'xxrep',\n",
" 'xxwrep',\n",
" 'the',\n",
" '.',\n",
" ',',\n",
" 'and',\n",
" 'a',\n",
" 'of',\n",
" 'to',\n",
" 'is',\n",
" 'it',\n",
" 'in',\n",
" 'i',\n",
" 'this',\n",
" 'that',\n",
" '\"',\n",
" \"'s\",\n",
" '-',\n",
" '\\n \\n ',\n",
" 'was',\n",
" 'as',\n",
" 'with',\n",
" 'for',\n",
" 'movie',\n",
" 'but',\n",
" 'film',\n",
" 'you',\n",
" ')',\n",
" 'on',\n",
" \"n't\",\n",
" '(',\n",
" 'not',\n",
" 'are',\n",
" 'he',\n",
" 'his',\n",
" 'have',\n",
" 'one',\n",
" 'be',\n",
" 'all',\n",
" 'at',\n",
" 'they',\n",
" 'by',\n",
" 'an',\n",
" 'who']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.vocab.itos[:50]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LM Learner"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5)\n",
"learn.freeze()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LM Train"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"lr_find(learn)\n",
"learn.recorder.plot()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Total time: 20:17 <p><table style='width:375px; margin-bottom:10px'>\n",
" <tr>\n",
" <th>epoch</th>\n",
" <th>train_loss</th>\n",
" <th>valid_loss</th>\n",
" <th>accuracy</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th>4.283346</th>\n",
" <th>4.043954</th>\n",
" <th>0.292471</th>\n",
" <th>20:17</th>\n",
" </tr>\n",
"</table>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"learn.fit_one_cycle(1,1e-2)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"learn.recorder.plot_losses()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7 fasta.ai1 DEV",
"language": "python",
"name": "fastai1_dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from fastai import *\n",
"from fastai.text import *\n",
"import re\n",
"import sentencepiece as spm #https://github.com/google/sentencepiece\n",
"\n",
"np.random.seed(20180311)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sentencepiece tokenizer & LM Build"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data processing to DataBunch"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get IMDB Data and build DF"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/home/farzin/.fastai/data/imdb')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMDB_PATH = untar_data(URLs.IMDB)\n",
"IMDB_PATH"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(75000, 25000)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## raw data does not have the unsup in the train directory. Just move it there\n",
"# ~/.fastai/data/imdb$ mv unsup/ ./train/\n",
"CLASSES = ['neg', 'pos', 'unsup']\n",
"\n",
"def get_texts(path):\n",
"    '''Collect the text of every file under `path/<label>` for each label in CLASSES.\n",
"\n",
"    Returns `(texts, labels)` as numpy arrays; every label is `[0]` because the\n",
"    texts are only used for the (unsupervised) language-model step.\n",
"    '''\n",
"    texts,labels = [],[]\n",
"    for label in CLASSES:\n",
"        for fname in (path/label).glob('*.*'):\n",
"            # read_text opens and closes the file, so no handle is left open\n",
"            texts.append(fname.read_text(encoding='utf8'))\n",
"            labels.append([0]) ## all unsupervised case\n",
"\n",
"    return np.array(texts),np.array(labels)\n",
"\n",
"train_texts,train_labels = get_texts(IMDB_PATH/'train')\n",
"valid_texts,valid_labels = get_texts(IMDB_PATH/'test')\n",
"\n",
"len(train_texts),len(valid_texts)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## we can combine the \"train\" and \"valid\" for the LM step\n",
"all_texts = np.concatenate([train_texts,valid_texts])\n",
"all_texts_df = pd.DataFrame({'text':all_texts, 'labels':[0]*len(all_texts)}, columns=['labels','text'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Default Rules:\n",
" ['fix_html', 'replace_rep', 'replace_wrep', 'spec_add_spaces', 'rm_useless_spaces'] \n",
"\n",
"\n",
"<function fix_html at 0x7f1058e22ea0>\n",
"<function replace_rep at 0x7f1058e22d90>\n",
"<function replace_wrep at 0x7f1058e22e18>\n",
"<function spec_add_spaces at 0x7f105dbc2730>\n",
"<function rm_useless_spaces at 0x7f1058e22d08>\n"
]
}
],
"source": [
"# start from the raw review text column\n",
"raw_text = all_texts_df.loc[:,'text']\n",
"\n",
"print(\"Default Rules:\\n\",[x.__name__ for x in defaults.text_pre_rules],\"\\n\\n\")\n",
"\n",
"# apply each of fastai's default pre-rules in turn to every text (cast to str first)\n",
"for rule in defaults.text_pre_rules:\n",
" print(rule)\n",
" raw_text = raw_text.apply(lambda x: rule(str(x)))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 <s>\"Fred Claus\" somehow avoids becoming this y...\n",
"1 <s>Firstly I would like to point out that I on...\n",
"2 <s>I finally got to have a look at this experi...\n",
"3 <s>After having red the overwhelming reviews t...\n",
"4 <s>About three minutes into this thing I start...\n",
"Name: new_text, dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## SentencePiece expects <s> ... </s> so we add that here:\n",
"all_texts_df['new_text'] = '<s>' + raw_text + '</s>'\n",
"all_texts_df['new_text'].head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"## save as a file for SP\n",
"# the `new_text` column is written without header or index; this file is the --input of the SP trainer\n",
"# NOTE(review): quotechar=' ' looks like a trick to stop to_csv from wrapping texts in double quotes -- confirm the written lines are what SentencePiece should see\n",
"formatted_text_file = 'tmp_IMDB_SP_example'\n",
"all_texts_df['new_text'].to_frame().to_csv(formatted_text_file, header=False,index=False,quotechar=' ')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SP Tokenizer SWIG wrapper"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train SentencePiece on `formatted_text_file`;\n",
"# {model_prefix}.model and {model_prefix}.vocab are created in the pwd\n",
"# NOTE(review): the prefix says 'wk103m' although the corpus here is IMDB -- confirm the name is intended\n",
"vocab_size = 500\n",
"model_prefix = 'wk103m_example'\n",
"\n",
"spm.SentencePieceTrainer.Train(f'--input={formatted_text_file}'\\\n",
" f' --model_prefix={model_prefix}'\\\n",
" f' --vocab_size={vocab_size}')\n",
"# unused alternative flags for the special pieces:\n",
"# f'--unk_piece={UNK} --bos_piece={BOS} --eos_id=-1 --pad_piece={PAD}')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1\t<unk>\t0\r\n",
" 2\t<s>\t0\r\n",
" 3\t</s>\t0\r\n",
" 4\t▁\t-2.84395\r\n",
" 5\ts\t-2.97\r\n",
" 6\tt\t-3.53742\r\n",
" 7\t▁the\t-3.7748\r\n",
" 8\t.\t-3.86454\r\n",
" 9\t,\t-3.91818\r\n",
" 10\t▁a\t-4.07048\r\n",
" 11\tn\t-4.18876\r\n",
" 12\ta\t-4.24687\r\n",
" 13\te\t-4.25059\r\n",
" 14\to\t-4.27973\r\n",
" 15\ty\t-4.44583\r\n",
" 16\t▁to\t-4.48563\r\n",
" 17\ting\t-4.50046\r\n",
" 18\t▁and\t-4.50961\r\n",
" 19\t▁of\t-4.5516\r\n",
" 20\t'\t-4.64682\r\n",
" 21\td\t-4.69638\r\n",
" 22\t▁in\t-4.76258\r\n",
" 23\ti\t-4.77796\r\n",
" 24\ted\t-4.78844\r\n",
" 25\tc\t-4.81265\r\n",
" 26\tr\t-4.84313\r\n",
" 27\t▁is\t-4.84642\r\n",
" 28\tre\t-4.896\r\n",
" 29\tp\t-4.94282\r\n",
" 30\t▁I\t-4.97998\r\n",
" 31\ter\t-4.9895\r\n",
" 32\tu\t-4.99602\r\n",
" 33\t▁it\t-5.11542\r\n",
" 34\tor\t-5.14418\r\n",
" 35\tb\t-5.17535\r\n",
" 36\tar\t-5.22629\r\n",
" 37\tm\t-5.2283\r\n",
" 38\tle\t-5.23756\r\n",
" 39\t▁f\t-5.23919\r\n",
" 40\tw\t-5.28417\r\n",
" 41\tf\t-5.28488\r\n",
" 42\tly\t-5.29174\r\n",
" 43\t▁that\t-5.29407\r\n",
" 44\tal\t-5.30172\r\n",
" 45\tg\t-5.30389\r\n",
" 46\tl\t-5.37982\r\n",
" 47\tin\t-5.40889\r\n",
" 48\t▁this\t-5.4378\r\n",
" 49\t-\t-5.48409\r\n",
" 50\tk\t-5.5797\r\n"
]
}
],
"source": [
"## Vocab in order of frequency\n",
"!head -n50 {model_prefix}.vocab | nl"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"## load up the Processor\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load(f'{model_prefix}.model')\n",
"\n",
"## itos from m.vocab file: each line is `<piece>\\t<score>`; key = 0-based line number\n",
"# the vocab holds non-ASCII pieces (e.g. '▁'), so decode it as UTF-8 explicitly\n",
"# instead of relying on the platform default encoding\n",
"with open(f'{model_prefix}.vocab','r', encoding='utf8') as f:\n",
"    itos = {line_num: line.split(\"\\t\")[0] for line_num,line in enumerate(f)}\n",
"\n",
"stoi = {v:k for k,v in itos.items()}"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#Based on Tokenizer() class: https://github.com/fastai/fastai/blob/master/fastai/text/transform.py#L87\n",
"class CustomTokenizer():\n",
"    '''Wrapper for SentencePiece tokenizer to fit into Fast.ai V1.\n",
"\n",
"    `sp_processor` is the object whose `EncodeAsIds` turns a text into ids.\n",
"    `pre_rules` are applied to each text before encoding (falls back to\n",
"    `defaults.text_pre_rules` via `ifnone`). `post_rules` is accepted but not used.\n",
"    '''\n",
"    def __init__(self,sp_processor,pre_rules:ListRules=None,post_rules:ListRules=None):\n",
"        self.sp = sp_processor\n",
"        self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules )\n",
"\n",
"    def __repr__(self) -> str:\n",
"        return \"Custom Tokenizer\"\n",
"\n",
"    def process_text(self, t:str) -> List[int]:\n",
"        \"Process one text `t`: apply the pre-rules, then encode it to ids.\"\n",
"        for rule in self.pre_rules: t = rule(t)\n",
"        # encode with the processor passed to __init__, not the notebook-global `sp`\n",
"        return self.sp.EncodeAsIds(t)\n",
"\n",
"    def _process_all_1(self,texts:Collection[str]) -> List[List[int]]:\n",
"        'Process a list of `texts` in one process'\n",
"        return [self.process_text(t) for t in texts]\n",
"\n",
"    def process_all(self, texts:Collection[str]) -> List[List[int]]:\n",
"        \"Process a list of `texts`.\"\n",
"        return self._process_all_1(texts)\n",
" \n",
"mycust_tok = CustomTokenizer(sp)\n",
"#setup Vocab object for use in LM \n",
"# Vocab source: https://github.com/fastai/fastai/blob/master/fastai/text/transform.py#L122\n",
"sp_vocab = Vocab(itos)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>labels</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Detective Frank Keller(Al Pacino, in a perfect...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>This is a VERY good movie. I give it a 10.&lt;br ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>I can't say I enjoyed this as much as \"The Big...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>I sat last night to see this film being played...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>First of all, I agree that the plot left somet...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" labels text\n",
"0 0 Detective Frank Keller(Al Pacino, in a perfect...\n",
"1 0 This is a VERY good movie. I give it a 10.<br ...\n",
"2 0 I can't say I enjoyed this as much as \"The Big...\n",
"3 0 I sat last night to see this film being played...\n",
"4 0 First of all, I agree that the plot left somet..."
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# shuffle all row positions once; the first 10% of them form the validation set\n",
"idx = np.random.permutation(len(all_texts))\n",
"cut = int(0.1 * len(idx))\n",
"LM_valid_df = pd.DataFrame({'labels':[0] * cut, 'text':all_texts[idx[:cut]]}, columns=['labels','text'])\n",
"LM_train_df = pd.DataFrame({'labels':[0] * (len(idx)-cut), 'text':all_texts[idx[cut:]]}, columns=['labels','text'])\n",
"LM_train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"PATH = Path(\"./tmp_IMDB_example\")\n",
"\n",
"data = TextLMDataBunch.from_df(PATH, LM_train_df, LM_valid_df, \n",
" tokenizer=mycust_tok, vocab=sp_vocab, \n",
" text_cols='text', label_cols='labels')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LM Learner"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5)\n",
"learn.freeze()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LM Train"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"lr_find(learn)\n",
"learn.recorder.plot()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Total time: 14:24 <p><table style='width:375px; margin-bottom:10px'>\n",
" <tr>\n",
" <th>epoch</th>\n",
" <th>train_loss</th>\n",
" <th>valid_loss</th>\n",
" <th>accuracy</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th>3.747901</th>\n",
" <th>3.465941</th>\n",
" <th>0.260205</th>\n",
" <th>14:24</th>\n",
" </tr>\n",
"</table>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"learn.fit_one_cycle(1,1e-2)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xl4VPXZ//H3nY3sCUtYAwZBZF9T3BfctS1WRSs/fSq2lmpr61O7aTet3Wxrfay1jz5q1ap1b9211CpurYpBFlkFBSQQIIQtEpYs9++POYxDSEgImcxM5vO6rrly5pzvOblPDsxnzvY95u6IiIgApMS6ABERiR8KBRERCVMoiIhImEJBRETCFAoiIhKmUBARkTCFgoiIhCkUREQkTKEgIiJhabEu4ED16NHDS0pKYl2GiEhCmT179kZ3L2qpXcKFQklJCWVlZbEuQ0QkoZjZqta00+EjEREJUyiIiEiYQkFERMIS7pyCiHQetbW1lJeXs3PnzliX0mlkZmZSXFxMenp6m+ZXKIhIzJSXl5OXl0dJSQlmFutyEp67U1VVRXl5OQMHDmzTMnT4SERiZufOnXTv3l2B0E7MjO7dux/UnpdCQURiSoHQvg7275k0obBk3TZumrGUTdt3x7oUEZG4lTShsHLjdm6buZx1W3VCS0RCqqqqGDt2LGPHjqV3797069cv/H737tZ9gbz00ktZunRplCvtOElzojk/M3QmftvO2hhXIiLxonv37sydOxeA66+/ntzcXL773e/u1cbdcXdSUpr+Dn3vvfdGvc6OlDR7CvlZQSjsUCiIyP4tX76c4cOHc9FFFzFixAgqKiqYPn06paWljBgxghtuuCHc9thjj2Xu3LnU1dVRWFjINddcw5gxYzjqqKPYsGFDDNeibZJwT6EuxpWISFN+9uxCFq3d1q7LHN43n+s+P6JN8y5ZsoT777+f0tJSAG688Ua6detGXV0dkyZNYsqUKQwfPnyvebZu3coJJ5zAjTfeyNVXX80999zDNddcc9Dr0ZGSaE8hlH/aUxCR1hg0aFA4EAAefvhhxo8fz/jx41m8eDGLFi3aZ56srCzOPPNMACZMmMDKlSs7qtx2kzR7CrldglDQOQWRuNTWb/TRkpOTEx5etmwZf/jDH5g1axaFhYVcfPHFTd4LkJGRER5OTU2lri7xjkwkzZ5CWmoKuV3S2LYj8TaSiMTWtm3byMvLIz8/n4qKCmbMmBHrkqImafYUAPIz07SnICIHbPz48QwfPpyhQ4dyyCGHcMwxx8S6pKgxd491DQektLTU2/qQnTNueZ0B3bK580ulLTcWkahbvHgxw4YNi3UZnU5Tf1czm+3uLX74Jc3hIwhdgaQ9BRGR5iVXKGTpnIKIyP4kVyhoT0FEZL+iGgpmVmhmT5jZEjNbbGZHNZpuZnarmS03s/lmNj6a9eRnpes+BRGR/Yj21Ud/AP7h7lPMLAPIbjT9TOCw4HUEcHvwMyryM9Oo3lVHQ4OTkqLuekVEGovanoKZFQDHA38GcPfd7r6lUbOzgfs95G2g0Mz6RKum/Kx03KF6l84riIg0JZqHjwYClcC9ZjbHzO42s5xGbfoBqyPelwfjoqIwO3S34dYaHUISEZg0adI+N6LdcsstXHHFFc3Ok5ubC8DatWuZMmVKk21OPPFEWrp0/pZbbqGmpib8/qyzzmLLlsbfmzteNEMhDRgP3O7u44DtQJt6hjKz6WZWZmZllZWVbS6oa3aoU7xNNXrQjojA1KlTeeSRR/Ya98gjjzB16tQW5+3bty9PPPFEm39341B44YUXKCwsbPPy2ks0Q6EcKHf3d4L3TxAKiUhrgP4R74uDcXtx9zvdvdTdS4uKitpc0J49hc0KBREBpkyZwvPPPx9+oM7KlStZu3Yt48aN4+STT2b8+PGMGjWKp59+ep95V65cyciRIwHYsWMHF154IcOGDeOcc85hx4
4d4XZXXHFFuMvt6667DoBbb72VtWvXMmnSJCZNmgRASUkJGzduBODmm29m5MiRjBw5kltuuSX8+4YNG8ZXv/pVRowYwWmnnbbX72kvUTvR7O7rzGy1mR3u7kuBk4HG3Qo+A1xpZo8QOsG81d0rolXTnj2FLQoFkfjz4jWw7v32XWbvUXDmjc1O7tatGxMnTuTFF1/k7LPP5pFHHuGCCy4gKyuLJ598kvz8fDZu3MiRRx7J5MmTm33+8e233052djaLFy9m/vz5jB//6fffX/7yl3Tr1o36+npOPvlk5s+fz7e+9S1uvvlmZs6cSY8ePfZa1uzZs7n33nt55513cHeOOOIITjjhBLp27cqyZct4+OGHueuuu7jgggv429/+xsUXX9w+f6tAtO9T+CbwVzObD4wFfmVml5vZ5cH0F4CPgOXAXcDXo1lM1z17Ctt1TkFEQiIPIe05dOTu/PCHP2T06NGccsoprFmzhvXr1ze7jNdffz384Tx69GhGjx4dnvbYY48xfvx4xo0bx8KFC5vscjvSm2++yTnnnENOTg65ubmce+65vPHGGwAMHDiQsWPHAtHrmjuql6S6+1ygcV8bd0RMd+Ab0awhUn5WOimmPQWRuLSfb/TRdPbZZ/Ptb3+b9957j5qaGiZMmMB9991HZWUls2fPJj09nZKSkia7ym7JihUruOmmm3j33Xfp2rUr06ZNa9Ny9ujSpUt4ODU1NSqHj5LqjubUFKMgK10nmkUkLDc3l0mTJvHlL385fIJ569at9OzZk/T0dGbOnMmqVav2u4zjjz+ehx56CIAFCxYwf/58INTldk5ODgUFBaxfv54XX3wxPE9eXh7V1dX7LOu4447jqaeeoqamhu3bt/Pkk09y3HHHtdfqtiipus6G0CGkzbokVUQiTJ06lXPOOSd8GOmiiy7i85//PKNGjaK0tJShQ4fud/4rrriCSy+9lGHDhjFs2DAmTJgAwJgxYxg3bhxDhw6lf//+e3W5PX36dM444wz69u3LzJkzw+PHjx/PtGnTmDhxIgCXXXYZ48aN67CnuCVV19kA5/7vv8nKSOWvlx3ZjlWJSFuo6+zoUNfZB6BrdoZONIuINCP5QiEnQ/cpiIg0I/lCITtdoSASRxLtEHa8O9i/Z9KFQmF2BjtrG9hZWx/rUkSSXmZmJlVVVQqGduLuVFVVkZmZ2eZlJOXVRxDq6qJPQVaMqxFJbsXFxZSXl3MwfZrJ3jIzMykuLm7z/EkYCkGneNsVCiKxlp6ezsCBA2NdhkRIusNHXXNCewpbdK+CiMg+ki4UugehULVdJ5tFRBpLulAoygv1HVJZvSvGlYiIxJ+kC4WCrHTSU40N1W3vlEpEpLNKulAwM3rlZ7J+q0JBRKSxpAsFgL6FWazdolAQEWksKUOhX2EWa7a0fz/kIiKJLilDoW9hJuu37aS+QXdRiohESspQ6FOQRV2D6wokEZFGkjIU+hWG7mRes6UmxpWIiMSXpAyFQUW5AHyw/pMYVyIiEl+SMhSKu2aR2yWNxRXbYl2KiEhcScpQSEkxhvbOUyiIiDSSlKEAMLRPHksqqtWPu4hIhKQNhWF98qneVUf5Zt2vICKyR1KHAqBDSCIiEZI2FIb2zsMMFldUx7oUEZG4kbShkJ2RRkn3HO0piIhEiOrjOM1sJVAN1AN17l7aaPqJwNPAimDU3939hmjWFGlYnzwWrlUoiIjs0RHPaJ7k7hv3M/0Nd/9cB9SxjxF9C3jh/XWs37aTXvmZsShBRCSuJO3hI4CzRvUB4G/vlce4EhGR+BDtUHDgn2Y228ymN9PmKDObZ2YvmtmIKNezl4E9cphY0o3Hy8p1v4KICNEPhWPdfTxwJvANMzu+0fT3gEPcfQzwR+CpphZiZtPNrMzMyiorK9u1wAs+058VG7dTtmpzuy5XRCQRRTUU3H1N8HMD8CQwsdH0be7+STD8ApBuZj2aWM6d7l7q7qVFRUXtWuNZo3pjBlc/Nrddly
sikoiiFgpmlmNmeXuGgdOABY3a9DYzC4YnBvVURaumpmRnpDFlfDGrN+3gvx+ZQ119Q0f+ehGRuBLNq496AU8Gn/lpwEPu/g8zuxzA3e8ApgBXmFkdsAO40GNwcP/nXxjJhupdPDV3LWWrNvPiVceRl5ne0WWIiMScJdoJ1tLSUi8rK4vKsr//xDweKwtdiTSkVy6/P38so4oLovK7REQ6kpnNbnyvWFOS+pLUxn47ZQx/vqSUMcUFfLD+Ez5/25vc8+aKlmcUEekkFAqNnDysF09feSx/vewIAG54bhG//ccS6hsSa49KRKQtFArNOGZwD+b+9FQO75XH/776IZc/OJsGBYOIdHIKhf0ozM5gxreP5wdnDOWlReu5/bUPY12SiEhUKRRa4fITDuX0Eb343YylfFT5SazLERGJGoVCK5gZPzhjKAAn/f41HUYSkU5LodBKhxbl8o1JgwB4Z8WmGFcjIhIdCoUDcOWkw8jtksbf1auqiHRSCoUDkJWRypkje/OPBevYXafuMESk81EoHKDTRvSmelcds3QISUQ6IYXCATpmcHcyUlN4demGWJciItLuFAoHKDsjjSMO7cZMhYKIdEIKhTY48fCefFi5nYqtO2JdiohIu1IotMGEQ7oCMOfjLTGuRESkfSkU2mB4n3y6pKXwnh7hKSKdjEKhDTLSUhjZr4A5q7WnICKdi0KhjUYXF7Bo7TZ1qS0inYpCoY2G9clnR209H2+qiXUpIiLtRqHQRkN75wGwpGJbjCsREWk/CoU2GtIrDzNYsq461qWIiLQbhUIbZaanMqBbNss2KBREpPNQKByEw3rm8cF6PXRHRDoPhcJBGNIrl5Ubt6vHVBHpNBQKB2FIrzzqGpwVG7fHuhQRkXahUDgIh/XKBeCD9TqvICKdg0LhIAwqyiXFYNkGnVcQkc5BoXAQMtNTOaR7Dsu0pyAinURUQ8HMVprZ+2Y218zKmphuZnarmS03s/lmNj6a9UTDYT1zdfhIRDqNjthTmOTuY929tIlpZwKHBa/pwO0dUE+7GtIrj5VVNeyqq491KSIiBy3Wh4/OBu73kLeBQjPrE+OaDshhvXKp1xVIItJJRDsUHPinmc02s+lNTO8HrI54Xx6MSxhDeoX6QNJNbCLSGaRFefnHuvsaM+sJvGRmS9z99QNdSBAo0wEGDBjQ3jUelIE9ckJXIOm8goh0AlHdU3D3NcHPDcCTwMRGTdYA/SPeFwfjGi/nTncvdffSoqKiaJXbJpnpqZR0z9HJZhHpFFoVCmY2yMy6BMMnmtm3zKywhXlyzCxvzzBwGrCgUbNngC8FVyEdCWx194oDXosYO6xXLst0+EhEOoHW7in8Dag3s8HAnYS+3T/Uwjy9gDfNbB4wC3je3f9hZpeb2eVBmxeAj4DlwF3A1w90BeLBkF55rNpUw85aXYEkIomttecUGty9zszOAf7o7n80szn7m8HdPwLGNDH+johhB75xIAXHo2F98qlvcJauq2ZM//3uQImIxLXW7inUmtlU4BLguWBcenRKSjxjgyCY8/HmGFciInJwWhsKlwJHAb909xVmNhB4IHplJZa+hVn0yO3CgrV6NKeIJLZWHT5y90XAtwDMrCuQ5+6/iWZhiWZE33wWKRREJMG19uqjV80s38y6Ae8Bd5nZzdEtLbEM75vPsg3VeuCOiCS01h4+KnD3bcC5hLqlOAI4JXplJZ4RffOprXc9s1lEElprQyEt6JPoAj490SwRhvfJB2ChDiGJSAJrbSjcAMwAPnT3d83sUGBZ9MpKPCXdc8jOSNV5BRFJaK090fw48HjE+4+A86JVVCJKSTGG9dHJZhFJbK090VxsZk+a2Ybg9TczK452cYlmeJ98FlVso6HBY12KiEibtPbw0b2E+inqG7yeDcZJhBF98/lkVx2rN9fEuhQRkTZpbSgUufu97l4XvO4D4qu70jgwvG/oZLMOIYlIomptKFSZ2cVmlhq8LgaqollYIhrSK4
/UFNMVSCKSsFobCl8mdDnqOqACmAJMi1JNCSszPZXBRbksqlAoiEhialUouPsqd5/s7kXu3tPdv4CuPmrSiL75LFy7NdZliIi0ycE8ee3qdquiExneN5/123axoXpnrEsRETlgBxMK1m5VdCLjBoS60X5v1ZYYVyIicuAOJhR0MX4TRvYrICM1hff0bAURSUD7vaPZzKpp+sPfgKyoVJTguqSlMqq4gNmrFAoiknj2GwruntdRhXQmY/sX8uDbq6itbyA99WB2xkREOpY+saJgbP9CdtU1sKRC3WiLSGJRKETBnmc2z12tQ0giklgUClFQ3DWLHrkZzFmtK5BEJLEoFKLAzBjbv5C5CgURSTAKhSgZ27+Qjyq3s7WmNtaliIi0mkIhSsb27wrAvHLtLYhI4lAoRMno/gWYoUNIIpJQFApRkp+ZzuCiXObozmYRSSBRD4Xg+QtzzOy5JqZNM7NKM5sbvC6Ldj0dac/JZnf1CCIiiaEj9hSuAhbvZ/qj7j42eN3dAfV0mLEDCtlcU8vHm/R4ThFJDFENBTMrBj4LdKoP+9YaUxy6iW1euZ6vICKJIdp7CrcA3wca9tPmPDObb2ZPmFn/KNfToQ7vnUdmeorOK4hIwohaKJjZ54AN7j57P82eBUrcfTTwEvCXZpY13czKzKyssrIyCtVGR3pqCmP7F/Luyk2xLkVEpFWiuadwDDDZzFYCjwAnmdmDkQ3cvcrddwVv7wYmNLUgd7/T3UvdvbSoqCiKJbe/McWFLFizjc3bd8e6FBGRFkUtFNz9WncvdvcS4ELgFXe/OLKNmfWJeDuZ/Z+QTkgj+hUA8PKSDTGuRESkZft9nkI0mNkNQJm7PwN8y8wmA3XAJmBaR9cTbaeP6EVGWgpL122LdSkiIi3qkFBw91eBV4Phn0aMvxa4tiNqiJUuaamM7V/Ifz6sinUpIiIt0h3NHeC4wT1YVLGNTTqvICJxTqHQAY49rAfu8MayxLlySkSSk0KhA4wuLqRrdjqvLVUoiEh8Uyh0gNQU4/ghRbz2QSUNDeoHSUTil0Khg0w6vCdV23fz/hp1eSEi8Uuh0EGOH1KEGbyqQ0giEscUCh2kW04Go4sLmblUN7GJSPxSKHSgE4YUMb98C1t36LnNIhKfFAod6OhB3WlweHeFOsgTkfikUOhAY/sXkpGWwmsf6LyCiMQnhUIHykxP5fQRvXlm3lpqdtfFuhwRkX0oFDrYRUcMYOuOWp6bVxHrUkRE9qFQ6GBHDOzGoUU5/PWdVbjrRjYRiS8KhQ5mZlx6dAnzyrcySyecRSTOKBRiYMqE/uR2SePqx+bFuhQRkb0oFGIgKyOVU4f3Ys2WHazeVBPrckREwhQKMXL1qUNIMfjjK8tiXYqISJhCIUb6d8vmpKE9eaysnPnlW2JdjogIoFCIqZ+dPRKAybf9m+27dN+CiMSeQiGG+hVmMe3oEgDueO3D2BYjIoJCIeaunzyCowd15+m5a6mtb4h1OSKS5BQKceDLxwzk4001nH7L63oym4jElEIhDpw8rCcj++XzUeV2fjNjSazLEZEkplCIA2bGs1cey+Ceufzfax/x9kdVsS5JRJKUQiFOmBk3TB4BwIV3vs0Nzy6KcUUikowUCnHk6ME9+PkXQpep3vPvFby7Un0jiUjHUijEmYuPGMDUif0BOP+Ot3h23toYVyQiySTqoWBmqWY2x8yea2JaFzN71MyWm9k7ZlYS7XrinZnx63NH851ThwDwzYfn8PTcNTGuSkSSRUfsKVwFLG5m2leAze4+GPgf4DcdUE9C+ObJh3Hb/xsHwFWPzOXDyk+o1+WqIhJlUQ0FMysGPgvc3UyTs4G/BMNPACebmUWzpkTyudF9uePiCQCc/PvXGPTDF1i6rjrGVYlIZxbtPYVbgO8Dzd2q2w9YDeDudcBWoHuUa0ooZ4zszah+BeH3p9/yOp+onyQRiZK0aC3YzD4HbHD32WZ24kEuazowHWDAgAHtUF1ief
abxwJQcs3zAIy8bgbdcjLomp3OtKNLOHd8MTldorYpRSSJWLSeE2xmvwb+C6gDMoF84O/ufnFEmxnA9e7+lpmlAeuAIt9PUaWlpV5WVhaVmuPdtp21jL7+n01OS00xfjZ5BD9+agGj+hXwxBVH0SUttYMrFJF4ZWaz3b20xXYd8fD4YE/hu+7+uUbjvwGMcvfLzexC4Fx3v2B/y0rmUADYuqOWLmkpbNi2iz++sozHZ5c32e6Igd149GtHdXB1IhKv4jYUzOwGoMzdnzGzTOABYBywCbjQ3T/a37KSPRSasrO2niN//TJbamo5elB3/vPh3t1kTB7Tl+nHH8qNLy5hdHEBX580mFwdbhJJKnEVCu1JodCyhWu38tlb39xvm+F98hnQLZs/TB2rw0wiSUChIPxn+Uaee7+Ctz6sYsXG7c22++pxA/nhWcPQ1cAinZdCQZq1YdtOvvfEfF77oLLFtmbgDlnpqTz2taMo6ZFNXmY69Q1Oaoqxs7aeNVt2MKgotwMqF5G2UihIq7z2QSWX3DOr3ZY3pn8hAPd/eSIFWekA3PryMm5+6QOeufIYuud24fGy1Uw7uoTC7AwAduyu5/f/XMr3zjhch7JEokShIAesvsFZuq6aqu27mH7/bK48aTBjigt5Y3kldfXOn99cEfUafv6FkXx+dB+em1/Bj59awAlDijjusB78/p8f8MJVx/GzZxdy9alDGF1cSEODc+7t/+F7px/O0YO6U7O7nuyMVB0GE2mCQkGiwt2pb3BSzPjRUwv40WeHkZ5qPPruasYUF/LignX8a/F6lm/4ZK/5BhXl8GFl8+c12tMh3bO5+tQhjB/Qlf7dsptss2HbTib+6mVe+c4JzC/fyglDihj385cAePW7J1LSI2efedyd6l115GemR7V+kWhQKEjMVWzdwXn/+x/uuqSUEX0L9pq2taaWnz27kEO653DpsSXkZ6bz8KyPufbv74fbpKUYde3YCeDz3zqWtz6sCt3o14qHGH3zpMH88ZXlzU5//XuTGNA9FDrujjukpIT2UnbW1mOGDodJ3FAoSKfj7pgZS9dVs2n7bo4atHc3WVt31PLg26vISk9l7uotPBPlZ1Fkpqcw/7rTmXzbmyyJ6Kjw82P6hp+DcdP5YzhvfD/MjBkL1/G1B2Zz69RxTB7Tt1W/o2LrDnbWNjCwiT0XkQOhUBABlq6rZuuOWj7ZVcuX79v73827PzqForwu+8yzfVcdVz82l1krNlHSI4c5H2/hjBG9+cfCdTz81SPpW5jJY2Wr+dPMD9tc16wfncx/llcxaWhPHn33Y86f0J81W3aws7YeB4b2zmPz9lqO/93M8DwXlBYD8FhZ6C72Z688lpv+uRQPppUe0o3eBZlsqdnN2x9t4jMlXdm+K3R12J6LCQ4tyuGm88fQuyCT7bvqOKR702GzJ4Bbo3pnLW9/tIlThvXU+Zw4plAQiaLddQ1Mf6CMV5eGLuuddnQJ3zxpMFc8+B4De+Rw43mjqKzexcRfvRzjSlvn1OG9eGnR+r3GTZ04gF+fO6rFeSf+8l9sqN7FlAnF/Pa80VTvrKMgO50X36/gn4tC55feX7OVi48cwC++0PLyJDoUCiIdoLa+gfTU1vVAv+fb9wNvreQnTy9s1TwnHl7E788fw+Ozy/nzmytwd646+bBWz7/Hjz87jF8839yzrlr2nVOHcOVJg7n+mYX85a1VDOyRw7+uPoHHylbvdR6oJb8/fwyrNtWQn5nG6SN6s6O2nm8+NIf+3bL46nGHcsShzfec7+5UVu+iZ37mPtMaGpz11TvpU5B1QOu1fEM1/btlx9W5nysfeo/n5lcw87sntuthQ4WCSBxzdxocHnx7FV/8TH8y01P3md7SoZgdu+vJyvh0vvoG5/n3K+idn8lnSrq2+lDO3W98xC+eX8wDX5nI0YN68IeXl1Gzq467D+AS5P2FTlsvGPjVOaP47Kg+3P/WSpasr+b5+RUAdM1OZ3NNLRcfOY
AH3/54n/le/e6JzFqxifGHdOW2V5bx1NxPzy1NOKQr4/oXcvebK7jzvyYw/YHZe83772tOol/hp8FSV99AWhD6X7pnFhmpxp3/VRq+oCBS4+3xxrJKnpwTepTu788fw4yF63hqzlqmTCjmsvs//Qw7ZVhPjh7Ug8lj+7K7roGjb3xlr+VectQhXPf5EU3+zgOhUBCRg/LC+xWs2LidvoWZfPvReeHxJwwp2utu+HPG9eN/vjiW5+dX8NsZS/jGpMF0SUth8pi+ewXT03PXcNUjc5v9fb3yu7B+267orMxB2nNnf3PyM9PYtrP9Hn7VPSeDqu279xpX0j2bf/z38ft8gWgthYKItKvddQ1kpIW+Ne+srWfoT/7BoUU5vHjVca0+/LKrrp70lJRmv/VWfbKLWSs2ccVf39tn2lUnH8awPvlc/uBsbvt/47jyoTlAaC/lmME9OLxXHtMfKONfizfsNd+PPzuMv7y1kvTUFD5qdK9MZnoKs350Cqs31XDVI3P3ub8mmt77yanM+Xgz33l8HltqasPjRxcX8NTXj+HY37xCQXYGiyu2haedOrwXd32pxc/1JikURCQp1dY3kGqGE3r4VKSGBueZeWs5bUQvsjP27T7+zWUbcZzyzTu49u/v8+6PTuHVpRt4YnY5A3vkcP3kESyu2Eb55h1s3VHLxIHdAHh58QYuPaaEOR9v4bUPKkM3Qw4oDH+r/6jyEzLTU+lbmMWqqu0M6Ja9z+G9/3y4kfzMdEb22/ueHndnZVUNZSs3ccKQoibPqbSGQkFERMJaGwqtu2xCRESSgkJBRETCFAoiIhKmUBARkTCFgoiIhCkUREQkTKEgIiJhCgUREQlTKIiISJhCQUREwhQKIiISplAQEZGwqIWCmWWa2Swzm2dmC83sZ020mWZmlWY2N3hdFq16RESkZfv2Hdt+dgEnufsnZpYOvGlmL7r7243aPeruV0axDhERaaWohYKH+uTe88SK9OCVWP10i4gkmaieUzCzVDObC2wAXnL3d5podp6ZzTezJ8ysfzPLmW5mZWZWVllZ2VQTERFpB1ENBXevd/exQDEw0cxGNmryLFDi7qOBl4C/NLOcO9291N1Li4qKolmyiEhS65Crj9x9CzA02ZI5AAAJaUlEQVQTOKPR+Cp33/Ok7ruBCR1Rj4iINC2aVx8VmVlhMJwFnAosadSmT8TbycDiaNUjIiIti+bVR32Av5hZKqHweczdnzOzG4Ayd38G+JaZTQbqgE3AtCjWIyIiLbDQRUKJo7S01MvKymJdhohIQjGz2e5e2lI73dEsIiJhCgUREQlTKIiISJhCQUREwhQKIiISplAQEZEwhYKIiIQl3H0KZlYJrGrj7D2Aje1YTjzRuiWezrpeoHWLR4e4e4udxyVcKBwMMytrzc0biUjrlng663qB1i2R6fCRiIiEKRRERCQs2ULhzlgXEEVat8TTWdcLtG4JK6nOKYiIyP4l256CiIjsR9KEgpmdYWZLzWy5mV0T63paYmb9zWymmS0ys4VmdlUwvpuZvWRmy4KfXYPxZma3Bus338zGRyzrkqD9MjO7JFbr1FjwDO85ZvZc8H6gmb0TrMOjZpYRjO8SvF8eTC+JWMa1wfilZnZ6bNbkU2ZWGDxvfImZLTazozrLNjOzbwf/FheY2cNmlpmo28zM7jGzDWa2IGJcu20nM5tgZu8H89xqZtaxa3gQ3L3Tv4BU4EPgUCADmAcMj3VdLdTcBxgfDOcBHwDDgd8C1wTjrwF+EwyfBbwIGHAk8E4wvhvwUfCzazDcNdbrF9R2NfAQ8Fzw/jHgwmD4DuCKYPjrwB3B8IXAo8Hw8GBbdgEGBts4Ncbr9BfgsmA4AyjsDNsM6AesALIittW0RN1mwPHAeGBBxLh2207ArKCtBfOeGcvtd0B/m1gX0EH/AI4CZkS8vxa4NtZ1HeA6PE3okaZLgT7BuD7A0mD4/4CpEe2XBtOnAv8XMX6vdjFcn2LgZeAk4L
ngP89GIK3xNgNmAEcFw2lBO2u8HSPbxWidCoIPTms0PuG3WRAKq4MPwLRgm52eyNsMKGkUCu2ynYJpSyLG79Uu3l/Jcvhozz/oPcqDcQkh2PUeB7wD9HL3imDSOqBXMNzcOsbrut8CfB9oCN53B7a4e13wPrLO8DoE07cG7eNt3QYClcC9wWGxu80sh06wzdx9DXAT8DFQQWgbzCbxt1mk9tpO/YLhxuMTQrKEQsIys1zgb8B/u/u2yGke+hqScJePmdnngA3uPjvWtbSzNEKHJG5393HAdkKHIcISeJt1Bc4mFHx9gRzgjJgWFUWJup3aQ7KEwhqgf8T74mBcXDOzdEKB8Fd3/3swer2Z9Qmm9wE2BOObW8d4XPdjgMlmthJ4hNAhpD8AhWaWFrSJrDO8DsH0AqCK+Fu3cqDc3d8J3j9BKCQ6wzY7BVjh7pXuXgv8ndB2TPRtFqm9ttOaYLjx+ISQLKHwLnBYcKVEBqETX8/EuKb9Cq5W+DOw2N1vjpj0DLDnKodLCJ1r2DP+S8GVEkcCW4Nd4RnAaWbWNfi2d1owLmbc/Vp3L3b3EkLb4hV3vwiYCUwJmjVetz3rPCVo78H4C4MrXQYChxE6wRcT7r4OWG1mhwejTgYW0Qm2GaHDRkeaWXbwb3PPuiX0NmukXbZTMG2bmR0Z/K2+FLGs+Bfrkxod9SJ0BcEHhK52+FGs62lFvccS2n2dD8wNXmcROi77MrAM+BfQLWhvwJ+C9XsfKI1Y1peB5cHr0livW6P1PJFPrz46lNAHxHLgcaBLMD4zeL88mH5oxPw/CtZ5KXFwhQcwFigLtttThK5K6RTbDPgZsARYADxA6AqihNxmwMOEzo3UEtrD+0p7biegNPg7fQjcRqOLD+L5pTuaRUQkLFkOH4mISCsoFEREJEyhICIiYQoFEREJUyiIiEiYQkHijpnVm9lcM5tnZu+Z2dEttC80s6+3YrmvmlmnfbZuW5jZfWY2peWWkiwUChKPdrj7WHcfQ6gDtV+30L6QUK+ccSnijl+RuKdQkHiXD2yGUD9QZvZysPfwvpmdHbS5ERgU7F38Lmj7g6DNPDO7MWJ555vZLDP7wMyOC9qmmtnvzOzdoL/8rwXj+5jZ68FyF+xpH8nMVprZb4PfNcvMBgfj7zOzO8zsHeC3QV/9TwXLf9vMRkes073B/PPN7Lxg/Glm9lawro8HfWBhZjda6Bkb883spmDc+UF988zs9RbWyczsNgs9y+BfQM/23FiS+PQNRuJRlpnNJXRXbB9CfSMB7ATOcfdtZtYDeNvMniHU6dxIdx8LYGZnEuq87Qh3rzGzbhHLTnP3iWZ2FnAdoT59vkKo64LPmFkX4N9m9k/gXELdFvzSzFKB7Gbq3eruo8zsS4R6f/1cML4YONrd683sj8Acd/+CmZ0E3E/o7uef7Jk/qL1rsG4/Bk5x9+1m9gPgajP7E3AOMNTd3cwKg9/zU+B0d18TMa65dRoHHE7ouQa9CHVVcU+rtookBYWCxKMdER/wRwH3m9lIQt0N/MrMjifU5XY/Pu3eONIpwL3uXgPg7psipu3pWHA2of70IdRnzeiIY+sFhPrkeRe4x0IdEz7l7nObqffhiJ//EzH+cXevD4aPBc4L6nnFzLqbWX5Q64V7ZnD3zRbqRXY4oQ9yCD2s5y1C3U/vBP5soafVPRfM9m/gPjN7LGL9mlun44GHg7rWmtkrzayTJCmFgsQ1d38r+OZcRKjvpyJggrvXWqiX1cwDXOSu4Gc9n/77N+Cb7r5Pp3NBAH2W0Ifuze5+f1NlNjO8/QBrC/9a4CV3n9pEPRMJdUY3BbgSOMndLzezI4I6Z5vZhObWKdhDEmmWzilIXDOzoYQep1pF6NvuhiAQJgGHBM2qCT2ydI+XgEvNLDtYRuTho6bMAK4I9ggwsyFmlmNmhwDr3f0u4G5C3WA35YsRP99qps0bwEXB8k8ENnro+RgvAd+IWN+uwNvAMRHnJ3KCmnKBAn
d/Afg2MCaYPsjd33H3nxJ6yE//5tYJeB34YnDOoQ8wqYW/jSQZ7SlIPNpzTgFC33gvCY7L/xV41szeJ9QT6RIAd68ys39b6CHsL7r798xsLFBmZruBF4Af7uf33U3oUNJ7FjpeUwl8gVAPrt8zs1rgE0JdIDelq5nNJ7QXss+3+8D1hA5FzQdq+LSL5l8Afwpqrwd+5u5/N7NpwMPB+QAInWOoBp42s8zg73J1MO13ZnZYMO5lQs9Ant/MOj1J6BzNIkLdYTcXYpKk1EuqyEEIDmGVuvvGWNci0h50+EhERMK0pyAiImHaUxARkTCFgoiIhCkUREQkTKEgIiJhCgUREQlTKIiISNj/ByeoZfppHxwhAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"learn.recorder.plot_losses()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7 fasta.ai1 DEV",
"language": "python",
"name": "fastai1_dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from fastai import *\n",
"from fastai.text import *\n",
"import re\n",
"import sentencepiece as spm #https://github.com/google/sentencepiece\n",
"\n",
"np.random.seed(20180311)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sentencepiece tokenizer & LM Build"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data processing to DataBunch"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get IMDB Data and build DF"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/home/farzin/.fastai/data/imdb')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"IMDB_PATH = untar_data(URLs.IMDB)\n",
"IMDB_PATH"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(75000, 25000)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## raw data does not have the unsup in the train directory. Just move it there\n",
"# ~/.fastai/data/imdb$ mv unsup/ ./train/\n",
"CLASSES = ['neg', 'pos', 'unsup']\n",
"\n",
"def get_texts(path):\n",
" texts,labels = [],[]\n",
" for idx,label in enumerate(CLASSES):\n",
" for fname in (path/label).glob('*.*'):\n",
" texts.append(fname.open('r', encoding='utf8').read())\n",
" labels.append([0]) ## all unsupervised case\n",
"\n",
" return np.array(texts),np.array(labels)\n",
"\n",
"train_texts,train_labels = get_texts(IMDB_PATH/'train')\n",
"valid_texts,valid_labels = get_texts(IMDB_PATH/'test')\n",
"\n",
"len(train_texts),len(valid_texts)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## we can combine the \"train\" and \"valid\" for the LM step\n",
"all_texts = np.concatenate([train_texts,valid_texts])\n",
"all_texts_df = pd.DataFrame({'text':all_texts, 'labels':[0]*len(all_texts)}, columns=['labels','text'])"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"raw_text = all_texts_df.loc[:,'text']\n",
"\n",
"print(\"Default Rules:\\n\",[x.__name__ for x in defaults.text_pre_rules],\"\\n\\n\")\n",
"\n",
"for rule in defaults.text_pre_rules:\n",
" print(rule)\n",
" raw_text = raw_text.apply(lambda x: rule(str(x)))"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## SentencePiece expects <s> ... </s> so we add that here:\n",
"all_texts_df['new_text'] = '<s>' + raw_text + '</s>'\n",
"all_texts_df['new_text'].head()"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## save as a file for SP\n",
"formatted_text_file = 'tmp_IMDB_SP_example'\n",
"all_texts_df['new_text'].to_frame().to_csv(formatted_text_file, header=False,index=False,quotechar=' ')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SP Tokenizer SWIG wrapper"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"# m.model and m.vocab created in the pwd\n",
"vocab_size = 500\n",
"model_prefix = 'wk103m_example'\n",
"\n",
"spm.SentencePieceTrainer.Train(f'--input={formatted_text_file}'\\\n",
" f' --model_prefix={model_prefix}'\\\n",
" f' --vocab_size={vocab_size}')\n",
"# f'--unk_piece={UNK} --bos_piece={BOS} --eos_id=-1 --pad_piece={PAD}')"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## Vocab in order of frequency\n",
"!head -n50 {model_prefix}.vocab | nl"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"## load up the Processor\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.load(f'{model_prefix}.model')\n",
"\n",
"## itos from m.vocab file: just read directly and populate the dictionary\n",
"itos = {}\n",
"with open(f'{model_prefix}.vocab','r') as f:\n",
" for line_num,line in enumerate(f):\n",
" itos[line_num] = line.split(\"\\t\")[0]\n",
" \n",
"stoi = {v:k for k,v in itos.items()}"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"#Based on Tokenizer() class: https://github.com/fastai/fastai/blob/master/fastai/text/transform.py#L87\n",
"class CustomTokenizer():\n",
" '''Wrapper for SentencePiece toeknizer to fit into Fast.ai V1'''\n",
" def __init__(self,sp_processor,pre_rules:ListRules=None,post_rules:ListRules=None):\n",
" self.sp = sp_processor\n",
" self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules )\n",
" \n",
" def __repr__(self) -> str:\n",
" return \"Custom Tokenizer\"\n",
"\n",
" def process_text(self, t:str) -> List[str]:\n",
" \"Processe one text `t` with tokenizer `tok`.\"\n",
" for rule in self.pre_rules: t = rule(t) \n",
" toks = sp.EncodeAsIds(t)\n",
" \n",
" return toks \n",
" \n",
" def _process_all_1(self,texts:Collection[str]) -> List[List[str]]:\n",
" 'Process a list of `texts` in one process'\n",
" return [self.process_text(t) for t in texts]\n",
" \n",
" def process_all(self, texts:Collection[str]) -> List[List[str]]: \n",
" \"Process a list of `texts`.\" \n",
" return self._process_all_1(texts)\n",
" \n",
"mycust_tok = CustomTokenizer(sp)\n",
"#setup Vocab object for use in LM \n",
"# Vocab source: https://github.com/fastai/fastai/blob/master/fastai/text/transform.py#L122\n",
"sp_vocab = Vocab(itos)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>labels</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Detective Frank Keller(Al Pacino, in a perfect...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>This is a VERY good movie. I give it a 10.&lt;br ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>I can't say I enjoyed this as much as \"The Big...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>I sat last night to see this film being played...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>First of all, I agree that the plot left somet...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" labels text\n",
"0 0 Detective Frank Keller(Al Pacino, in a perfect...\n",
"1 0 This is a VERY good movie. I give it a 10.<br ...\n",
"2 0 I can't say I enjoyed this as much as \"The Big...\n",
"3 0 I sat last night to see this film being played...\n",
"4 0 First of all, I agree that the plot left somet..."
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"idx = np.random.permutation(len(all_texts))\n",
"cut = int(0.1 * len(idx))\n",
"LM_train_df = pd.DataFrame({'text':all_texts[idx[cut:]], 'labels':[0] * (len(all_texts)-cut)}, columns=['labels','text'])\n",
"LM_valid_df = pd.DataFrame({'text':all_texts[idx[:cut]], 'labels':[0] * cut}, columns=['labels','text'])\n",
"LM_train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"PATH = Path(\"./tmp_IMDB_example\")\n",
"\n",
"data = TextLMDataBunch.from_df(PATH, LM_train_df, LM_valid_df, \n",
"# tokenizer=mycust_tok, vocab=sp_vocab, \n",
" text_cols='text', label_cols='labels')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"60003"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data.vocab.itos)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['xxunk',\n",
" 'xxpad',\n",
" 'xxbos',\n",
" 'xxfld',\n",
" 'xxmaj',\n",
" 'xxup',\n",
" 'xxrep',\n",
" 'xxwrep',\n",
" 'the',\n",
" '.',\n",
" ',',\n",
" 'and',\n",
" 'a',\n",
" 'of',\n",
" 'to',\n",
" 'is',\n",
" 'it',\n",
" 'in',\n",
" 'i',\n",
" 'this',\n",
" 'that',\n",
" '\"',\n",
" \"'s\",\n",
" '-',\n",
" '\\n \\n ',\n",
" 'was',\n",
" 'as',\n",
" 'with',\n",
" 'for',\n",
" 'movie',\n",
" 'but',\n",
" 'film',\n",
" 'you',\n",
" ')',\n",
" 'on',\n",
" \"n't\",\n",
" '(',\n",
" 'not',\n",
" 'are',\n",
" 'he',\n",
" 'his',\n",
" 'have',\n",
" 'one',\n",
" 'be',\n",
" 'all',\n",
" 'at',\n",
" 'they',\n",
" 'by',\n",
" 'an',\n",
" 'who']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.vocab.itos[:50]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LM Learner"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5)\n",
"learn.freeze()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LM Train"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"lr_find(learn)\n",
"learn.recorder.plot()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Total time: 20:17 <p><table style='width:375px; margin-bottom:10px'>\n",
" <tr>\n",
" <th>epoch</th>\n",
" <th>train_loss</th>\n",
" <th>valid_loss</th>\n",
" <th>accuracy</th>\n",
" <th>time</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th>4.283346</th>\n",
" <th>4.043954</th>\n",
" <th>0.292471</th>\n",
" <th>20:17</th>\n",
" </tr>\n",
"</table>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"learn.fit_one_cycle(1,1e-2)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"learn.recorder.plot_losses()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7 fasta.ai1 DEV",
"language": "python",
"name": "fastai1_dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment