Last active August 11, 2019 13:39
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai import *\n",
    "from fastai.text import *\n",
    "\n",
    "import re\n",
    "import sentencepiece as spm  # https://github.com/google/sentencepiece"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SentencePiece tokenizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data processing to DataBunch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading the texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip\n",
    "PATH = Path('/home/farzin/rnn_python_code/wikitext-103-raw')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/farzin/anaconda3/envs/fastaiv1_dev/lib/python3.7/site-packages/ipykernel_launcher.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      " This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "# made-up separator so it will not divide lines\n",
    "# taking the first 100 lines, just to run quickly and demonstrate functionality\n",
    "all_texts_df = pd.read_csv(PATH/'wiki.train.raw',sep='%$#',header=None).head(100)\n",
    "all_texts = all_texts_df.values.squeeze()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SentencePiece expects some extra tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       " .dataframe tbody tr th:only-of-type {\n",
       " vertical-align: middle;\n",
       " }\n",
       "\n",
       " .dataframe tbody tr th {\n",
       " vertical-align: top;\n",
       " }\n",
       "\n",
       " .dataframe thead th {\n",
       " text-align: right;\n",
       " }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       " <thead>\n",
       " <tr style=\"text-align: right;\">\n",
       " <th></th>\n",
       " <th>0</th>\n",
       " </tr>\n",
       " </thead>\n",
       " <tbody>\n",
       " <tr>\n",
       " <th>0</th>\n",
       " <td>= Valkyria Chronicles III =</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>1</th>\n",
       " <td>Senjō no Valkyria 3 : Unrecorded Chronicles ( ...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2</th>\n",
       " <td>The game began development in 2010 , carrying ...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>3</th>\n",
       " <td>It met with positive sales in Japan , and was ...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4</th>\n",
       " <td>= = Gameplay = =</td>\n",
       " </tr>\n",
       " </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       " 0\n",
       "0 = Valkyria Chronicles III =\n",
       "1 Senjō no Valkyria 3 : Unrecorded Chronicles ( ...\n",
       "2 The game began development in 2010 , carrying ...\n",
       "3 It met with positive sales in Japan , and was ...\n",
       "4 = = Gameplay = ="
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_texts_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<function fastai.text.transform.fix_html(x: str) -> str>,\n",
       " <function fastai.text.transform.replace_rep(t: str) -> str>,\n",
       " <function fastai.text.transform.replace_wrep(t: str) -> str>,\n",
       " <function fastai.text.transform.spec_add_spaces(t: str) -> str>,\n",
       " <function fastai.text.transform.rm_useless_spaces(t: str) -> str>]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "defaults.text_pre_rules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<function fix_html at 0x7fa78cbc0048>\n",
      "<function replace_rep at 0x7fa78cbb4ea0>\n",
      "<function replace_wrep at 0x7fa78cbb4f28>\n",
      "<function spec_add_spaces at 0x7fa7a1307840>\n",
      "<function rm_useless_spaces at 0x7fa78cbb4e18>\n"
     ]
    }
   ],
   "source": [
    "raw_text = all_texts_df.iloc[:,0]\n",
    "for rule in defaults.text_pre_rules:\n",
    "    print(rule)\n",
    "    raw_text = raw_text.apply(lambda x: rule(str(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_texts_df['new_text'] = '<s>' + raw_text + '</s>'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0 <s>= Valkyria Chronicles III =</s>\n",
       "1 <s>Senjō no Valkyria 3 : Unrecorded Chronicles...\n",
       "2 <s>The game began development in 2010 , carryi...\n",
       "3 <s>It met with positive sales in Japan , and w...\n",
       "4 <s>= = Gameplay = =</s>\n",
       "Name: new_text, dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_texts_df['new_text'].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save the file to disk so we can call the SWIG-wrapped SentencePiece trainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "formatted_text_file = 'wk103_text_example'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_texts_df['new_text'].to_frame().to_csv(formatted_text_file, header=False,index=False,quotechar=' ')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SentencePiece tokenizer wrapped appropriately"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# {model_prefix}.model and {model_prefix}.vocab are created in the pwd\n",
    "vocab_size = 500\n",
    "model_prefix = 'wk103m_example'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spm.SentencePieceTrainer.Train(f'--input={formatted_text_file}'\\\n",
    "                               f' --model_prefix={model_prefix}'\\\n",
    "                               f' --vocab_size={vocab_size}')\n",
    "# f'--unk_piece={UNK} --bos_piece={BOS} --eos_id=-1 --pad_piece={PAD}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 1\t<unk>\t0\r\n",
      " 2\t<s>\t0\r\n",
      " 3\t</s>\t0\r\n",
      " 4\t▁\t-2.23492\r\n",
      " 5\ts\t-2.83873\r\n",
      " 6\t▁the\t-3.45204\r\n",
      " 7\t▁,\t-3.81317\r\n",
      " 8\tt\t-3.81373\r\n",
      " 9\te\t-3.93292\r\n",
      " 10\ted\t-4.05182\r\n"
     ]
    }
   ],
   "source": [
    "# Head of the vocab file: line nums indicate index of vocab\n",
    "!head -n10 {model_prefix}.vocab | nl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## load up the Processor\n",
    "sp = spm.SentencePieceProcessor()\n",
    "sp.load(f'{model_prefix}.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "itos = []  # list position is the token id used by fastai's Vocab\n",
    "with open(f'{model_prefix}.vocab','r') as f:\n",
    "    for line_num,line in enumerate(f):\n",
    "        itos.append(line.split(\"\\t\")[0])\n",
    "\n",
    "class SPTokenizer(BaseTokenizer):\n",
    "    \"Wrapper around a SentencePiece tokenizer to make it a `BaseTokenizer`.\"\n",
    "    def __init__(self, model_prefix:str):\n",
    "        self.tok = spm.SentencePieceProcessor()\n",
    "        self.tok.load(f'{model_prefix}.model')\n",
    "\n",
    "    def tokenizer(self, t:str) -> List[str]:\n",
    "        return self.tok.EncodeAsPieces(t)\n",
    "\n",
    "class CustomTokenizer():\n",
    "    '''Wrapper for the SentencePiece tokenizer to fit into fast.ai v1'''\n",
    "    def __init__(self, tok_func:Callable, model_prefix:str, pre_rules:ListRules=None):\n",
    "        self.tok_func,self.model_prefix = tok_func,model_prefix\n",
    "        self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules)\n",
    "\n",
    "    def __repr__(self) -> str:\n",
    "        res = f'Tokenizer {self.tok_func.__name__} using `{self.model_prefix}` model with the following rules:\\n'\n",
    "        for rule in self.pre_rules: res += f' - {rule.__name__}\\n'\n",
    "        return res\n",
    "\n",
    "    def process_text(self, t:str, tok:BaseTokenizer) -> List[str]:\n",
    "        \"Process one text `t` with tokenizer `tok`.\"\n",
    "        for rule in self.pre_rules: t = rule(t)\n",
    "        toks = tok.tokenizer(t)\n",
    "        # post rules?\n",
    "        return toks\n",
    "\n",
    "    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:\n",
    "        'Process a list of `texts` in one process'\n",
    "        tok = self.tok_func(self.model_prefix)\n",
    "        return [self.process_text(t,tok) for t in texts]\n",
    "\n",
    "    def process_all(self, texts:Collection[str]) -> List[List[str]]:\n",
    "        \"Process a list of `texts`.\"\n",
    "        return self._process_all_1(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "mycust_tok = CustomTokenizer(SPTokenizer, model_prefix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up the Vocab object for use in the LM\n",
    "sp_vocab = Vocab(itos)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build DataBunch from tokenizer and Vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "idx = np.random.permutation(len(all_texts))\n",
    "cut = int(0.1 * len(idx))\n",
    "train_df = pd.DataFrame({'text':all_texts[idx[cut:]], 'labels':[0] * (len(all_texts)-cut)}, columns=['labels','text'])\n",
    "valid_df = pd.DataFrame({'text':all_texts[idx[:cut]], 'labels':[0] * cut}, columns=['labels','text'])\n",
    "\n",
    "train_df = train_df.dropna()\n",
    "valid_df = valid_df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = TextLMDataBunch.from_df(PATH, train_df, valid_df,\n",
    "                               tokenizer=mycust_tok, vocab=sp_vocab,\n",
    "                               text_cols='text', label_cols='labels')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       " <thead>\n",
       " <tr style=\"text-align: right;\">\n",
       " <th>idx</th>\n",
       " <th>text</th>\n",
       " </tr>\n",
       " </thead>\n",
       " <tbody>\n",
       " <tr>\n",
       " <td>0</td>\n",
       " <td>▁ ) ▁wa s ▁ place d ▁in ▁ ch ar ge ▁of ▁the ▁A rsenal ▁. ▁Du nning t on ▁pre s u m a b ly ▁return ed ▁to ▁his ▁ n a v al ▁du t ie s ▁ and ▁the ▁P on ch ar tra in ▁. ▁ x x b o s ▁H all ▁' s ▁c ar b ine s ▁2 6 7 ▁</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <td>1</td>\n",
       " <td>▁the ▁ e ar ly ▁su m m er ▁of ▁186 3 ▁. ▁ x x b o s ▁A s ▁the ▁Nam ele s s ▁ o ffici all y ▁do ▁not ▁ex is t ▁, ▁the ▁up per ▁ e ch el on s ▁of ▁the ▁Gallia n ▁Arm y ▁exp l o it ▁the ▁con cept ▁of ▁p l a us i ble ▁ d en i</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <td>2</td>\n",
       " <td>▁the ir ▁ l ead er ▁, ▁D a hau ▁. ▁A t ▁the ▁ s ame ▁time ▁, ▁ ele ment s ▁with in ▁Gallia n ▁Arm y ▁C om m and ▁move ▁to ▁ er a se ▁the ▁Nam ele s s ▁in ▁order ▁to ▁pro t ect ▁the ir ▁ o w n ▁in t ere st s ▁. ▁H ound ed ▁b y ▁bo th ▁</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <td>3</td>\n",
       " <td>us ed ▁Kur t ▁of ▁T reas on ▁. ▁ x x b o s ▁P er h a p s ▁the ▁most ▁ ill u m in at ing ▁point s ▁of ▁the ▁abo ve ▁\" ▁Summar y ▁of ▁Work ▁\" ▁ and ▁ th o se ▁for ▁follow ing ▁month s ▁are ▁tha t ▁the ▁ st and ar d ▁ am munition ▁ma de ▁wa s ▁.</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <td>4</td>\n",
       " <td>▁original ▁Li ttle ▁R ock ▁A rsenal ▁ and ▁one ▁of ▁the ▁ ol de st ▁building s ▁in ▁c ent r al ▁Arkansas ▁, ▁it ▁wa s ▁a lso ▁the ▁b ir th place ▁of ▁Gen eral ▁D o u g l a s ▁Mac Ar th ur ▁, ▁who ▁be c ame ▁the ▁su p re m e ▁command er ▁of ▁ U S ▁force s ▁in ▁the</td>\n",
       " </tr>\n",
       " </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "data.show_batch()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7 fasta.ai1 DEV",
   "language": "python",
   "name": "fastai1_dev"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
} |
I am not too sure. I have moved past this example and now have better code for the custom tokenizer: it can be saved, and it uses EncodeAsPieces, which returns the sub-word pieces rather than the IDs (the numericalization!); there is a quick check of that difference right after the class code below. See if this helps at all or if you get the same errors:
class SPTokenizer(BaseTokenizer):
    "Wrapper around a SentencePiece tokenizer to make it a `BaseTokenizer`."
    def __init__(self, model_prefix:str):
        self.tok = spm.SentencePieceProcessor()
        self.tok.load(f'{model_prefix}.model')

    def tokenizer(self, t:str) -> List[str]:
        return self.tok.EncodeAsPieces(t)

class CustomTokenizer():
    '''Wrapper for the SentencePiece tokenizer to fit into fast.ai v1'''
    def __init__(self, tok_func:Callable, model_prefix:str, pre_rules:ListRules=None):
        self.tok_func,self.model_prefix = tok_func,model_prefix
        self.pre_rules = ifnone(pre_rules, defaults.text_pre_rules)

    def __repr__(self) -> str:
        res = f'Tokenizer {self.tok_func.__name__} using `{self.model_prefix}` model with the following rules:\n'
        for rule in self.pre_rules: res += f' - {rule.__name__}\n'
        return res

    def process_text(self, t:str, tok:BaseTokenizer) -> List[str]:
        "Process one text `t` with tokenizer `tok`."
        for rule in self.pre_rules: t = rule(t)
        toks = tok.tokenizer(t)
        # post rules?
        return toks

    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:
        'Process a list of `texts` in one process'
        tok = self.tok_func(self.model_prefix)
        return [self.process_text(t,tok) for t in texts]

    def process_all(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts`."
        return self._process_all_1(texts)
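To make the pieces-vs-IDs point concrete, here is a quick check (just a sketch; it assumes the wk103m_example model trained in the notebook above is sitting in the working directory):

# EncodeAsPieces returns sub-word strings; EncodeAsIds returns their integer vocab indices.
sp = spm.SentencePieceProcessor()
sp.load('wk103m_example.model')
print(sp.EncodeAsPieces('= Valkyria Chronicles III ='))  # sub-word pieces, e.g. starting with '▁'
print(sp.EncodeAsIds('= Valkyria Chronicles III ='))     # the corresponding integer ids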
I believe this is how I should use it:
mycust_tok = CustomTokenizer(SPTokenizer, model_prefix)
But I still have the error. Can you help with the full modified code?
itos was wrong also. I updated the example above.
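For reference, the corrected pieces now fit together like this (condensed from the notebook cells above, nothing new beyond what is shown up top):

# Build itos from the .vocab file (one piece per line, tab-separated); list position = token id.
itos = []
with open(f'{model_prefix}.vocab', 'r') as f:
    for line in f:
        itos.append(line.split("\t")[0])

sp_vocab   = Vocab(itos)                                 # fastai Vocab over the SentencePiece pieces
mycust_tok = CustomTokenizer(SPTokenizer, model_prefix)  # wrapper classes from the comment above

data = TextLMDataBunch.from_df(PATH, train_df, valid_df,
                               tokenizer=mycust_tok, vocab=sp_vocab,
                               text_cols='text', label_cols='labels')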
Thank you so much. Sorry I didn't notice the change earlier. It works. Much appreciated.
No problem. I am glad I cleaned it up for my own good!
Any idea why I may be having this error?