en and pt BBPE tokenizers from Byte-Level-BPE_universal_tokenizer_but.ipynb
# Byte Level BPE (BBPE) tokenizers from Transformers and Tokenizers (Hugging Face libraries)

# 1. Get the pre-trained GPT2 Tokenizer (pre-trained on an English corpus)
from transformers import GPT2TokenizerFast

pretrained_weights = 'gpt2'
tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)
tokenizer_en.pad_token = tokenizer_en.eos_token
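# Illustrative check (addition, not in the original gist): tokenize a short English
# sentence with the pre-trained GPT2 tokenizer; the example sentence is arbitrary.
sample_en = "GPT2 uses a byte-level BPE vocabulary."
print(tokenizer_en.tokenize(sample_en))      # sub-word tokens ('Ġ' marks a leading space)
print(tokenizer_en(sample_en)['input_ids'])  # corresponding token ids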
# 2. Train a Byte Level BPE (BBPE) tokenizer on the Portuguese Wikipedia

# Get the GPT2 tokenizer_en vocab size
ByteLevelBPE_tokenizer_pt_vocab_size = tokenizer_en.vocab_size
ByteLevelBPE_tokenizer_pt_vocab_size

# ByteLevelBPETokenizer represents a byte-level BPE as introduced by OpenAI with their GPT-2 model
from tokenizers import ByteLevelBPETokenizer

ByteLevelBPE_tokenizer_pt = ByteLevelBPETokenizer()

# Get the list of paths to corpus files
# (path_data is assumed to be a pathlib.Path to the data folder, defined earlier in the notebook)
paths = [str(path_data/'all_texts_ptwiki.txt')]

# Customize training with the <|endoftext|> special GPT2 token
ByteLevelBPE_tokenizer_pt.train(files=paths,
                                vocab_size=ByteLevelBPE_tokenizer_pt_vocab_size,
                                min_frequency=2,
                                special_tokens=["<|endoftext|>"])

# Set the maximum sequence length to 1024
ByteLevelBPE_tokenizer_pt.enable_truncation(max_length=1024)

# Save the tokenizer config files (vocab.json and merges.txt)
ByteLevelBPE_tokenizer_pt_rep = 'ByteLevelBPE_tokenizer_pt'
path_to_ByteLevelBPE_tokenizer_pt_rep = path_data/ByteLevelBPE_tokenizer_pt_rep
if not (path_to_ByteLevelBPE_tokenizer_pt_rep).exists():
    path_to_ByteLevelBPE_tokenizer_pt_rep.mkdir(exist_ok=True, parents=True)
ByteLevelBPE_tokenizer_pt.save_model(str(path_to_ByteLevelBPE_tokenizer_pt_rep))
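# Illustrative check (addition, not in the original gist): encode a Portuguese sentence
# with the freshly trained tokenizer; the example sentence is arbitrary.
sample_pt = "Com o objetivo de treinar um tokenizador em português."
encoding_pt = ByteLevelBPE_tokenizer_pt.encode(sample_pt)
print(encoding_pt.tokens)  # byte-level BPE tokens learned from the Portuguese Wikipedia
print(encoding_pt.ids)     # corresponding token ids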
# 3. Import the Portuguese tokenizer config files into the pre-trained GPT2 Tokenizer

# Get the path to the ByteLevelBPE_tokenizer_pt config files
ByteLevelBPE_tokenizer_pt_rep = 'ByteLevelBPE_tokenizer_pt'
path_to_ByteLevelBPE_tokenizer_pt_rep = path_data/ByteLevelBPE_tokenizer_pt_rep

# Load a GPT2TokenizerFast tokenizer from the tokenizer_pt config files
tokenizer_pt = GPT2TokenizerFast.from_pretrained(
    str(path_to_ByteLevelBPE_tokenizer_pt_rep),
    pad_token='<|endoftext|>')

# Set the maximum sequence length to 1024
tokenizer_pt.model_max_length = 1024
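# Illustrative comparison (addition, not in the original gist): the same Portuguese
# sentence typically needs fewer tokens with tokenizer_pt than with the English tokenizer_en.
sample_pt = "Com o objetivo de treinar um tokenizador em português."
print(len(tokenizer_en.tokenize(sample_pt)), tokenizer_en.tokenize(sample_pt))
print(len(tokenizer_pt.tokenize(sample_pt)), tokenizer_pt.tokenize(sample_pt))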