Tokenize text and print encoded + decoded wordpiece tokens.
from transformers import AutoTokenizer

# define sample text
text = "Rødgrød med fløde."

# init tokenizer
model_id = "Maltehb/danish-bert-botxo"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# encode text
encoding = tokenizer(text)
print(encoding)  # print raw encoding
# Output:
# {
#   'input_ids': [2, 2132, 29310, 61, 10726, 771, 3],
#   'token_type_ids': [0, 0, 0, 0, 0, 0, 0],
#   'attention_mask': [1, 1, 1, 1, 1, 1, 1]
# }

tokens = encoding.tokens()
print(tokens)  # print decoded wordpiece tokens
# Output:
# ['[CLS]', 'rød', '##grød', 'med', 'fløde', '.', '[SEP]']
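For completeness, the ids can also be mapped back to a readable string with tokenizer.decode, a standard transformers method not used in the original snippet; this is a small sketch, and the exact decoded string depends on the tokenizer version.

# sketch: decode the ids back to text, dropping [CLS]/[SEP]
decoded_text = tokenizer.decode(encoding["input_ids"], skip_special_tokens=True)
print(decoded_text)  # should roughly reconstruct the original sentence (lowercased by this tokenizer)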