Created
January 9, 2021 02:18
-
-
Save howard-haowen/73352d602b86e35a8edf2499b8483ba4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Taken from https://clay-atlas.com/blog/2020/06/30/pytorch-如何使用-hugging-face-所提供的-transformers-以-bert-為例/
# coding: utf-8
#
# Minimal walkthrough: load a pretrained tokenizer/model pair by name, split a
# sentence into tokens, map the tokens to vocabulary ids, and map the ids back
# to tokens.
import torch
from transformers import AutoTokenizer, AutoModel
from keras.preprocessing.sequence import pad_sequences

# Tokenizer and Bert Model
# MODEL_NAME is a placeholder and must be filled in before running.
MODEL_NAME = ''  # e.g. 'bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
embedding = AutoModel.from_pretrained(MODEL_NAME)

# Preprocess
SENT = ''  # e.g. '今天天氣真 Good。'
# The special tokens are added by hand as literal '[CLS]' / '[SEP]' strings
# around the tokenizer's output, then the whole list is converted to ids.
sent_token = ['[CLS]'] + tokenizer.tokenize(SENT) + ['[SEP]']
sent_token_encode = tokenizer.convert_tokens_to_ids(sent_token)
# Round-trip the ids back to tokens to show what the vocabulary kept.
sent_token_decode = tokenizer.convert_ids_to_tokens(sent_token_encode)

# Output (as recorded by the original author; presumably produced with the
# example values shown in the "e.g." comments above):
# sent: 今天天氣真 Good。
# sent_token: ['[CLS]', '今', '天', '天', '氣', '真', '[UNK]', '。', '[SEP]']
# encode: [101, 791, 1921, 1921, 3706, 4696, 100, 511, 102]
# decode: ['[CLS]', '今', '天', '天', '氣', '真', '[UNK]', '。', '[SEP]']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment