Last active
January 9, 2021 02:12
-
-
Save howard-haowen/aff4313e3f44872d4ca539b863e9c263 to your computer and use it in GitHub Desktop.
Load a pretrained model from Hugging Face
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install transformers | |
# Taken from https://huggingface.co/transformers/model_doc/auto.html | |
#=====Ways to initiate a tokenizer and embedding model===== | |
### Three ways to initiate a tokenizer ### | |
from transformers import AutoTokenizer | |
# From official models hosted on HuggingFace (just the model name) | |
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese') | |
# From comunity models hosted on HuggingFace (with additonal "path/") | |
tokenizer = AutoTokenizer.from_pretrained('voidful/albert_chinese_small') | |
# From a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) | |
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') | |
### Three ways to initiate a model ###
# The calls below are independent alternatives; each one rebinds `model`.
from transformers import AutoModel, AutoConfig

# Download model and configuration from the Hugging Face model hub and cache.
model = AutoModel.from_pretrained('bert-base-chinese')
# Update configuration during loading
model = AutoModel.from_pretrained('bert-base-chinese', output_attentions=True)
# check if model.config.output_attentions is True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
# AutoConfig is loaded via `from_pretrained`, which accepts a path to a saved
# configuration JSON file (`from_json_file` belongs to the concrete config classes).
config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
# Taken from https://clay-atlas.com/blog/2020/06/30/pytorch-如何使用-hugging-face-所提供的-transformers-以-bert-為例/
#=====Specific example for Chinese using PyTorch=====
# coding: utf-8
import torch
from transformers import AutoTokenizer, AutoModel
from keras.preprocessing.sequence import pad_sequences

# Tokenizer and Bert Model
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
embedding = AutoModel.from_pretrained('bert-base-chinese')

# Preprocess
sent = '今天天氣真 Good。'
# encode() returns the token ids including the special start/end tokens (101 / 102 below).
sent_token = tokenizer.encode(sent)
# Pad on the right with zeros up to a fixed length of 10 ids.
sent_token_padding = pad_sequences([sent_token], maxlen=10, padding='post', dtype='int')
# Attention mask: 1.0 for real tokens, 0.0 for padding (padding id is 0 here).
masks = [[float(value > 0) for value in values] for values in sent_token_padding]
# print('sent:', sent) >>> 今天天氣真 Good。
# print('sent_token:', sent_token) >>> [101, 791, 1921, 1921, 3706, 4696, 100, 511, 102]
# print('sent_token_padding:', sent_token_padding) >>> [[ 101 791 1921 1921 3706 4696 100 511 102 0]]
# print('masks:', masks) >>> [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]]

# Convert
# Force int64 ids: the width of dtype='int' is platform-dependent, and the
# embedding lookup expects integer (long) indices.
inputs = torch.tensor(sent_token_padding, dtype=torch.long)
masks = torch.tensor(masks)
# Index the result instead of tuple-unpacking it: recent transformers versions
# return a dict-like ModelOutput, and unpacking that yields its key names rather
# than tensors. Integer indexing works for both the tuple and ModelOutput forms.
outputs = embedding(inputs, attention_mask=masks)
embedded = outputs[0]  # last hidden state
# print('embedded shape:', embedded.shape) >>> torch.Size([1, 10, 768])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment