Skip to content

Instantly share code, notes, and snippets.

@howard-haowen
Last active January 9, 2021 02:12
Show Gist options
  • Save howard-haowen/aff4313e3f44872d4ca539b863e9c263 to your computer and use it in GitHub Desktop.
Save howard-haowen/aff4313e3f44872d4ca539b863e9c263 to your computer and use it in GitHub Desktop.
Load a pretrained model from Hugging Face
!pip install transformers
# Taken from https://huggingface.co/transformers/model_doc/auto.html
#=====Ways to initiate a tokenizer and embedding model=====
# NOTE: each call below rebinds the same name (`tokenizer` / `model`); the
# statements are alternatives shown side by side, so only the last binding
# of each name survives if the whole section is executed top to bottom.
### Three ways to initiate a tokenizer ###
from transformers import AutoTokenizer
# From an official model hosted on Hugging Face (just the model name)
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
# From a community model hosted on Hugging Face (model name prefixed with an additional "user/" namespace)
tokenizer = AutoTokenizer.from_pretrained('voidful/albert_chinese_small')
# From a local directory (e.g. the tokenizer was saved using `save_pretrained('./test/bert_saved_model/')`)
# NOTE(review): this relative path is illustrative; presumably it must exist before this line can succeed.
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
### Three ways to initiate a model ###
from transformers import AutoModel, AutoConfig
# Download model and configuration from S3 and cache.
model = AutoModel.from_pretrained('bert-base-chinese')
# Update configuration during loading: extra keyword arguments override config fields
model = AutoModel.from_pretrained('bert-base-chinese', output_attentions=True)
# After this call, model.config.output_attentions should be True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
# The configuration is read from a JSON file and passed in explicitly together with from_tf=True.
# NOTE(review): both './tf_model/...' paths are illustrative local files — confirm they exist before running.
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
# Taken from https://clay-atlas.com/blog/2020/06/30/pytorch-如何使用-hugging-face-所提供的-transformers-以-bert-為例/
#=====Specific example for Chinese using PyTorch=====
# coding: utf-8
import torch
from transformers import AutoTokenizer, AutoModel
from keras.preprocessing.sequence import pad_sequences
# Tokenizer and BERT model, both loaded from the same pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
embedding = AutoModel.from_pretrained('bert-base-chinese')
# Preprocess: encode the sentence into token ids, then right-pad to a fixed length of 10
sent = '今天天氣真 Good。'
sent_token = tokenizer.encode(sent)
sent_token_padding = pad_sequences([sent_token], maxlen=10, padding='post', dtype='int')
# Attention mask: 1.0 for real tokens, 0.0 for padding positions
# (relies on pad_sequences filling with 0, its default padding value)
masks = [[float(value > 0) for value in values] for values in sent_token_padding]
# print('sent:', sent) >>> 今天天氣真 Good。
# print('sent_token:', sent_token) >>> [101, 791, 1921, 1921, 3706, 4696, 100, 511, 102]
# print('sent_token_padding:', sent_token_padding) >>> [[ 101 791 1921 1921 3706 4696 100 511 102 0]]
# print('masks:', masks) >>> [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]]
# Convert
# Force int64 ids so the embedding lookup does not depend on the platform's
# default width for NumPy 'int'.
inputs = torch.tensor(sent_token_padding, dtype=torch.long)
masks = torch.tensor(masks)
# Index the output rather than tuple-unpacking it: since transformers v4 the
# model returns a dict-like ModelOutput by default, and unpacking that yields
# its keys (strings) instead of tensors. Integer indexing works for both the
# legacy tuple return and ModelOutput; element 0 is the last hidden state.
embedded = embedding(inputs, attention_mask=masks)[0]
# print('embedded shape:', embedded.shape) >>> torch.Size([1, 10, 768])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment