howard-haowen · January 9, 2021 02:12
diff --git a/HuggingFace_models.py b/HuggingFace_models.py
 !pip install transformers 

 # Taken from https://huggingface.co/transformers/model_doc/auto.html
 #=====Ways to initiate a tokenizer and embedding model=====
 ### Three ways to initiate a tokenizer ###
 from transformers import AutoTokenizer

 # From official models hosted on HuggingFace (just the model name)
 tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

 # From comunity models hosted on HuggingFace (with additonal "path/")
 tokenizer = AutoTokenizer.from_pretrained('voidful/albert_chinese_small')

 # From a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
 tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')

 ### Three ways to instantiate a model ###
 from transformers import AutoModel, AutoConfig

 # 1. Download the model and its configuration, then cache them.
 model = AutoModel.from_pretrained('bert-base-chinese')

 # 2. Update the configuration while loading: extra keyword arguments such as
 #    `output_attentions` are applied to the loaded configuration.
 model = AutoModel.from_pretrained('bert-base-chinese', output_attentions=True)
 # model.config.output_attentions should now be True

 # 3. Load from a TF checkpoint file instead of a PyTorch model (slower).
 # AutoConfig is a factory class and has no `from_json_file`; a path to a
 # saved configuration JSON file is loaded with `AutoConfig.from_pretrained`.
 config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
 model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

 # Taken from https://clay-atlas.com/blog/2020/06/30/pytorch-如何使用-hugging-face-所提供的-transformers-以-bert-為例/
 #=====Specific example for Chinese using PyTorch=====
 # coding: utf-8
 import torch
 from transformers import AutoTokenizer, AutoModel
 from keras.preprocessing.sequence import pad_sequences

 # Tokenizer and BERT model (the model is used here to produce token embeddings)
 tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
 embedding = AutoModel.from_pretrained('bert-base-chinese')

 # Preprocess: encode the sentence to token ids, pad the ids with 0 up to
 # length 10, and build an attention mask that is 1.0 for every non-zero id
 # and 0.0 for padding.
 sent = '今天天氣真 Good。'
 sent_token = tokenizer.encode(sent)
 sent_token_padding = pad_sequences([sent_token], maxlen=10, padding='post', dtype='int')
 masks = [[float(value>0) for value in values] for values in sent_token_padding]

 # print('sent:', sent) >>> 今天天氣真 Good。
 # print('sent_token:', sent_token) >>> [101, 791, 1921, 1921, 3706, 4696, 100, 511, 102]
 # print('sent_token_padding:', sent_token_padding) >>> [[ 101 791 1921 1921 3706 4696 100 511 102 0]]
 # print('masks:', masks) >>> [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]]

 # Convert to tensors and run the model
 inputs = torch.tensor(sent_token_padding)
 masks = torch.tensor(masks)
 # Index the output instead of tuple-unpacking it: since transformers v4 the
 # model returns a dict-like ModelOutput, and unpacking that yields its string
 # keys rather than tensors. Integer indexing works for both tuple and
 # ModelOutput returns; element 0 is the per-token hidden states.
 outputs = embedding(inputs, attention_mask=masks)
 embedded = outputs[0]
 # print('embedded shape:', embedded.shape) >>> torch.Size([1, 10, 768])
	!pip install transformers

	# Taken from https://huggingface.co/transformers/model_doc/auto.html
	#=====Ways to initiate a tokenizer and embedding model=====
	### Three ways to initiate a tokenizer ###
	from transformers import AutoTokenizer

	# From official models hosted on HuggingFace (just the model name)
	tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

	# From comunity models hosted on HuggingFace (with additonal "path/")
	tokenizer = AutoTokenizer.from_pretrained('voidful/albert_chinese_small')

	# From a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
	tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')

	### Three ways to instantiate a model ###
	from transformers import AutoModel, AutoConfig

	# 1. Download the model and its configuration, then cache them.
	model = AutoModel.from_pretrained('bert-base-chinese')

	# 2. Update the configuration while loading: extra keyword arguments such as
	#    `output_attentions` are applied to the loaded configuration.
	model = AutoModel.from_pretrained('bert-base-chinese', output_attentions=True)
	# model.config.output_attentions should now be True

	# 3. Load from a TF checkpoint file instead of a PyTorch model (slower).
	# AutoConfig is a factory class and has no `from_json_file`; a path to a
	# saved configuration JSON file is loaded with `AutoConfig.from_pretrained`.
	config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
	model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

	# Taken from https://clay-atlas.com/blog/2020/06/30/pytorch-如何使用-hugging-face-所提供的-transformers-以-bert-為例/
	#=====Specific example for Chinese using PyTorch=====
	# coding: utf-8
	import torch
	from transformers import AutoTokenizer, AutoModel
	from keras.preprocessing.sequence import pad_sequences

	# Tokenizer and BERT model (the model is used here to produce token embeddings)
	tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
	embedding = AutoModel.from_pretrained('bert-base-chinese')

	# Preprocess: encode the sentence to token ids, pad the ids with 0 up to
	# length 10, and build an attention mask that is 1.0 for every non-zero id
	# and 0.0 for padding.
	sent = '今天天氣真 Good。'
	sent_token = tokenizer.encode(sent)
	sent_token_padding = pad_sequences([sent_token], maxlen=10, padding='post', dtype='int')
	masks = [[float(value>0) for value in values] for values in sent_token_padding]

	# print('sent:', sent) >>> 今天天氣真 Good。
	# print('sent_token:', sent_token) >>> [101, 791, 1921, 1921, 3706, 4696, 100, 511, 102]
	# print('sent_token_padding:', sent_token_padding) >>> [[ 101 791 1921 1921 3706 4696 100 511 102 0]]
	# print('masks:', masks) >>> [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]]

	# Convert to tensors and run the model
	inputs = torch.tensor(sent_token_padding)
	masks = torch.tensor(masks)
	# Index the output instead of tuple-unpacking it: since transformers v4 the
	# model returns a dict-like ModelOutput, and unpacking that yields its string
	# keys rather than tensors. Integer indexing works for both tuple and
	# ModelOutput returns; element 0 is the per-token hidden states.
	outputs = embedding(inputs, attention_mask=masks)
	embedded = outputs[0]
	# print('embedded shape:', embedded.shape) >>> torch.Size([1, 10, 768])
No results found