Run BERT to extract features of a sentence
# adapted from
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/extract_features.py
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, tokens, input_ids, input_mask, input_type_ids):
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids


def convert_examples_to_features(sentences, seq_length, tokenizer, silent=True):
    """Converts a list of raw sentences into a list of `InputFeatures`."""
    features = []
    for (ex_index, example) in enumerate(sentences):
        tokens_a = tokenizer.tokenize(example)

        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > seq_length - 2:
            tokens_a = tokens_a[0:(seq_length - 2)]
        # The convention in BERT is:
        # (a) For sequence pairs:
        #   tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #   type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #   tokens:   [CLS] the dog is hairy . [SEP]
        #   type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        if ex_index < 5 and not silent:
            print("*** Example ***")
            print("tokens: %s" % " ".join([str(x) for x in tokens]))
            print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            print("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

        features.append(
            InputFeatures(
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    return features
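

# Illustrative helper, a sketch rather than part of the feature-extraction
# pipeline above: it prints what convert_examples_to_features produces for one
# short sentence. It is only defined here; call it explicitly with the
# tokenizer loaded further down (the default sentence and seq_length are
# arbitrary choices for the demonstration).
def show_feature_example(tokenizer, sentence="the dog is hairy .", seq_length=16):
    feat = convert_examples_to_features([sentence], seq_length, tokenizer)[0]
    print("tokens:          %s" % " ".join(feat.tokens))   # [CLS] the dog is hairy . [SEP]
    print("input_ids:       %s" % feat.input_ids)          # wordpiece ids, zero-padded to seq_length
    print("input_mask:      %s" % feat.input_mask)         # 1 for real tokens, 0 for padding
    print("input_type_ids:  %s" % feat.input_type_ids)     # all 0 for a single sentence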


# only need to pass in a list of sentences;
# `tokenizer`, `model`, and `gpu_id` are module-level globals defined below
def bert_encode(sentences, max_seq_length=128, is_cuda=False):
    features = convert_examples_to_features(
        sentences=sentences, seq_length=max_seq_length, tokenizer=tokenizer)

    if is_cuda:
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).cuda(gpu_id)
        input_masks = torch.tensor([f.input_mask for f in features], dtype=torch.long).cuda(gpu_id)
    else:
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        input_masks = torch.tensor([f.input_mask for f in features], dtype=torch.long)

    # with output_all_encoded_layers=False, pytorch-pretrained-BERT returns
    # (last_encoder_layer, pooled_output); we only keep the last encoder layer
    final_encoder_layers, _ = model(input_ids, token_type_ids=None,
                                    attention_mask=input_masks,
                                    output_all_encoded_layers=False)
    # the [CLS] position
    return final_encoder_layers[:, 0].data
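

# A minimal batching sketch, assuming the whole corpus may be too large for a
# single forward pass: split the sentences into mini-batches, encode each with
# bert_encode, and concatenate the per-batch [CLS] vectors. The batch size of
# 32 is an assumed default that mirrors the corpora[:32] call at the bottom.
def bert_encode_batched(sentences, batch_size=32, max_seq_length=128, is_cuda=False):
    chunks = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        chunks.append(bert_encode(batch, max_seq_length=max_seq_length, is_cuda=is_cuda))
    # one (num_sentences, hidden_size) tensor of [CLS] features
    return torch.cat(chunks, dim=0)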


import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification

# Load the pre-trained tokenizer (vocabulary) and the pre-trained model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

gpu_id = 0  # assumed CUDA device index; used by model.to() here and .cuda() in bert_encode
model.to(gpu_id)

# `corpora` is assumed to be a list of raw sentence strings
bert_encode(corpora[:32], is_cuda=True)
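
# Quick sanity check, a sketch assuming a CUDA device is available (as the
# call above already does): each sentence maps to a 768-dim [CLS] vector for
# bert-base-uncased, and cosine similarity gives a rough feel for the features.
example_vectors = bert_encode(["the dog is hairy .", "is this jacksonville ?"], is_cuda=True)
print(example_vectors.shape)  # expected: torch.Size([2, 768])
similarity = torch.nn.functional.cosine_similarity(
    example_vectors[0].unsqueeze(0), example_vectors[1].unsqueeze(0))
print("cosine similarity between the two [CLS] vectors: %.4f" % similarity.item())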