Use pytorch-transformers from Hugging Face to get BERT embeddings in PyTorch
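Note: this gist targets the `pytorch_transformers` package (the July 2019 predecessor of the `transformers` library), so it assumes something like `pip install pytorch-transformers` has been run. The `BertEmbedder` below pads or truncates every input to `max_seq_len` tokens and returns the last-layer hidden state for each token.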
import logging
from typing import List

import torch
from pytorch_transformers import BertModel, BertTokenizer

CLS_TOKEN = "[CLS]"
SEP_TOKEN = "[SEP]"

logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Must only be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids


def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 cls_token_at_end=False, pad_on_left=False,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=1, pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """Loads a data file into a list of `InputFeatures`.

    `cls_token_at_end` defines the location of the CLS token:
        - False (default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated with the CLS token
    (0 for BERT, 2 for XLNet).
    """
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0    0    0    0      0     0   0   1  1  1   1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0    0   0   0    0   0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens_a + [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if tokens_b:
            tokens += tokens_b + [sep_token]
            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids))
    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def select_field(features, field):
    """Since each feature's `choices_features` is a list of dicts, return the relevant field.

    Note: this helper expects multiple-choice style features that expose a
    `choices_features` attribute; it is not used by `BertEmbedder` below.
    """
    return [[choice[field] for choice in feature.choices_features] for feature in features]


def create_examples(_list, set_type="train"):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(_list):
        guid = "%s-%s" % (set_type, i)
        text_a = line
        # text_b = line[1]
        examples.append(
            InputExample(guid=guid, text_a=text_a))
    return examples


class BertEmbedder:
    """Wraps a pretrained tokenizer/model and returns BERT's last-layer hidden states."""

    def __init__(self,
                 pretrained_weights='bert-base-uncased',
                 tokenizer_class=BertTokenizer,
                 model_class=BertModel,
                 max_seq_len=20):
        super().__init__()
        self.pretrained_weights = pretrained_weights
        self.tokenizer_class = tokenizer_class
        self.model_class = model_class
        self.tokenizer = self.tokenizer_class.from_pretrained(pretrained_weights)
        self.model = self.model_class.from_pretrained(pretrained_weights)
        self.model.eval()  # feature extraction only: make sure dropout is disabled
        self.max_seq_len = max_seq_len
        # tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
        # model = BertModel.from_pretrained(pretrained_weights)

    def get_bert_embeddings(self,
                            raw_text: List[str]) -> torch.Tensor:
        examples = create_examples(raw_text)
        # BERT uses the [CLS]-first pattern, so keep the default cls_token_at_end=False.
        features = convert_examples_to_features(
            examples, self.tokenizer, self.max_seq_len)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        # Model outputs are tuples; the first element is the last layer's hidden states.
        # Pass the attention mask and segment ids so padding positions are not attended to.
        with torch.no_grad():
            last_hidden_states = self.model(all_input_ids,
                                            attention_mask=all_input_mask,
                                            token_type_ids=all_segment_ids)[0]
        print(last_hidden_states.size())
        return last_hidden_states


if __name__ == "__main__":
    embedder = BertEmbedder()
    # [CLS]/[SEP] are added automatically by convert_examples_to_features,
    # so the raw strings should not include them.
    raw_text = ["This is first element continuing statement",
                "second element of the list."]
    bert_embedding = embedder.get_bert_embeddings(raw_text)
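    # For these two inputs with max_seq_len=20 and bert-base-uncased (hidden size 768),
    # the printed size should be torch.Size([2, 20, 768]), i.e.
    # (batch_size, max_seq_len, hidden_size). If a single vector per sentence is
    # wanted, one common choice is the hidden state of the [CLS] token at position 0,
    # e.g. cls_embedding = bert_embedding[:, 0, :]  # shape: (2, 768)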