Comparing ALBERT TF1 and HUB
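This script runs the same input through two ALBERT implementations: the TF1 `modeling.AlbertModel` from the google-research/albert repository, and the corresponding TF Hub module (built here as `https://tfhub.dev/google/albert_large/2`). Both are initialized from the same checkpoint, and the maximum absolute difference between their pooled and sequence outputs is printed; values near zero indicate the two code paths agree.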
import tensorflow_hub as hub
import tensorflow as tf
import modeling
import numpy as np
import tokenization

# Model size and paths
model_size = "large"
version = 2
vocab_path = "path_to_vocab/30k-clean.model"
config_path = "path_to_config/config.json"
albert_path = "path_to_albert/albert"

# Init tokenizer
tftok = tokenization.FullTokenizer(vocab_path, spm_model_file=vocab_path)

# Create inputs
input_sentence = "this is nice".lower()
tf_input_ids_init = [tftok.convert_tokens_to_ids(tftok.tokenize(input_sentence))]
input_mask = [[1] * len(tf_input_ids_init[0])]
segment_ids = [[0] * len(tf_input_ids_init[0])]

tf_input_ids = tf.constant(tf_input_ids_init)
tf_input_mask = tf.constant(input_mask)
tf_segment_ids = tf.constant(segment_ids)
tf_dict = {"input_ids": tf_input_ids, "input_mask": tf_input_mask, "token_type_ids": tf_segment_ids}

# Position of the token whose MLM logits will be fetched from the HUB module
mlm_index = 2
tf_mlm_positions = tf.constant([[mlm_index]])

# Load the config and model
albert_config = modeling.AlbertConfig.from_json_file(config_path)
albert_module_modeling = modeling.AlbertModel(albert_config, is_training=False, **tf_dict)

# Init the model from the saved HUB checkpoint
tvars = tf.trainable_variables()
(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, albert_path)
tf.train.init_from_checkpoint(albert_path, assignment_map)
assert len(tvars) == len(assignment_map)

# Instantiate a HUB module with the right size
model = hub.Module("https://tfhub.dev/google/albert_{}/{}".format(model_size, version), trainable=False)
albert_inputs = dict(input_ids=tf_input_ids, input_mask=tf_input_mask, segment_ids=tf_segment_ids)

# Get the model outputs
albert_outputs = model(albert_inputs, signature="tokens", as_dict=True)
albert_mlm_outputs = model({**albert_inputs, "mlm_positions": tf_mlm_positions}, signature="mlm", as_dict=True)["mlm_logits"]
pooled_output = albert_outputs["pooled_output"]
sequence_output = albert_outputs["sequence_output"]

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    # Using the HUB module
    hub_pooled_output = sess.run(pooled_output)
    hub_sequence_output = sess.run(sequence_output)
    hub_mlm_outputs = sess.run(albert_mlm_outputs)

    # Using the TF1 implementation
    tf_embedding = sess.run(albert_module_modeling.get_embedding_output())
    tf_pooled = sess.run(albert_module_modeling.get_pooled_output())
    tf_sequence = sess.run(albert_module_modeling.get_sequence_output())

# Calculate the maximum absolute difference between two tensors
def difference_between_tensors(tf_tensor, tf_tensor_2):
    tf_np = np.array(tf_tensor)
    tf_np_2 = np.array(tf_tensor_2)
    return np.max(np.abs(tf_np - tf_np_2))

print("\nComparing the HUB and TF1 layers")
print("-- pooled ", difference_between_tensors(hub_pooled_output, tf_pooled))
print("-- full transformer ", difference_between_tensors(hub_sequence_output, tf_sequence))