Skip to content

Instantly share code, notes, and snippets.

View gaphex's full-sized avatar

Denis gaphex

  • Moscow
View GitHub Profile
# Paths to the pretrained BERT checkpoint (12-layer, 768-hidden, 12-head, uncased)
# as unpacked in a Colab environment.
MODEL_DIR = "uncased_L-12_H-768_A-12"
config_path = "/content/{}/bert_config.json".format(MODEL_DIR)
vocab_path = "/content/{}/vocab.txt".format(MODEL_DIR)

# Build (tags, kwargs) pairs for the TF-Hub module: one graph variant per
# training mode. Indentation below is restored — the scrape flattened it.
tags_and_args = []
for is_training in (True, False):
    tags = set()
    if is_training:
        # TF-Hub convention: the training graph variant carries the "train" tag.
        tags.add("train")
    # NOTE(review): the gist preview is truncated here; the original presumably
    # appends each (tags, kwargs) pair like this — confirm against the full gist.
    tags_and_args.append((tags, dict(is_training=is_training)))
@gaphex
gaphex / read_examples.py
Created November 17, 2019 15:21
Read a list of `InputExample`s from a list of strings.
def read_examples(str_list):
    """Read a list of `InputExample`s from a list of strings.

    NOTE(review): the gist preview is truncated — the original presumably goes
    on to split `line` into text_a/text_b and yield `InputExample` objects with
    a running `unique_id`. Only the visible prefix is restored here (the scrape
    destroyed the indentation); confirm against the full gist before use.
    """
    unique_id = 0  # incremented per yielded example in the full version
    for s in str_list:
        # `convert_to_unicode` comes from the BERT tokenization module.
        line = convert_to_unicode(s)
        if not line:
            # Skip empty / falsy inputs entirely.
            continue
        line = line.strip()
        text_a = None
        text_b = None
@gaphex
gaphex / features_to_arrays.py
Created November 17, 2019 15:25
"""Convert a list of InputFeatures to np.arrays"""
def features_to_arrays(features):
    """Convert a list of InputFeatures to np.arrays.

    Returns a tuple (input_ids, input_mask, segment_ids), each an int32
    np.array of shape (len(features), seq_len).
    """
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    for feature in features:
        all_input_ids.append(feature.input_ids)
        all_input_mask.append(feature.input_mask)
        # NOTE(review): the gist preview cuts off after the line above; the
        # segment-id append and the final np.array conversion are restored from
        # the docstring's stated contract — confirm against the full gist.
        all_segment_ids.append(feature.segment_ids)
    return (np.array(all_input_ids, dtype="int32"),
            np.array(all_input_mask, dtype="int32"),
            np.array(all_segment_ids, dtype="int32"))
@gaphex
gaphex / build_bert_preprocessor.py
Created November 17, 2019 15:29
Build a text preprocessing pipeline for BERT
# Factory: closes over a FullTokenizer and returns a string->arrays converter.
# (Scrape destroyed the indentation; code left byte-identical.)
def build_preprocessor(voc_path, seq_len, lower=True):
"""
Build a text preprocessing pipeline for BERT
Returns a function which converts a list of strings to a list
of three np.arrays with [input_ids, input_mask, segment_ids]
"""
# `FullTokenizer` comes from the BERT tokenization module (not visible here).
tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower)
# NOTE(review): gist preview truncated — the body of `strings_to_arrays` and
# the closing `return strings_to_arrays` are not visible in this chunk.
def strings_to_arrays(sents):
@gaphex
gaphex / bert_layer.py
Last active November 17, 2019 17:45
Keras BERT layer
# Keras layer wrapping a TF-Hub BERT module with partial fine-tuning support.
# (Scrape destroyed the indentation; code left byte-identical.)
class BertLayer(tf.keras.layers.Layer):
def __init__(self, bert_path, seq_len=64, n_tune_layers=3,
pooling="cls", verbose=False,
tune_embeddings=False, **kwargs):
# Number of top transformer layers left trainable during fine-tuning.
self.n_tune_layers = n_tune_layers
# Whether the embedding table is fine-tuned as well.
self.tune_embeddings = tune_embeddings
self.seq_len = seq_len
self.trainable = True
# NOTE(review): gist preview truncated — handling of `bert_path`, `pooling`,
# `verbose` and the super().__init__(**kwargs) call are not visible here.
@gaphex
gaphex / build.py
Created November 17, 2019 17:46
layer build method
# Keras Layer.build: instantiate the hub.Module and pick trainable sub-scopes.
# (Scrape destroyed the indentation; code left byte-identical.)
def build(self, input_shape):
self.bert = hub.Module(self.bert_path, trainable=self.trainable, name=f"{self.name}_module")
trainable_layers = []
# Optionally fine-tune the embedding table.
if self.tune_embeddings:
trainable_layers.append("embeddings")
# The pooler head is only exercised (hence only tuned) under CLS pooling.
if self.pooling == "cls":
trainable_layers.append("pooler")
# NOTE(review): gist preview truncated — selection of the top n_tune_layers
# encoder blocks and the super().build() call are not visible here.
@gaphex
gaphex / build_preprocessor.py
Created November 17, 2019 17:47
build_preprocessor method
def build_preprocessor(self):
    """Fetch vocab path and casing from the hub module, then build the
    string->arrays preprocessor via the module-level `build_preprocessor`.

    Indentation restored — the scrape flattened it; every statement is
    otherwise unchanged.
    """
    sess = tf.keras.backend.get_session()
    # The BERT hub module exposes its tokenization config as a signature.
    tokenization_info = self.bert(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                          tokenization_info["do_lower_case"]])
    # Name lookup resolves to the module-level build_preprocessor factory,
    # not this method (methods are not in scope inside the class body at call time).
    self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case)
@gaphex
gaphex / initialize_module.py
Last active November 25, 2019 18:06
initialize_module method
def initialize_module(self):
    """Run the TF-1.x initializers for any of the hub module's variables
    that the session has not initialized yet.

    Indentation restored — the scrape flattened it.
    """
    sess = tf.keras.backend.get_session()
    # One session.run for all variables at once, rather than one per variable.
    vars_initialized = sess.run([tf.is_variable_initialized(var)
                                 for var in self.bert.variables])
    uninitialized = []
    for var, is_initialized in zip(self.bert.variables, vars_initialized):
        if not is_initialized:
            uninitialized.append(var)
    # NOTE(review): the gist preview is truncated above this point; the method's
    # name implies the collected variables are then initialized, presumably like
    # this — confirm against the full gist.
    if uninitialized:
        sess.run(tf.variables_initializer(uninitialized))
@gaphex
gaphex / call.py
Last active November 27, 2019 11:39
call method
# Keras Layer.call: optionally turn raw strings into BERT input arrays, then
# assemble the module's input dict. (Scrape destroyed the indentation and cut
# the block mid-expression; code left byte-identical.)
def call(self, input):
# When preprocessing is enabled, `input` is a batch of raw strings converted
# by the numpy preprocessor into [input_ids, input_mask, segment_ids].
if self.w_preprocessing:
input = tf.numpy_function(self.preprocessor, [input], [tf.int32, tf.int32, tf.int32])
for feature in input:
# tf.numpy_function loses static shapes; re-pin (batch, seq_len).
feature.set_shape((None, self.seq_len))
input_ids, input_mask, segment_ids = input
# NOTE(review): gist preview truncated — the dict(...) literal below is cut
# off mid-expression; the bert module invocation and pooling are not visible.
bert_inputs = dict(
input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
# Usage example: a binary classifier head on top of the BERT layer.
# Input is a batch of raw strings; the layer preprocesses them itself.
inp = tf.keras.Input(shape=(1,), dtype=tf.string)
# NOTE(review): `do_preprocessing=True` has no matching parameter in the
# __init__ signature visible elsewhere in this file — likely a later revision
# of BertLayer; confirm the constructor accepts it.
encoder = BertLayer(bert_path="./bert-module/", seq_len=48,
tune_embeddings=False, do_preprocessing=True,
pooling='cls', n_tune_layers=3, verbose=False)
# Single sigmoid unit over the pooled BERT encoding -> binary probability.
pred = tf.keras.layers.Dense(1, activation='sigmoid')(encoder(inp))
model = tf.keras.models.Model(inputs=[inp], outputs=[pred])
model.summary()