# Gist by @lijiansong, created April 14, 2021.
# Fine-tuning BERT (bert-for-tf2) for IMDB movie-review sentiment classification.
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
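# Assumed dependencies for the imports above: the bert-for-tf2 package
# ("pip install bert-for-tf2"), plus tensorflow 2.x, pandas, numpy, and tqdm.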
## Load Data
from tensorflow import keras
import os
import re
# Load all files from a directory into a DataFrame.
def load_directory_data(directory):
    data = {}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in tqdm(os.listdir(directory), desc=os.path.basename(directory)):
        with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match(r"\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)
# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    dataset = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True)
    train_df = load_dataset(os.path.join(os.path.dirname(dataset),
                                         "aclImdb", "train"))
    test_df = load_dataset(os.path.join(os.path.dirname(dataset),
                                        "aclImdb", "test"))
    return train_df, test_df
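
# Optional sanity check (a minimal sketch, commented out so the class below stays the
# single place that triggers the download): load the two splits once and confirm the
# expected "sentence"/"sentiment"/"polarity" columns are present.
# train_df, test_df = download_and_load_datasets()
# print(train_df.columns.tolist(), train_df.shape, test_df.shape)
# print(train_df["polarity"].value_counts())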
class MovieReviewData:
    DATA_COLUMN = "sentence"
    LABEL_COLUMN = "polarity"

    def __init__(self, tokenizer: FullTokenizer, sample_size=None, max_seq_len=1024):
        self.tokenizer = tokenizer
        self.sample_size = sample_size
        self.max_seq_len = 0
        train, test = download_and_load_datasets()
        # Sort by review length so that head(sample_size) keeps the shortest reviews.
        train, test = map(lambda df: df.reindex(df[MovieReviewData.DATA_COLUMN].str.len().sort_values().index),
                          [train, test])
        if sample_size is not None:
            assert sample_size % 128 == 0
            train, test = train.head(sample_size), test.head(sample_size)
            # train, test = map(lambda df: df.sample(sample_size), [train, test])
        ((self.train_x, self.train_y),
         (self.test_x, self.test_y)) = map(self._prepare, [train, test])
        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        ((self.train_x, self.train_x_token_types),
         (self.test_x, self.test_x_token_types)) = map(self._pad,
                                                       [self.train_x, self.test_x])

    def _prepare(self, df):
        x, y = [], []
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                text, label = row[MovieReviewData.DATA_COLUMN], row[MovieReviewData.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(int(label))
                pbar.update()
        # dtype=object keeps the ragged (unpadded) token-id lists; padding happens in _pad.
        return np.array(x, dtype=object), np.array(y)

    def _pad(self, ids):
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            # Truncate to max_seq_len - 2, then zero-pad up to max_seq_len.
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)
bert_model_name = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = os.path.join(".model/", bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
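
# The paths above assume the Google BERT base (uncased) checkpoint has already been
# unpacked into .model/uncased_L-12_H-768_A-12/. One possible way to fetch it
# (a sketch; the URL is the public bert_models bucket from the google-research/bert README):
# keras.utils.get_file("uncased_L-12_H-768_A-12.zip",
#                      "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip",
#                      extract=True, cache_dir=".model", cache_subdir="")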
# Prepare the data
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
data = MovieReviewData(tokenizer,
                       sample_size=10 * 128 * 2,  # 2560 reviews per split
                       max_seq_len=128)
print(" train_x:", data.train_x.shape)
print("train_x_token_types:", data.train_x_token_types.shape)
print(" train_y:", data.train_y.shape)
print(" test_x:", data.test_x.shape)
print(" max_seq_len:", data.max_seq_len)
def flatten_layers(root_layer):
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for layer in root_layer._layers:
        for sub_layer in flatten_layers(layer):
            yield sub_layer
def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.
    """
    for layer in flatten_layers(l_bert):
        if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
    l_bert.embeddings_layer.trainable = False
def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):

    def lr_scheduler(epoch):
        if epoch < warmup_epoch_count:
            # Linear warm-up from max_learn_rate/warmup_epoch_count up to max_learn_rate.
            res = (max_learn_rate / warmup_epoch_count) * (epoch + 1)
        else:
            # Exponential decay from max_learn_rate towards end_learn_rate.
            res = max_learn_rate * math.exp(
                math.log(end_learn_rate / max_learn_rate)
                * (epoch - warmup_epoch_count + 1) / (total_epoch_count - warmup_epoch_count + 1))
        return float(res)

    learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)
    return learning_rate_scheduler
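
# Optional sanity check of the schedule shape (a minimal sketch): the Keras callback
# keeps the wrapped function as `.schedule`, so the planned per-epoch learning rates
# can be inspected without training anything.
# sched = create_learning_rate_scheduler(max_learn_rate=1e-5, end_learn_rate=1e-7,
#                                        warmup_epoch_count=20, total_epoch_count=50)
# print([sched.schedule(epoch) for epoch in (0, 9, 19, 20, 34, 49)])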
def create_model(max_seq_len, adapter_size=64):
    """Creates a classification model."""

    # adapter_size = 64  # see - arXiv:1902.00751

    # create the bert layer
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
        bert_params = map_stock_config_to_params(bc)
        bert_params.adapter_size = adapter_size
        bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    # token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="token_type_ids")
    # output = bert([input_ids, token_type_ids])
    output = bert(input_ids)

    print("bert shape", output.shape)
    # Take the [CLS] token representation as the sequence embedding.
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    # No softmax here: the loss below is configured with from_logits=True.
    logits = keras.layers.Dense(units=2)(logits)

    # model = keras.Model(inputs=[input_ids, token_type_ids], outputs=logits)
    # model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    # load the pre-trained model weights
    load_stock_weights(bert, bert_ckpt_file)

    # freeze weights if adapter-BERT is used
    if adapter_size is not None:
        freeze_bert_layers(bert)

    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    model.summary()
    return model
adapter_size = None # use None to fine-tune all of BERT
model = create_model(data.max_seq_len, adapter_size=adapter_size)
log_dir = ".log/movie_reviews/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
total_epoch_count = 50
# model.fit(x=(data.train_x, data.train_x_token_types), y=data.train_y,
model.fit(x=data.train_x, y=data.train_y,
          validation_split=0.1,
          batch_size=48,
          shuffle=True,
          epochs=total_epoch_count,
          callbacks=[create_learning_rate_scheduler(max_learn_rate=1e-5,
                                                    end_learn_rate=1e-7,
                                                    warmup_epoch_count=20,
                                                    total_epoch_count=total_epoch_count),
                     keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
                     tensorboard_callback])
#model.save_weights('./movie_reviews.h5', overwrite=True)
_, train_acc = model.evaluate(data.train_x, data.train_y)
print("train acc", train_acc)