{
  "embeddings": [
    {
      "tensorName": "Intent40",
      "tensorShape": [
        1360,
        768
      ],
      "tensorPath": "https://gist.githubusercontent.com/gaphex/d56f0124a7b7459408d92bbaca373f3e/raw/abb29897ab1015d7e05d9eda90514aeb45181953/embeddings.tsv",
      "metadataPath": "https://gist.githubusercontent.com/gaphex/2793c044577abc73cff7d438a2097ef1/raw/014ab59be93252b9516c0883d21dba6a9b0622ad/metadata.tsv"
    }
  ]
}
# Download the pre-trained BERT-Base (uncased) checkpoint
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

# Download the SNLI and MultiNLI corpora used for triplet mining
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip
!unzip snli_1.0.zip
!wget https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip
!unzip multinli_1.0.zip

# Clone the STS evaluation datasets and the fine-tuning utilities
!git clone https://github.com/brmson/dataset-sts
!git clone https://github.com/gaphex/bert_experimental
import sys
import json
import numpy as np
import tensorflow as tf
from collections import defaultdict

sys.path.append("dataset-sts/pysts")  # assumed location of loader.py inside the cloned repo
from loader import load_sts, load_sick2014
from bert_experimental.finetuning.text_preprocessing import build_preprocessor
from bert_experimental.finetuning.bert_layer import BertLayer
from bert_experimental.finetuning.modeling import BertConfig, BertModel, build_bert_module
BERT_DIR = "/content/uncased_L-12_H-768_A-12/"
MODULE_PATH = "bert_module"  # assumed export directory for the module

# the trailing arguments are truncated in the gist; the checkpoint path
# and export directory are assumed here
build_bert_module(BERT_DIR + "bert_config.json",
                  BERT_DIR + "vocab.txt",
                  BERT_DIR + "bert_model.ckpt",
                  MODULE_PATH)
def load_snli(fpaths):
    # read one or more *.jsonl NLI files into parallel sentence/label lists
    sa, sb, lb = [], [], []
    fpaths = np.atleast_1d(fpaths)
    for fpath in fpaths:
        with open(fpath) as fi:
            for line in fi:
                sample = json.loads(line)
                sa.append(sample['sentence1'])
                sb.append(sample['sentence2'])
                lb.append(sample['gold_label'])
    return sa, sb, lb
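A quick sanity check of the loader on the SNLI dev split (path per the archive unpacked above):

sa, sb, lb = load_snli("./snli_1.0/snli_1.0_dev.jsonl")
print(len(sa))                          # number of premise/hypothesis pairs
print(sa[0], '->', sb[0], '|', lb[0])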
def prepare_snli(sa, sb, lb):
    # group hypotheses by their premise (the anchor sentence)
    classes = {"entailment", "contradiction"}
    anc_to_pairs = defaultdict(list)
    filtered = {}
    skipped = 0
    anchor_id = 0
    for xa, xb, y in zip(sa, sb, lb):
        if y in classes:
            anc_to_pairs[xa].append((xb, y))
    # truncated in the gist; minimal reconstruction: keep anchors having
    # both an entailment (positive) and a contradiction (negative) pair
    for anchor, pairs in anc_to_pairs.items():
        pos = [x for x, y in pairs if y == "entailment"]
        neg = [x for x, y in pairs if y == "contradiction"]
        if pos and neg:
            filtered[anchor_id] = {"anchor": anchor, "pos": pos, "neg": neg}
            anchor_id += 1
        else:
            skipped += 1
    return filtered
train_data = ["./snli_1.0/snli_1.0_train.jsonl", "./multinli_1.0/multinli_1.0_train.jsonl"]
test_data = ["./snli_1.0/snli_1.0_test.jsonl", "./multinli_1.0/multinli_1.0_dev_matched.jsonl"]

tr_a, tr_b, tr_l = load_snli(train_data)
ts_a, ts_b, ts_l = load_snli(test_data)

fd_tr = prepare_snli(tr_a, tr_b, tr_l)
fd_ts = prepare_snli(ts_a, ts_b, ts_l)
class TripletGenerator:
    def __init__(self, datadict, hard_frac=0.2, batch_size=256):
        self.datadict = datadict
        self._anchor_idx = np.array(list(self.datadict.keys()))
        self._hard_frac = hard_frac
        self._generator = self.generate_batch(batch_size)

    def generate_batch(self, size):
        # truncated in the gist; minimal sketch: sample `size` anchors and draw
        # one positive and one negative sentence for each (hard-negative
        # mining governed by hard_frac is omitted here)
        while True:
            idx = np.random.choice(self._anchor_idx, size)
            anc = [self.datadict[i]["anchor"] for i in idx]
            pos = [np.random.choice(self.datadict[i]["pos"]) for i in idx]
            neg = [np.random.choice(self.datadict[i]["neg"]) for i in idx]
            batch = [np.array(anc), np.array(pos), np.array(neg)]
            yield [b.reshape(-1, 1) for b in batch], np.zeros(size)
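With the dictionary produced by prepare_snli, the generator yields string triplets ready for the three-input model below; a usage sketch consistent with the reconstruction above:

triplet_gen = TripletGenerator(fd_tr, hard_frac=0.2, batch_size=256)
(anc, pos, neg), dummy_y = next(triplet_gen._generator)
print(anc.shape, pos.shape, neg.shape)  # (256, 1) each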
def build_model(module_path, seq_len=24, tune_lr=6, loss=softmax_loss):
    inp_anc = tf.keras.Input(shape=(1,), dtype=tf.string)
    inp_pos = tf.keras.Input(shape=(1,), dtype=tf.string)
    inp_neg = tf.keras.Input(shape=(1,), dtype=tf.string)
    # a single shared BERT encoder embeds all three inputs
    sent_encoder = BertLayer(module_path, seq_len, n_tune_layers=tune_lr, do_preprocessing=True,
                             verbose=False, pooling="mean", trainable=True, tune_embeddings=False)
    anc_enc = sent_encoder(inp_anc)
    pos_enc = sent_encoder(inp_pos)
    neg_enc = sent_encoder(inp_neg)
    # truncated in the gist; the loss is computed directly on the embeddings
    out = tf.keras.layers.Lambda(loss)([anc_enc, pos_enc, neg_enc])
    return tf.keras.Model(inputs=[inp_anc, inp_pos, inp_neg], outputs=out)
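Because the Lambda layer already outputs the loss, the model can be compiled with an identity-style objective that simply averages the layer's output. A hedged training sketch (the optimizer, learning rate, and step counts are assumptions, not the gist's settings):

model = build_model(MODULE_PATH, seq_len=24, tune_lr=6)
model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
              loss=lambda y_true, y_pred: tf.reduce_mean(y_pred))
model.fit_generator(triplet_gen._generator, steps_per_epoch=1000, epochs=5)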
def softmax_loss(vectors):
    # softmax-margin triplet loss over in-batch negatives: penalizes anchors
    # whose log-sum-exp similarity to the negatives exceeds their similarity
    # to the positive
    anc, pos, neg = vectors
    pos_sim = tf.reduce_sum((anc * pos), axis=-1, keepdims=True)
    neg_mul = tf.matmul(anc, neg, transpose_b=True)
    neg_sim = tf.log(tf.reduce_sum(tf.exp(neg_mul), axis=-1, keepdims=True))  # tf.math.log in TF 2.x
    loss = tf.nn.relu(neg_sim - pos_sim)
    return loss
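In effect this is a smoothed triplet objective: for each anchor a with positive p and in-batch negatives n_i, the loss is max(0, log sum_i exp(a . n_i) - a . p), so it vanishes once the positive outranks the soft maximum over all negatives. A tiny numpy check of that behavior:

a = np.array([[1., 0.]]); p = np.array([[1., 0.]]); n = np.array([[0., 1.]])
pos_sim = (a * p).sum(-1)                   # 1.0
neg_sim = np.log(np.exp(a @ n.T).sum(-1))   # 0.0
print(np.maximum(neg_sim - pos_sim, 0.))    # [0.] -> positive already outranks the negative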
from tensorflow.keras.callbacks import Callback

class PearsonrRankCallback(Callback):
    # evaluates sentence similarity against an STS-style benchmark and tracks
    # Pearson correlation; the on_epoch_end hook is truncated in the gist
    def __init__(self, loader, filepaths, name=None, verbose=False,
                 sim_model=None, savemodel=None, savepath=None):
        super().__init__()
        self.savemodel = savemodel
        self.savepath = savepath
        self.sim_model = sim_model
        self.loader = loader
        self.verbose = verbose
        self.name = name
        self.filepaths = np.atleast_1d(filepaths)
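A hypothetical wiring of the callback into training, using load_sts from dataset-sts; the file path below is illustrative only and depends on how the repo lays out its STS data:

sts_cb = PearsonrRankCallback(load_sts,
                              ["dataset-sts/data/sts/semeval-sts/all/2015.train.tsv"],  # hypothetical path
                              name="STS", verbose=True)
model.fit_generator(triplet_gen._generator, steps_per_epoch=1000,
                    epochs=5, callbacks=[sts_cb])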