import csv
import logging
import math

import numpy as np
import optuna
import pandas as pd
import transformers  # only referenced by the commented-out optimizer search below
from torch.utils.data import DataLoader

from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
evaluation_steps = 1000
base_save_dir = '/srv/data/nlp/sentence_transformers'
model_name = 'dbmdz/bert-base-german-uncased'
study_name = 'all_nli_de_08'

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
def callback(score, epoch, steps):
    # invoked by model.fit after every evaluation; prune the trial as soon
    # as the evaluation score degenerates to NaN
    print('callback:', score, epoch, steps)
    if math.isnan(score):
        raise optuna.exceptions.TrialPruned()
def train(trial, i):
    # sample the hyperparameters for this trial
    train_batch_size = trial.suggest_int('train_batch_size', 16, 60)
    num_epochs = trial.suggest_int('num_epochs', 1, 5)
    lr = trial.suggest_uniform('lr', 2e-6, 2e-4)  # default: 2e-5
    eps = trial.suggest_uniform('eps', 1e-7, 1e-5)  # default: 1e-6
    weight_decay = trial.suggest_uniform('weight_decay', 0.001, 0.1)  # default: 0.01
    warmup_steps_mul = trial.suggest_uniform('warmup_steps_mul', 0.1, 0.5)

    model_save_path = f'{base_save_dir}/{study_name}_t{trial.number:02d}_i{i}'
    label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

    # create model
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    # read MNLI (German translation)
    mnli_df = pd.read_csv('./mnli/mnli_all_en_de.csv')
    mnli_df.drop(mnli_df[mnli_df['gold_label'] == '-'].index, inplace=True)  # drop rows without a gold label
    mnli_df.dropna(inplace=True)
    s1_de = mnli_df['sentence1_de'].tolist()
    s2_de = mnli_df['sentence2_de'].tolist()
    label = mnli_df['gold_label'].tolist()

    # read and add SNLI (German translation)
    snli_df = pd.read_csv('./snli/snli_all_en_de.csv')
    snli_df.drop(snli_df[snli_df['gold_label'] == '-'].index, inplace=True)
    snli_df.dropna(inplace=True)
    s1_de.extend(snli_df['sentence1_de'].tolist())
    s2_de.extend(snli_df['sentence2_de'].tolist())
    label.extend(snli_df['gold_label'].tolist())

    assert len(s1_de) == len(s2_de) == len(label)
    train_samples = []
    for _s1_de, _s2_de, _label in zip(s1_de, s2_de, label):
        label_id = label2int[_label]
        assert isinstance(_s1_de, str) and len(_s1_de) > 0
        assert isinstance(_s2_de, str) and len(_s2_de) > 0
        assert isinstance(label_id, int)
        train_samples.append(InputExample(texts=[_s1_de, _s2_de], label=label_id))

    train_dataset = SentencesDataset(train_samples, model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model,
                                    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=len(label2int))
    # use the German STS benchmark dev and test splits as the dev set
    stsb_dev = pd.read_csv('./data/stsbenchmark/de/sts_dev_de.csv', sep='\t',
                           quoting=csv.QUOTE_NONE, names=['label', 's1', 's2'])
    s1 = stsb_dev['s1'].tolist()
    s2 = stsb_dev['s2'].tolist()
    label = stsb_dev['label'].tolist()

    stsb_test = pd.read_csv('./data/stsbenchmark/de/sts_test_de.csv', sep='\t',
                            quoting=csv.QUOTE_NONE, names=['label', 's1', 's2'])
    s1.extend(stsb_test['s1'].tolist())
    s2.extend(stsb_test['s2'].tolist())
    label.extend(stsb_test['label'].tolist())

    dev_samples = []
    for _s1, _s2, _label in zip(s1, s2, label):
        score = _label / 5.0  # normalize the 0..5 similarity score to 0..1
        assert isinstance(_s1, str) and len(_s1) > 0
        assert isinstance(_s2, str) and len(_s2) > 0
        assert isinstance(score, float) and 0.0 <= score <= 1.0
        dev_samples.append(InputExample(texts=[_s1, _s2], label=score))
    assert len(dev_samples) == 1500 + 1379  # sizes of the dev and test splits
    dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        dev_samples,
        batch_size=train_batch_size,
        name='sts-dev',
        main_similarity=SimilarityFunction.COSINE,
    )

    # warm up over a trial-specific fraction of the total training steps
    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * warmup_steps_mul)  # 0.1
    logging.info("Warmup-steps: {}".format(warmup_steps))
    # optimizer_class = None
    # optimizer_class_str = trial.suggest_categorical('optimizer_class', ['AdamW', 'Adafactor'])
    # if optimizer_class_str == 'Adafactor':
    #     optimizer_class = transformers.optimization.Adafactor
    # elif optimizer_class_str == 'AdamW':
    #     optimizer_class = transformers.optimization.AdamW
    # else:
    #     assert False
    # train the model
    scheduler = trial.suggest_categorical('scheduler',
                                          ['WarmupLinear', 'warmupcosine', 'warmupcosinewithhardrestarts'])
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=dev_evaluator,
              epochs=num_epochs,
              scheduler=scheduler,
              # optimizer_class=optimizer_class,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path,
              optimizer_params={'lr': lr, 'eps': eps, 'correct_bias': False},
              weight_decay=weight_decay,
              callback=callback,
              )
    # best dev score observed during fit (stored on the model by model.fit)
    best_score = model.best_score
    print(best_score)
    return best_score
def objective(trial):
    try:
        # train three times with the same hyperparameters and average the
        # best dev scores to reduce run-to-run variance
        results = []
        for i in range(3):
            result = train(trial, i)
            results.append(result)
        trial.set_user_attr('results', str(results))
        mean_result = np.mean(results)
        trial.set_user_attr('mean_result', str(mean_result))
        return mean_result
    except Exception as e:
        # record the failure on the trial and report a worst-case score
        # instead of crashing the whole study
        trial.set_user_attr('exception', str(e))
        print(e)
        return 0
study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///optuna.db',
    load_if_exists=True,
    direction='maximize',
)
study.optimize(objective)  # no n_trials limit: runs until interrupted
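
# Follow-up sketch (hypothetical, kept commented out because the blocking
# optimize() call above never returns on its own): since the study is persisted
# to sqlite, it can be inspected from a separate process while the search runs,
# using only the study_name and storage URL defined above.
#
# loaded = optuna.load_study(study_name=study_name, storage='sqlite:///optuna.db')
# print('best value:', loaded.best_value)
# print('best params:', loaded.best_params)
# print('per-trial means:', [t.user_attrs.get('mean_result') for t in loaded.trials])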