# all_nli_de.py, forked from PhilipMay/all_nli_de.py (Gist by Leo-Lee15, 2024-03-21)
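#
# Optuna hyperparameter search for a German sentence-embedding model:
# fine-tune dbmdz/bert-base-german-uncased with SoftmaxLoss on German
# MultiNLI + SNLI pairs and score each trial on the German STS benchmark.
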
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
import os
import gzip
import pandas as pd
import csv
import numpy as np
import optuna
import transformers

evaluation_steps = 1000
base_save_dir = '/srv/data/nlp/sentence_transformers'
model_name = 'dbmdz/bert-base-german-uncased'
study_name = 'all_nli_de_08'

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
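
# model.fit() invokes this callback after every evaluation with
# (score, epoch, steps); a NaN score means training diverged,
# so the trial is pruned early.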
def callback(value, a, b):
    print('callback:', value, a, b)
    if math.isnan(value):
        raise optuna.exceptions.TrialPruned()


def train(trial, i):
    # Hyperparameter search space; the trailing comments are the fit() defaults.
    train_batch_size = trial.suggest_int('train_batch_size', 16, 60)
    num_epochs = trial.suggest_int('num_epochs', 1, 5)
    lr = trial.suggest_uniform('lr', 2e-6, 2e-4)  # 2e-5
    eps = trial.suggest_uniform('eps', 1e-7, 1e-5)  # 1e-6
    weight_decay = trial.suggest_uniform('weight_decay', 0.001, 0.1)  # 0.01
    warmup_steps_mul = trial.suggest_uniform('warmup_steps_mul', 0.1, 0.5)

    model_save_path = f'{base_save_dir}/{study_name}_t{trial.number:02d}_i{i}'
    label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

    # Create the model: German BERT encoder followed by a pooling layer.
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
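    # Mean pooling averages BERT's token embeddings into one fixed-size
    # sentence vector; CLS and max pooling are explicitly disabled.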

    # Read the German MultiNLI pairs and drop unlabeled ('-') rows.
    mnli_df = pd.read_csv('./mnli/mnli_all_en_de.csv')
    mnli_df.drop(mnli_df[mnli_df['gold_label'] == '-'].index, inplace=True)
    mnli_df.dropna(inplace=True)
    s1_de = mnli_df['sentence1_de'].tolist()
    s2_de = mnli_df['sentence2_de'].tolist()
    label = mnli_df['gold_label'].tolist()

    # Read and append the SNLI pairs.
    snli_df = pd.read_csv('./snli/snli_all_en_de.csv')
    snli_df.drop(snli_df[snli_df['gold_label'] == '-'].index, inplace=True)
    snli_df.dropna(inplace=True)
    s1_de.extend(snli_df['sentence1_de'].tolist())
    s2_de.extend(snli_df['sentence2_de'].tolist())
    label.extend(snli_df['gold_label'].tolist())
    assert len(s1_de) == len(s2_de) == len(label)

    # Build training InputExamples, sanity-checking every pair.
    train_samples = []
    for _s1_de, _s2_de, _label in zip(s1_de, s2_de, label):
        label_id = label2int[_label]
        assert type(_s1_de) == str
        assert len(_s1_de) > 0
        assert type(_s2_de) == str
        assert len(_s2_de) > 0
        assert type(label_id) == int
        train_samples.append(InputExample(texts=[_s1_de, _s2_de], label=label_id))
    train_dataset = SentencesDataset(train_samples, model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model,
                                    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=len(label2int))
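    # SoftmaxLoss puts a 3-way classifier on top of (u, v, |u - v|), the
    # SBERT training objective for NLI data.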

    # Development data: German STS benchmark dev and test splits combined.
    stsb_dev = pd.read_csv('./data/stsbenchmark/de/sts_dev_de.csv', sep='\t', quoting=csv.QUOTE_NONE, names=['label', 's1', 's2'])
    s1 = stsb_dev['s1'].tolist()
    s2 = stsb_dev['s2'].tolist()
    label = stsb_dev['label'].tolist()
    stsb_test = pd.read_csv('./data/stsbenchmark/de/sts_test_de.csv', sep='\t', quoting=csv.QUOTE_NONE, names=['label', 's1', 's2'])
    s1.extend(stsb_test['s1'].tolist())
    s2.extend(stsb_test['s2'].tolist())
    label.extend(stsb_test['label'].tolist())

    dev_samples = []
    for _s1, _s2, _label in zip(s1, s2, label):
        score = _label / 5.0  # rescale the gold score from [0, 5] to [0, 1]
        assert type(_s1) == str
        assert len(_s1) > 0
        assert type(_s2) == str
        assert len(_s2) > 0
        assert type(score) == float
        assert score >= 0.0
        assert score <= 1.0
        dev_samples.append(InputExample(texts=[_s1, _s2], label=score))
    assert len(dev_samples) == 1500 + 1379  # dev split + test split sizes

    dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        dev_samples,
        batch_size=train_batch_size,
        name='sts-dev',
        main_similarity=SimilarityFunction.COSINE,
    )
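    # The evaluator's score is the Spearman correlation between the cosine
    # similarities of the embeddings and the gold STS scores.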

    # Warm up the learning rate over a tuned fraction of all training steps.
    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * warmup_steps_mul)  # 0.1
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # optimizer_class = None
    # optimizer_class_str = trial.suggest_categorical('optimizer_class', ['AdamW', 'Adafactor'])
    # if optimizer_class_str == 'Adafactor':
    #     optimizer_class = transformers.optimization.Adafactor
    # elif optimizer_class_str == 'AdamW':
    #     optimizer_class = transformers.optimization.AdamW
    # else:
    #     assert False
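    # Note: fit() optimizes with transformers.AdamW by default, so the
    # AdamW-vs-Adafactor search above stayed disabled.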

    # Train the model; the scheduler type is part of the search space too.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=dev_evaluator,
              epochs=num_epochs,
              scheduler=trial.suggest_categorical('scheduler', ['WarmupLinear', 'warmupcosine', 'warmupcosinewithhardrestarts']),
              # optimizer_class=optimizer_class,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path,
              optimizer_params={'lr': lr, 'eps': eps, 'correct_bias': False},
              weight_decay=weight_decay,
              callback=callback,
              )
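
    # model.best_score holds the best dev-evaluator score observed during
    # fit() (sentence-transformers v1/v2 API).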
    best_score = model.best_score
    print(best_score)
    return best_score


def objective(trial):
    try:
        # Train three times with the same hyperparameters and average the best
        # scores to reduce variance from random initialization and shuffling.
        results = []
        for i in range(3):
            result = train(trial, i)
            results.append(result)
        trial.set_user_attr('results', str(results))
        mean_result = np.mean(results)
        trial.set_user_attr('mean_result', str(mean_result))
        return mean_result
    except Exception as e:
        # Record the failure on the trial and report a score of 0 instead of
        # crashing the whole study.
        trial.set_user_attr('exception', str(e))
        print(e)
        return 0


study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///optuna.db',
    load_if_exists=True,
    direction='maximize',
)
study.optimize(objective)
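
# A minimal sketch for inspecting the finished study later, assuming the
# same optuna.db file and study name:
#
#   import optuna
#   study = optuna.load_study(study_name='all_nli_de_08', storage='sqlite:///optuna.db')
#   print(study.best_trial.params)
#   print(study.best_trial.user_attrs)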