Skip to content

Instantly share code, notes, and snippets.

@PhilipMay
Created September 7, 2020 14:01
Show Gist options
  • Save PhilipMay/2e42eeb7174cf0a122036a26ab38ceba to your computer and use it in GitHub Desktop.
import logging
from pathlib import Path
import torch
import random
import pandas as pd
from sklearn.metrics import f1_score
from farm.data_handler.data_silo import DataSilo, DataSiloForCrossVal
from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer, EarlyStopping
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
from farm.infer import Inferencer
# --- Run configuration -------------------------------------------------
# Alternative checkpoints kept for quick switching between local copies
# and Hugging Face hub names (BERT vs. ELECTRA German models).
#lang_model = "./models/dbmdz-bert-base-german-uncased"
lang_model = "bert-base-german-dbmdz-uncased"
#lang_model = "./models/german-nlp-group-electra-base-german-uncased"
#lang_model = "german-nlp-group/electra-base-german-uncased"
# Directory expected to contain the GermEval-18 train/test TSV files
# (read later as test.tsv with a "coarse_label" column).
data_dir = "./data/germeval18"
# Metric name passed to the FARM processor; macro F1 suits the
# imbalanced OTHER/OFFENSE label distribution.
metric_name = "f1_macro"
save_dir = Path("./saved_models/electra-bert-test")
# Fix all RNG seeds for reproducibility before any model/data work.
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 2
batch_size = 32
# No automatic mixed precision (None disables AMP in initialize_optimizer).
use_amp = None
# Tokenizer must match the chosen language model checkpoint.
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)
# Binary coarse-grained GermEval-18 label set.
label_list = ["OTHER", "OFFENSE"]
# Build the FARM processor that turns raw TSV rows into model-ready
# tensors, then wrap it in a DataSilo that owns the train/dev/test
# DataLoaders for the configured batch size.
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=64,  # short tweets; 64 tokens is plenty
    data_dir=Path(data_dir),
    label_list=label_list,
    label_column_name="coarse_label",
    metric=metric_name,
)
data_silo = DataSilo(processor=processor, batch_size=batch_size)
# --- Model assembly ----------------------------------------------------
# Pretrained encoder + a single sequence-classification head, combined
# into a FARM AdaptiveModel, followed by optimizer/LR-schedule creation.
language_model = LanguageModel.load(lang_model)

# Class weights counteract the OTHER/OFFENSE imbalance in the training set.
class_weights = data_silo.calculate_class_weights(task_name="text_classification")
prediction_head = TextClassificationHead(
    class_weights=class_weights,
    num_labels=len(label_list),
)

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.2,          # dropout on the encoder output embeddings
    lm_output_types=["per_sequence"], # one prediction per input sequence
    device=device,
)

model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=0.5e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),  # steps per epoch for the schedule
    n_epochs=n_epochs,
    use_amp=use_amp,
)
# --- Training ----------------------------------------------------------
# Periodic evaluation and early stopping are deliberately disabled
# (their kwargs are left commented out below).
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    #evaluate_every=evaluate_every,
    device=device,
    #early_stopping=earlystopping,
)
# Run the full training loop (return value intentionally ignored).
trainer.train()
# Persist the trained model weights and the processor config so the
# Inferencer below can reload both from save_dir.
# save model
model.save(save_dir)
processor.save(save_dir)
# Free GPU memory before reloading the model for inference:
# move weights to CPU, release cached CUDA allocations, drop references.
# just to be sure
model.cpu()
torch.cuda.empty_cache()
del model
del processor
# Pause so GPU memory usage can be inspected manually before inference.
input("Press enter to continue...")
# --- Evaluation on the GermEval-18 test set ----------------------------
# Reload the saved model through FARM's Inferencer and score macro F1
# against the gold coarse labels.
model = Inferencer.load(save_dir,
    num_processes=4,
    batch_size=batch_size,
    gpu=True,
)
# Reuse data_dir instead of re-hard-coding the path with string concat,
# so changing data_dir in one place affects both training and evaluation.
df = pd.read_csv(Path(data_dir) / "test.tsv", sep="\t")
text = df['text'].tolist()
label = df['coarse_label'].tolist()
# Explicit validation instead of `assert` (asserts vanish under `python -O`).
if len(text) != len(label):
    raise ValueError(
        f"test set inconsistent: {len(text)} texts vs {len(label)} labels"
    )
# FARM expects a list of {'text': ...} dicts for inference.
unlabeled_text_dict = [{'text': t} for t in text]
result = model.inference_from_dicts(dicts=unlabeled_text_dict)
# Flatten the nested result ([{'predictions': [...]}, ...]) into parallel
# lists of predicted labels and their probabilities.
predicted_label_list = [
    inner['label'] for outer in result for inner in outer['predictions']
]
predicted_probability_list = [
    inner['probability'] for outer in result for inner in outer['predictions']
]
print(predicted_label_list[:10])
print(label[:10])
f1_macro = f1_score(label, predicted_label_list, average='macro')
print("f1_macro", f1_macro)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment