Created
September 7, 2020 14:01
-
-
Save PhilipMay/2e42eeb7174cf0a122036a26ab38ceba to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
from pathlib import Path | |
import torch | |
import random | |
import pandas as pd | |
from sklearn.metrics import f1_score | |
from farm.data_handler.data_silo import DataSilo, DataSiloForCrossVal | |
from farm.data_handler.processor import TextClassificationProcessor | |
from farm.modeling.optimization import initialize_optimizer | |
from farm.modeling.adaptive_model import AdaptiveModel | |
from farm.modeling.language_model import LanguageModel | |
from farm.modeling.prediction_head import TextClassificationHead | |
from farm.modeling.tokenization import Tokenizer | |
from farm.train import Trainer, EarlyStopping | |
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings | |
from farm.infer import Inferencer | |
# --- Configuration -------------------------------------------------------
# Alternative checkpoints kept for quick A/B switching between local
# copies and hub models; uncomment exactly one `lang_model`.
#lang_model = "./models/dbmdz-bert-base-german-uncased"
lang_model = "bert-base-german-dbmdz-uncased"
#lang_model = "./models/german-nlp-group-electra-base-german-uncased"
#lang_model = "german-nlp-group/electra-base-german-uncased"

data_dir = "./data/germeval18"  # GermEval 2018 task data (train/test TSVs)
metric_name = "f1_macro"
save_dir = Path("./saved_models/electra-bert-test")

# Reproducibility and hardware selection.
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)

# Training hyperparameters.
n_epochs = 2
batch_size = 32
use_amp = None  # automatic mixed precision disabled
# --- Data pipeline -------------------------------------------------------
# The tokenizer must match the language-model checkpoint chosen above.
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

# GermEval 2018 coarse-grained binary classification task.
label_list = ["OTHER", "OFFENSE"]

processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=64,
    data_dir=Path(data_dir),
    label_list=label_list,
    metric=metric_name,
    label_column_name="coarse_label",
)

# Wraps the processor's datasets into PyTorch DataLoaders.
data_silo = DataSilo(processor=processor, batch_size=batch_size)
# --- Model ---------------------------------------------------------------
language_model = LanguageModel.load(lang_model)

# Class weights counter the label imbalance of the training split.
prediction_head = TextClassificationHead(
    class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
    num_labels=len(label_list),
)

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.2,
    lm_output_types=["per_sequence"],  # one prediction per input sequence
    device=device,
)

model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=0.5e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    use_amp=use_amp,
)
# --- Training ------------------------------------------------------------
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    #evaluate_every=evaluate_every,
    device=device,
    #early_stopping=earlystopping,
)
trainer.train()

# Persist weights and processor config together so the Inferencer can
# reload both from `save_dir`.
model.save(save_dir)
processor.save(save_dir)

# Release GPU memory before reloading the model for inference
# ("just to be sure").
model.cpu()
torch.cuda.empty_cache()
del model
del processor

input("Press enter to continue...")
# --- Evaluation ----------------------------------------------------------
# Reload the saved model as an Inferencer and score it on the held-out
# GermEval 2018 test split.
model = Inferencer.load(
    save_dir,
    num_processes=4,
    batch_size=batch_size,
    gpu=True,
)

# Reuse the `data_dir` constant instead of repeating the literal path
# (was: "./data/germeval18" + "/test.tsv").
df = pd.read_csv(Path(data_dir) / "test.tsv", sep="\t")
text = df['text'].tolist()
label = df['coarse_label'].tolist()

# Explicit check instead of `assert`, which is stripped under `python -O`.
if len(text) != len(label):
    raise ValueError(
        f"test set is inconsistent: {len(text)} texts vs {len(label)} labels"
    )

# The Inferencer expects a list of dicts with a 'text' key.
unlabeled_text_dict = [{'text': t} for t in text]
result = model.inference_from_dicts(dicts=unlabeled_text_dict)

# Flatten the per-batch prediction structure into parallel lists.
# Single pass over `result` on purpose: iterating it twice would yield
# nothing if it is a generator.
predicted_label_list = []
predicted_probability_list = []
for batch in result:
    for pred in batch['predictions']:
        predicted_label_list.append(pred['label'])
        predicted_probability_list.append(pred['probability'])

print(predicted_label_list[:10])
print(label[:10])

# Macro-averaged F1 matches the `metric_name` used during training.
f1_macro = f1_score(label, predicted_label_list, average='macro')
print("f1_macro", f1_macro)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment