# Setup:
#   pip install simpletransformers
#   pip uninstall torch   # then run the latest install command from pytorch.org
#
# Environment variables:
#   $ PORCH_N=100_000 PORCH_CUDA=True python bmark_distil.py
##########
# Part 1 #
##########
# Load the train/test CSVs (no header row; column 0 is the raw label,
# column 1 is the document text) and reshape them into the two-column
# (text, label) frame that simpletransformers expects.
import pandas as pd

prefix = 'data/'

train_df = pd.read_csv(prefix + 'train.csv', header=None)
eval_df = pd.read_csv(prefix + 'test.csv', header=None)

# Binarize the label: raw label 2 -> class 1, everything else -> class 0.
# NOTE(review): assumes the raw label column is {1, 2} — confirm against data.
train_df[0] = (train_df[0] == 2).astype(int)
eval_df[0] = (eval_df[0] == 2).astype(int)

# Collapse embedded newlines so each example is a single line of text.
train_df = pd.DataFrame({
    'text': train_df[1].replace(r'\n', ' ', regex=True),
    'label': train_df[0],
})
eval_df = pd.DataFrame({
    'text': eval_df[1].replace(r'\n', ' ', regex=True),
    'label': eval_df[0],
})
##########
# Part 2 #
##########
from simpletransformers.classification import ClassificationArgs, ClassificationModel
import os

# Avoid tokenizer deadlock warnings when dataloader workers fork.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_args = ClassificationArgs(
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    overwrite_output_dir=True,
)

# BUG FIX: the old `bool(os.environ.get('PORCH_CUDA', False))` was True for
# ANY non-empty value, so PORCH_CUDA=False (or 0) still enabled CUDA.
# Parse the variable's text explicitly instead.
use_cuda = os.environ.get('PORCH_CUDA', '').strip().lower() in ('1', 'true', 'yes')
n = int(os.environ.get('PORCH_N', '10_000'))
print(f"{n=} {use_cuda=}")

model = ClassificationModel(
    "distilbert", "distilbert-base-uncased",
    use_cuda=use_cuda,
    args=model_args,
)
#model = ClassificationModel('roberta', 'roberta-base', args=model_args)

# Train the model on a random subsample of n rows (train_df from Part 1).
model.train_model(train_df.sample(n=n))

# Evaluate the model
#result, model_outputs, wrong_predictions = model.eval_model(eval_df)
# CPU: 1.35 it/s using 6 cores @ 2.6 GHz | |
# n=10_000: CUDA 3.28 it/s; 768 CUDA cores @ 1.5 GHz | |
# a = core-cycles / sec | |
# b = iter / sec | |
# | |
# want: core-cycles / iter | |
# cores * clock_rate / (iter / sec) | |
# CPU: 11.6 giga core-cycles / iter | |
# GPU: 351 giga core-cycles / iter |