transformers_linear_regression
# Check whether the notebook is attached to a GPU (Colab)
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)
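Outside of Colab, a lighter-weight check through PyTorch works just as well. This is a minimal sketch and only assumes `torch` is installed:

```python
import torch

# Report whether PyTorch can see a CUDA device and which one it is
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Not connected to a GPU")
```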
# Tokenize the held-out test set of 3,000 reviews and evaluate the fine-tuned model on it
test_embeddings = tokenizer(test_reviews_3000.reviews.astype(str).tolist(), truncation=True, padding=True, max_length=126)
test_dataset = MakeTorchData(test_embeddings, test_reviews_3000.Rating.astype(float))
bert_trainer.eval_dataset = test_dataset
bert_trainer.evaluate()
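If the raw predicted ratings are needed in addition to the aggregated metrics, `Trainer.predict` returns both. A short sketch, reusing the `bert_trainer` and `test_dataset` objects from the cell above:

```python
# Get per-review predictions (logits) alongside the evaluation metrics
predictions = bert_trainer.predict(test_dataset)
predicted_ratings = predictions.predictions.flatten()

print(predicted_ratings[:5])   # first few predicted ratings
print(predictions.metrics)     # same metric dict as trainer.evaluate()
```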
{'epoch': 5.0,
 'eval_accuracy': 0.6603333333333333,
 'eval_loss': 0.48832669854164124,
 'eval_mae': 0.45958763360977173,
 'eval_mse': 0.4883267283439636,
 'eval_r2': 0.6103774787626888,
 'eval_rmse': 0.6988037824630737,
 'eval_runtime': 13.5094,
 'eval_samples_per_second': 222.068,
 'eval_steps_per_second': 11.103}
import torch, gc, random, datasets
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
%load_ext memory_profiler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np
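The `gc` import and the `memory_profiler` extension suggest memory is being managed manually between runs on a single GPU. A hedged sketch of that pattern; `model` here stands for whichever model object is being discarded, not a name defined in this cell:

```python
# Free cached GPU memory between experiments (useful on Colab's single GPU)
del model                      # drop the reference to the old model
gc.collect()                   # let Python reclaim it
torch.cuda.empty_cache()       # release cached CUDA memory back to the driver
```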
# Make data
X = Data
y = Target

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y, test_size=test_size)

# Call the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode the text
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

class MakeTorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # Continuous rating as a float label for regression
        item["labels"] = float(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Convert our tokenized data into a torch Dataset
train_dataset = MakeTorchData(train_encodings, y_train.ravel())
valid_dataset = MakeTorchData(valid_encodings, y_test.ravel())
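The cell above relies on several names (`Data`, `Target`, `model_name`, `test_size`, `max_length`) that are set elsewhere in the notebook. The values below are only an illustrative sketch, assuming a `reviews` DataFrame shaped like the `test_reviews_3000` one evaluated earlier:

```python
# Illustrative settings only -- the gist does not pin these values down
model_name = "bert-base-uncased"   # any AutoModelForSequenceClassification checkpoint
max_length = 126                    # matches the truncation length used on the test set
test_size  = 0.2
num_epochs = 5

Data   = reviews.reviews.astype(str)            # hypothetical text column
Target = reviews.Rating.astype(float).values    # continuous rating to regress on
```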
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to("cuda")
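Setting `num_labels=1` is what turns the sequence-classification head into a regression head: with a single label, `transformers` infers a regression problem and optimizes mean squared error. A quick sanity check:

```python
# With num_labels=1 the Trainer falls back to an MSE loss, i.e. the
# classification head is reused as a single-output regression head
print(model.config.num_labels)   # 1
print(model.num_parameters())    # rough size of the model being fine-tuned
```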
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)
    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    # Symmetric mean absolute percentage error (SMAPE), reported in percent
    smape = 1 / len(labels) * np.sum(2 * np.abs(logits - labels) / (np.abs(labels) + np.abs(logits)) * 100)
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}
# Specify the arguments for the trainer
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = num_epochs,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 20,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    logging_dir = './logs',
    save_total_limit = 10,
    load_best_model_at_end = True,
    metric_for_best_model = 'rmse',
    greater_is_better = False,      # a lower RMSE means a better checkpoint
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
)

# Call the Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    compute_metrics = compute_metrics_for_regression,
)

# Train the model
trainer.train()

# Evaluate the best checkpoint on the validation set
trainer.evaluate()
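Because `load_best_model_at_end=True`, `trainer.model` already holds the best checkpoint after training. A short sketch for persisting and reloading it; the `./best_model` path is just illustrative:

```python
# Save the best checkpoint and its tokenizer so inference does not require retraining
trainer.save_model("./best_model")
tokenizer.save_pretrained("./best_model")

# Reload later for inference
model = AutoModelForSequenceClassification.from_pretrained("./best_model").to("cuda")
```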
Epoch | Training Loss | Validation Loss | Mse | Rmse | Mae | R2 | Smape
---|---|---|---|---|---|---|---
1 | No log | 0.192932 | 0.192932 | 0.439241 | 0.390277 | -2.478255 | 13.924477
2 | No log | 0.049018 | 0.049018 | 0.221400 | 0.185570 | 0.116285 | 7.139155
3 | No log | 0.083286 | 0.083286 | 0.288593 | 0.219865 | -0.501508 | 8.309401
... | ... | ... | ... | ... | ... | ... | ...
44 | 0.012600 | 0.029716 | 0.029716 | 0.172384 | 0.132232 | 0.464267 | 5.152074
def compute_metrics_for_regression(eval_pred):
    ...
    ...
    # Count a prediction as "correct" when its squared error is below 0.25,
    # i.e. the prediction lands within 0.5 of the true rating
    single_squared_errors = ((logits - labels).flatten() ** 2).tolist()
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "accuracy": accuracy}
Epoch | Training Loss | Validation Loss | Mse | Rmse | Mae | R2 | Accuracy
---|---|---|---|---|---|---|---
1 | No log | 0.507760 | 0.507760 | 0.712573 | 0.491016 | 0.581568 | 0.653206
2 | No log | 0.434158 | 0.434158 | 0.658906 | 0.438609 | 0.642221 | 0.662969
3 | No log | 0.469371 | 0.469371 | 0.685106 | 0.449530 | 0.613203 | 0.669477
4 | No log | 0.440199 | 0.440199 | 0.663475 | 0.432040 | 0.637242 | 0.668738
5 | 0.769400 | 0.447230 | 0.447231 | 0.668753 | 0.439436 | 0.631448 | 0.669107