Code snippets associated with the <BLOG_NAME> blog.
from elasticsearch import Elasticsearch
from pathlib import Path
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel

# Load the custom model
tm = TransformerModel("model", "text_classification")

# Export the model to a TorchScript representation, which Elasticsearch uses
tmp_path = "models"
Path(tmp_path).mkdir(parents=True, exist_ok=True)
model_path, config, vocab_path = tm.save(tmp_path)

# Import the model into Elasticsearch
es = Elasticsearch("ES_CLUSTER_URL", timeout=300)
ptm = PyTorchModel(es, tm.elasticsearch_model_id())
# You can also give the model a custom model ID, e.g.:
# ptm = PyTorchModel(es, "roberta_model")
ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)
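Uploading the model is not enough on its own: a deployment has to be started before the model can serve inference requests. A minimal sketch of that step, assuming the elasticsearch-py 8.x ML client (start_trained_model_deployment and infer_trained_model are client calls, not part of the original snippet):

# Start a deployment so the imported model can serve inference
# (assumes elasticsearch-py 8.x; method names may differ in other client versions)
es.ml.start_trained_model_deployment(model_id=tm.elasticsearch_model_id())

# Quick smoke test: classify a single document with the deployed model
response = es.ml.infer_trained_model(
    model_id=tm.elasticsearch_model_id(),
    docs=[{"text_field": "Example document to classify"}],
)
print(response["inference_results"])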
from datasets import load_dataset
from transformers import RobertaTokenizer

# Load local datasets
data_files = {"train": "data/train.csv", "test": "data/test.csv"}
data = load_dataset("csv", data_files=data_files)

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Save the tokenizer for importing into Elastic later
tokenizer.save_pretrained("roberta_model")

# Tokenizer function
def tokenize_function(examples):
    return tokenizer(
        examples["concat"],
        truncation=True,
        max_length=512,
        padding=True,
    )

# Tokenize data in batches
tokenized_data = data.map(tokenize_function, batched=True)

# Get tokenized train and test data
tokenized_train_data = tokenized_data["train"]
tokenized_test_data = tokenized_data["test"]
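The tokenizer reads from a "concat" column, which the snippet assumes already exists in the CSVs. If your raw data has separate text fields instead, a hypothetical preprocessing step could build it; the column names here are made up for illustration:

# Hypothetical preprocessing: build the "concat" column the tokenizer expects
# by joining separate text fields ("title" and "body" are illustrative names,
# not columns from the original snippet)
def build_concat(example):
    example["concat"] = f'{example["title"]} {example["body"]}'
    return example

data = data.map(build_concat)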
from datasets import load_metric
import numpy as np
from transformers import RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments

# Initialize the model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Define training arguments
training_args = TrainingArguments(output_dir="roberta_model", evaluation_strategy="epoch")

# Define metrics to track
accuracy = load_metric("accuracy")
recall = load_metric("recall")
precision = load_metric("precision")
f1 = load_metric("f1")

# Metric evaluation function; the Trainer expects a single dict of metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    eval_accuracy = accuracy.compute(predictions=predictions, references=labels)
    eval_recall = recall.compute(predictions=predictions, references=labels)
    eval_precision = precision.compute(predictions=predictions, references=labels)
    eval_f1 = f1.compute(predictions=predictions, references=labels)
    return {**eval_accuracy, **eval_recall, **eval_precision, **eval_f1}

# Create the Trainer object, reusing the tokenized splits from the previous snippet
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()
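After training, the fine-tuned weights still have to be written to disk so the first snippet's TransformerModel("model", ...) call can find them. A minimal sketch, assuming the local directory name "model" is chosen to match the path used in the Elasticsearch import snippet:

# Persist the fine-tuned model and tokenizer to the "model" directory,
# which is assumed to match the path passed to TransformerModel above
trainer.save_model("model")
tokenizer.save_pretrained("model")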