Code snippets associated with the <BLOG_NAME> blog.
from elasticsearch import Elasticsearch
from pathlib import Path
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel

# Load the custom model
tm = TransformerModel("model", "text_classification")

# Export the model to a TorchScript representation, which Elasticsearch uses
tmp_path = "models"
Path(tmp_path).mkdir(parents=True, exist_ok=True)
model_path, config, vocab_path = tm.save(tmp_path)

# Import the model into Elasticsearch
es = Elasticsearch("ES_CLUSTER_URL", timeout=300)
ptm = PyTorchModel(es, tm.elasticsearch_model_id())
# You can also give the model a custom model ID, e.g.:
# ptm = PyTorchModel(es, "roberta_model")
ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)
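Uploading the model is not enough on its own: a deployment has to be started before the model can serve inference requests. A minimal sketch of that step, assuming the elasticsearch-py 8.x ML client (start_trained_model_deployment and infer_trained_model are client calls, not part of the original snippet):

# Start a deployment so the imported model can serve inference
# (assumes elasticsearch-py 8.x; method names may differ in other client versions)
es.ml.start_trained_model_deployment(model_id=tm.elasticsearch_model_id())

# Quick smoke test: classify a single document with the deployed model
response = es.ml.infer_trained_model(
    model_id=tm.elasticsearch_model_id(),
    docs=[{"text_field": "Example document to classify"}],
)
print(response["inference_results"])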
from datasets import load_dataset
from transformers import RobertaTokenizer

# Load local datasets
data_files = {"train": "data/train.csv", "test": "data/test.csv"}
data = load_dataset("csv", data_files=data_files)

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Save the tokenizer for importing into Elastic later
tokenizer.save_pretrained("roberta_model")

# Tokenizer function
def tokenize_function(examples):
    return tokenizer(
        examples["concat"],
        truncation=True,
        max_length=512,
        padding=True,
    )

# Tokenize data in batches
tokenized_data = data.map(tokenize_function, batched=True)

# Get tokenized train and test data
tokenized_train_data = tokenized_data["train"]
tokenized_test_data = tokenized_data["test"]
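The tokenizer reads from a "concat" column, which the snippet assumes already exists in the CSVs. If your raw data has separate text fields instead, a hypothetical preprocessing step could build it; the column names here are made up for illustration:

# Hypothetical preprocessing: build the "concat" column the tokenizer expects
# by joining separate text fields ("title" and "body" are illustrative names,
# not columns from the original snippet)
def build_concat(example):
    example["concat"] = f'{example["title"]} {example["body"]}'
    return example

data = data.map(build_concat)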
from datasets import load_metric
import numpy as np
from transformers import RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments

# Initialize the model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Define training arguments
training_args = TrainingArguments(output_dir="roberta_model", evaluation_strategy="epoch")

# Define metrics to track
accuracy = load_metric("accuracy")
recall = load_metric("recall")
precision = load_metric("precision")
f1 = load_metric("f1")

# Metric evaluation function; the Trainer expects a single dict of metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    eval_accuracy = accuracy.compute(predictions=predictions, references=labels)
    eval_recall = recall.compute(predictions=predictions, references=labels)
    eval_precision = precision.compute(predictions=predictions, references=labels)
    eval_f1 = f1.compute(predictions=predictions, references=labels)
    return {**eval_accuracy, **eval_recall, **eval_precision, **eval_f1}

# Create the Trainer object, reusing the tokenized splits from the previous snippet
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()
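After training, the fine-tuned weights still have to be written to disk so the first snippet's TransformerModel("model", ...) call can find them. A minimal sketch, assuming the local directory name "model" is chosen to match the path used in the Elasticsearch import snippet:

# Persist the fine-tuned model and tokenizer to the "model" directory,
# which is assumed to match the path passed to TransformerModel above
trainer.save_model("model")
tokenizer.save_pretrained("model")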