Created
November 8, 2023 10:33
-
-
Save davidkyle/8b543bd079789041fa3446d1fe7ab349 to your computer and use it in GitHub Desktop.
Script to install a local copy of the ELSER model in Elasticsearch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from elasticsearch import Elasticsearch | |
from eland.ml.pytorch import PyTorchModel | |
from eland.ml.pytorch.nlp_ml_model import ( | |
TextExpansionInferenceOptions, | |
NlpTrainedModelConfig, | |
TrainedModelInput, | |
NlpBertTokenizationConfig | |
) | |
""" | |
Use this script to install a local copy of the ELSER model in Elasticsearch. | |
The ELSER model and vocabulary files must be downloaded and stored locally.
Point this script at those files by modifying the hard coded `model_path`
and `vocab_path` variables.
Once the path variables and Elasticsearch connection settings have been
updated, run the script: `python3 upload_local_elser.py`
The ELSER v2 model can be downloaded from: | |
Model File - https://ml-models.elastic.co/elser_model_2.pt | |
Vocabulary File - https://ml-models.elastic.co/elser_model_2.vocab.json | |
The platform-specific version optimised for x86 Linux can be downloaded from:
Model File - https://ml-models.elastic.co/elser_model_2_linux-x86_64.pt | |
Vocabulary File - https://ml-models.elastic.co/elser_model_2_linux-x86_64.vocab.json | |
This script requires the Eland package to be installed: https://github.com/elastic/eland | |
""" | |
def get_es_client(
    hosts='http://localhost:9200',
    basic_auth=('elastic-admin', 'elastic-password'),
    request_timeout=300,
    verify_certs=False,
):
    """
    Create an Elasticsearch client, print the cluster info and return the client.

    The default values are placeholders: either edit them for your cluster
    or pass your own values when calling this function.

    Args:
        hosts: URL of the Elasticsearch node to connect to.
        basic_auth: ``(username, password)`` tuple used for basic authentication.
        request_timeout: Value passed to the client as ``request_timeout``;
            the default of 300 is kept from the original script.
        verify_certs: Passed to the client as ``verify_certs``. The default
            ``False`` is kept from the original script; consider enabling
            it when connecting over HTTPS.

    Returns:
        The configured ``Elasticsearch`` client instance.
    """
    es_client = Elasticsearch(
        hosts=hosts,
        basic_auth=basic_auth,
        request_timeout=request_timeout,
        verify_certs=verify_certs,
    )
    # Print the cluster info so the user can see which cluster the script is
    # talking to; presumably a bad host or bad credentials will also surface
    # here rather than part-way through the upload.
    print(es_client.info())
    return es_client
if __name__ == "__main__":
    import os

    # Hard coded locations of the locally downloaded ELSER files; update
    # these before running the script.
    model_path = '/PATH/TO/elser_model_2_XXX.pt'
    vocab_path = '/PATH/TO/elser_model_2.vocab.json'

    # Check the files before talking to the cluster. Otherwise the model
    # config would be created first and the upload would only fail
    # afterwards, leaving a model in Elasticsearch with no vocabulary or
    # definition.
    for required_file in (model_path, vocab_path):
        if not os.path.isfile(required_file):
            raise FileNotFoundError(
                f"ELSER file not found: '{required_file}'. "
                "Update `model_path` and `vocab_path` in this script."
            )

    es = get_es_client()
    # 'elser-local' is the id the model will have in Elasticsearch.
    ptm = PyTorchModel(es, 'elser-local')

    # Tokenization and inference settings stored with the model config.
    tokenization_config = NlpBertTokenizationConfig(
        truncate='first',
        do_lower_case=True,
        with_special_tokens=True,
        max_sequence_length=512,
    )
    inference_config = TextExpansionInferenceOptions(tokenization=tokenization_config)
    model_config = NlpTrainedModelConfig(
        description="ELSER from local upload",
        model_type="pytorch",
        inference_config=inference_config,
        input=TrainedModelInput(
            field_names=["text_field"],
        ),
    )

    # Same order as before: config first, then vocabulary, then the model
    # definition.
    print(f"Creating model with id '{ptm.model_id}'")
    ptm.put_config(config=model_config)
    print("Uploading model vocabulary")
    ptm.put_vocab(vocab_path)
    print("Uploading model definition")
    ptm.put_model(model_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment