@lantiga
Last active September 18, 2022 03:03
🤗 Huggingface Bert on RedisAI
# Trace the Hugging Face BERT QA model to TorchScript so RedisAI can load it.
from transformers import BertForQuestionAnswering
import torch

bert_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(bert_name, torchscript=True)
model.eval()

# Dummy example inputs (input_ids, attention_mask, token_type_ids) used only for tracing.
inputs = [torch.ones(1, 2, dtype=torch.int64),
          torch.ones(1, 2, dtype=torch.int64),
          torch.ones(1, 2, dtype=torch.int64)]

with torch.no_grad():
    traced_model = torch.jit.trace(model, inputs)
    torch.jit.save(traced_model, "traced_bert_qa.pt")
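Before loading the file into RedisAI, it can be worth a quick sanity check that the traced module reloads and runs. This is a minimal sketch that reuses the same dummy inputs as the trace call above; with torchscript=True the traced model returns a plain (start_scores, end_scores) tuple.

# Optional sanity check: reload the traced module and run it on the dummy inputs.
import torch

reloaded = torch.jit.load("traced_bert_qa.pt")
dummy = torch.ones(1, 2, dtype=torch.int64)
with torch.no_grad():
    start_scores, end_scores = reloaded(dummy, dummy, dummy)
print(start_scores.shape, end_scores.shape)  # one start/end score per input token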
# Load the traced model into RedisAI.
import redisai

r = redisai.Client()

model_file = 'traced_bert_qa.pt'
with open(model_file, 'rb') as f:
    model = f.read()

# Redis caps a single protocol argument at 512 MB by default, and the traced
# bert-large blob is larger than that, so split it into ~500 MB chunks.
chunk_size = 500 * 1024 * 1024
model_chunks = [model[i:i + chunk_size] for i in range(0, len(model), chunk_size)]

# NOTE (assumption): depending on the redisai-py version, large blobs may need to be
# sent as the explicit chunks computed above (AI.MODELSET ... BLOB chunk1 chunk2 ...)
# rather than as a single argument.
r.modelset('bert-qa', 'TORCH', 'CPU', model)
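A quick smoke test can confirm that the stored model actually executes inside RedisAI before wiring up the tokenizer. This sketch mirrors the tensorset/modelrun/tensorget calls used in the QA loop below, on the same dummy int64 inputs used for tracing; the dummy_* key names are arbitrary.

# Smoke test: run the stored model on dummy inputs straight from RedisAI.
import numpy as np
import redisai

r = redisai.Client()
dummy = np.ones((1, 2), dtype=np.int64)
r.tensorset('dummy_input_ids', dummy)
r.tensorset('dummy_attention_mask', dummy)
r.tensorset('dummy_token_type_ids', dummy)
r.modelrun('bert-qa', ['dummy_input_ids', 'dummy_attention_mask', 'dummy_token_type_ids'],
           ['dummy_start_scores', 'dummy_end_scores'])
print(r.tensorget('dummy_start_scores').shape)  # expect (1, 2): a start score per token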
# Run question answering against the model stored in RedisAI.
import redisai
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
r = redisai.Client()

text = r"""
At a very high level, one of the most critical steps in any ML pipeline is called AI serving, a task usually performed by an AI inference engine. The AI inference engine is responsible for the model deployment and performance monitoring steps in the figure above, and represents a whole new world that will eventually determine whether applications can use AI technologies to improve operational efficiencies and solve real business problems.
"""

questions = [
    "What is the most critical step in any ML pipeline?",
    "What is AI serving?",
    "What is an AI inference engine?",
]

for question in questions:
    # Tokenize the question/context pair and convert the tensors to numpy for RedisAI.
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs['input_ids'].numpy()
    attention_mask = inputs['attention_mask'].numpy()
    token_type_ids = inputs['token_type_ids'].numpy()

    # Set the input tensors in Redis and run the stored model on them.
    r.tensorset('input_ids', input_ids)
    r.tensorset('attention_mask', attention_mask)
    r.tensorset('token_type_ids', token_type_ids)
    r.modelrun('bert-qa', ['input_ids', 'attention_mask', 'token_type_ids'],
               ['answer_start_scores', 'answer_end_scores'])

    # Fetch the start/end scores back and pick the most likely answer span.
    answer_start_scores = r.tensorget('answer_start_scores')
    answer_end_scores = r.tensorget('answer_end_scores')
    answer_start = np.argmax(answer_start_scores)
    answer_end = np.argmax(answer_end_scores) + 1

    # Decode the answer tokens back to a string.
    input_ids = inputs["input_ids"].tolist()[0]
    output_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    answer = tokenizer.convert_tokens_to_string(output_tokens)

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")