Benchmark the question-answering pipeline for RoBERTa: Hugging Face Transformers vs. Optimum ONNX Runtime
from pathlib import Path
from time import perf_counter

import numpy as np
from optimum.onnxruntime import ORTModelForQuestionAnswering
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx")
task = "question-answering"

tokenizer = AutoTokenizer.from_pretrained(model_id)

context = "Hello, my name is Philipp and I live in Nuremberg, Germany. Currently I am working as a Technical Lead at Hugging Face to democratize artificial intelligence through open source and open science. In the past I designed and implemented cloud-native machine learning architectures for fin-tech and insurance companies. I found my passion for cloud concepts and machine learning 5 years ago. Since then I never stopped learning. Currently, I am focusing myself in the area NLP and how to leverage models like BERT, Roberta, T5, ViT, and GPT2 to generate business value."
question = "As what is Philipp working?"
# load the vanilla Transformers model and build a PyTorch QA pipeline
model_orig = AutoModelForQuestionAnswering.from_pretrained(model_id)
torch_qa = pipeline(
    task, model=model_orig, tokenizer=tokenizer, handle_impossible_answer=True
)

# export the same checkpoint to ONNX and build an ONNX Runtime QA pipeline
model = ORTModelForQuestionAnswering.from_pretrained(model_id, from_transformers=True)
optimum_qa = pipeline(
    task, model=model, tokenizer=tokenizer, handle_impossible_answer=True
)
def measure_latency(pipe):
    latencies = []
    # warm-up runs (not timed)
    for _ in range(10):
        _ = pipe(question=question, context=context)
    # timed runs
    for _ in range(1000):
        start_time = perf_counter()
        _ = pipe(question=question, context=context)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # compute run statistics in milliseconds
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    return f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}"


print(f"Torch model {measure_latency(torch_qa)}")
print(f"Optimum model {measure_latency(optimum_qa)}")