@mht-sharma
Created November 24, 2022 11:34
Benchmark a question-answering pipeline with RoBERTa: Hugging Face Transformers vs. Optimum ONNX Runtime
from pathlib import Path
from time import perf_counter
import numpy as np
from optimum.onnxruntime import ORTModelForQuestionAnswering
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx")
task = "question-answering"
tokenizer = AutoTokenizer.from_pretrained(model_id)
context = "Hello, my name is Philipp and I live in Nuremberg, Germany. Currently I am working as a Technical Lead at Hugging Face to democratize artificial intelligence through open source and open science. In the past I designed and implemented cloud-native machine learning architectures for fin-tech and insurance companies. I found my passion for cloud concepts and machine learning 5 years ago. Since then I never stopped learning. Currently, I am focusing myself in the area NLP and how to leverage models like BERT, Roberta, T5, ViT, and GPT2 to generate business value."
question = "As what is Philipp working?"
# load vanilla transformers and convert to onnx
model_orig = AutoModelForQuestionAnswering.from_pretrained(model_id)
torch_qa = pipeline(
    task, model=model_orig, tokenizer=tokenizer, handle_impossible_answer=True
)
model = ORTModelForQuestionAnswering.from_pretrained(model_id, from_transformers=True)
optimum_qa = pipeline(
    task, model=model, tokenizer=tokenizer, handle_impossible_answer=True
)
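# onnx_path is defined above but never used in the original gist; an optional
# (assumed) step is to persist the exported ONNX model and tokenizer for later reuse:
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)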
def measure_latency(pipe):
    latencies = []
    # warm up
    for _ in range(10):
        _ = pipe(question=question, context=context)
    # timed run
    for _ in range(1000):
        start_time = perf_counter()
        _ = pipe(question=question, context=context)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    return f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}"
print(f"Torch model {measure_latency(torch_qa)}")
print(f"Optimum model {measure_latency(optimum_qa)}")