@mht-sharma
Created November 24, 2022 11:34
Benchmark a question-answering pipeline with RoBERTa: Hugging Face Transformers vs. Optimum ONNX Runtime
from pathlib import Path
from time import perf_counter
import numpy as np
from optimum.onnxruntime import ORTModelForQuestionAnswering
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx")
task = "question-answering"
tokenizer = AutoTokenizer.from_pretrained(model_id)
context = "Hello, my name is Philipp and I live in Nuremberg, Germany. Currently I am working as a Technical Lead at Hugging Face to democratize artificial intelligence through open source and open science. In the past I designed and implemented cloud-native machine learning architectures for fin-tech and insurance companies. I found my passion for cloud concepts and machine learning 5 years ago. Since then I never stopped learning. Currently, I am focusing myself in the area NLP and how to leverage models like BERT, Roberta, T5, ViT, and GPT2 to generate business value."
question = "As what is Philipp working?"
# load vanilla transformers and convert to onnx
model_orig = AutoModelForQuestionAnswering.from_pretrained(model_id)
torch_qa = pipeline(
    task, model=model_orig, tokenizer=tokenizer, handle_impossible_answer=True
)
model = ORTModelForQuestionAnswering.from_pretrained(model_id, from_transformers=True)
optimum_qa = pipeline(
    task, model=model, tokenizer=tokenizer, handle_impossible_answer=True
)
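# onnx_path is defined above but never used in the original gist; an optional
# (assumed) step is to persist the exported ONNX model and tokenizer for later reuse:
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)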
def measure_latency(pipe):
    latencies = []
    # warm up
    for _ in range(10):
        _ = pipe(question=question, context=context)
    # timed run
    for _ in range(1000):
        start_time = perf_counter()
        _ = pipe(question=question, context=context)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    return f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}"
print(f"Torch model {measure_latency(torch_qa)}")
print(f"Optimum model {measure_latency(optimum_qa)}")