@helena-intel
Created December 17, 2024 18:13
"""
OpenVINO LLM chat sample that uses greedy search for reproducible inference
Prerequisites:
- pip install openvino-genai
- an OpenVINO LLM.
Usage: python llm_chat.py /path/to/ov_model DEVICE
Modified from https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/chat_sample
"""
import argparse
import time
from pathlib import Path
import openvino_genai
DEFAULT_SYSTEM_PROMPT = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
"""
start_message = " <|start_header_id|>system<|end_header_id|>\n\n" + DEFAULT_SYSTEM_PROMPT + "<|eot_id|>"
prompt = "why can't you square a circle?"
def streamer(subword):
    print(subword, end="", flush=True)
    # The return value indicates whether generation should stop;
    # False means continue generating.
    return False
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("model_dir")
    parser.add_argument("adapter_file")
    parser.add_argument("device")
    args = parser.parse_args()
    pipeline_config = {"NPUW_CACHE_DIR": "npucache"} if args.device == "NPU" else {"CACHE_DIR": "model_cache"}
    adapter = openvino_genai.Adapter(args.adapter_file)
    adapter_config = openvino_genai.AdapterConfig(adapter)
    pipe = openvino_genai.LLMPipeline(args.model_dir, args.device, adapters=adapter_config, **pipeline_config)
    config = pipe.get_generation_config()
    config.max_new_tokens = 256
    config.do_sample = False
    config.ignore_eos = True  # ensure 256 tokens are generated for performance comparison
    # warmup inference for GPU reproducibility
    pipe.generate("hello", max_new_tokens=5)
    pipe.start_chat(system_message=start_message)
    start = time.perf_counter()
    # Passing an empty AdapterConfig runs this generation without the LoRA adapter.
    pipe.generate(prompt, config, streamer=streamer, adapters=openvino_genai.AdapterConfig())
    end = time.perf_counter()
    print()
    print(f"Inference duration: {end - start:.2f} seconds")
    print("\n----------")
    pipe.finish_chat()
if "__main__" == __name__:
main()
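# Example invocation (paths and device are illustrative, not from the original gist):
#     python llm_chat.py ./TinyLlama-1.1B-Chat-v1.0-ov ./adapter_model.safetensors CPU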