""" | |
OpenVINO LLM chat sample that uses greedy search for reproducible inference | |
Prerequisites: | |
- pip install openvino-genai | |
- an OpenVINO LLM. | |
Usage: python llm_chat.py /path/to/ov_model DEVICE | |
Modified from https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/chat_sample | |
""" | |
import argparse
import time

import openvino_genai

DEFAULT_SYSTEM_PROMPT = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
"""

start_message = " <|start_header_id|>system<|end_header_id|>\n\n" + DEFAULT_SYSTEM_PROMPT + "<|eot_id|>"
prompt = "why can't you square a circle?"

def streamer(subword):
    print(subword, end="", flush=True)
    # The return value indicates whether generation should be stopped.
    # False means: continue generating.
    return False

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("model_dir")
    parser.add_argument("adapter_file")
    parser.add_argument("device")
    args = parser.parse_args()
    # Cache compiled models to speed up subsequent runs; NPU uses its own
    # NPUW cache option instead of the generic OpenVINO CACHE_DIR property.
    pipeline_config = {"NPUW_CACHE_DIR": "npucache"} if args.device == "NPU" else {"CACHE_DIR": "model_cache"}
    adapter = openvino_genai.Adapter(args.adapter_file)
    adapter_config = openvino_genai.AdapterConfig(adapter)
    pipe = openvino_genai.LLMPipeline(args.model_dir, args.device, adapters=adapter_config, **pipeline_config)
    config = pipe.get_generation_config()
    config.max_new_tokens = 256
    config.do_sample = False
    config.ignore_eos = True  # ensure 256 tokens are generated for performance comparison

    # warmup inference for GPU reproducibility
    pipe.generate("hello", max_new_tokens=5)
    pipe.start_chat(system_message=start_message)
    start = time.perf_counter()
    # Passing an empty AdapterConfig overrides the pipeline's adapters, so
    # this generation runs without the LoRA adapter.
    pipe.generate(prompt, config, streamer=streamer, adapters=openvino_genai.AdapterConfig())
    end = time.perf_counter()
    print()
    print(f"Inference duration: {end - start:.2f} seconds")
    print("\n----------")
    pipe.finish_chat()
if "__main__" == __name__: | |
main() |
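
# The sample uses greedy search (do_sample=False) so repeated runs produce
# identical output. For varied output, sampling could be enabled instead; a
# minimal sketch (parameter values are illustrative, not from the sample):
#   config.do_sample = True
#   config.temperature = 0.7
#   config.top_p = 0.95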