taoky · August 25, 2023 09:52
diff --git a/interactive.py b/interactive.py
 # Deps:
 # pip install bigdl-llm[all]
 import torch
 import time
 import argparse
 # import numpy as np
 import readline

 from bigdl.llm.transformers import AutoModel
 from transformers import AutoTokenizer

 # Disable stdout buffering: rebind the module-level ``print`` so that every
 # call passes ``flush=True``. The chat output below is printed piece by piece
 # with ``end=''``, so each piece should reach the terminal immediately.
 import functools
 print = functools.partial(print, flush=True)


 def stream_delta(previous, current):
    """Return the part of ``current`` that has not been printed yet.

    The chat loop treats every streamed value as the full response produced
    so far and prints only what follows the previously printed text. A plain
    ``current.replace(previous, '')`` also deletes later repeats of
    ``previous`` (going from "哈" to "哈哈" would print nothing), so the
    difference is taken from the shared prefix instead.

    Args:
        previous: Text that was already written to the terminal.
        current: Latest full response text.

    Returns:
        The suffix of ``current`` after its longest common prefix with
        ``previous``; an empty string when nothing new was produced.
    """
    if current.startswith(previous):
        return current[len(previous):]
    # Earlier text was rewritten: resume after the longest shared prefix.
    shared = 0
    for old_char, new_char in zip(previous, current):
        if old_char != new_char:
            break
        shared += 1
    return current[shared:]


 def chat_loop(model, tokenizer):
    """Run the interactive prompt until end-of-file is read from stdin.

    Typing ``reset`` clears the conversation history. Ctrl-C aborts the
    current prompt or answer and returns to a fresh prompt.

    Args:
        model: Object whose ``stream_chat(tokenizer, question, history)``
            yields ``(response, history)`` pairs.
        tokenizer: Tokenizer handed through to ``model.stream_chat``.
    """
    print("输入 reset 重置对话。")
    history = []
    with torch.inference_mode():
        while True:
            try:
                try:
                    question = input("> ")
                except EOFError:
                    # End of input: stop normally. ``exit()`` is injected by
                    # the ``site`` module for interactive use and is not
                    # meant to be called from programs.
                    return
                if question == "reset":
                    history = []
                    print("对话已重置。")
                    continue
                printed = ""
                # perf_counter is monotonic, so the reported duration cannot
                # be skewed by system clock adjustments.
                started = time.perf_counter()
                print('-'*20, 'Stream Chat Output', '-'*20)
                for response, updated_history in model.stream_chat(tokenizer, question, history):
                    history = updated_history
                    # Flush explicitly so streaming does not depend on the
                    # module-level ``print`` rebind.
                    print(stream_delta(printed, response), end='', flush=True)
                    printed = response
                print("")
                print(f"对话耗时 {time.perf_counter() - started:.2f} 秒。")
            except KeyboardInterrupt:
                print("(Interrupted)")


 def main():
    """Parse the command line, load the INT4 model and tokenizer, then chat."""
    parser = argparse.ArgumentParser(description='Stream Chat for ChatGLM2 model')
    parser.add_argument('--repo-id-or-model-path', type=str, default="THUDM/chatglm2-6b",
                        help='The huggingface repo id for the ChatGLM2 model to be downloaded'
                             ', or the path to the huggingface checkpoint folder')

    args = parser.parse_args()
    model_path = args.repo_id_or_model_path

    # Load model in 4 bit,
    # which converts the relevant layers in the model into INT4 format
    model = AutoModel.from_pretrained(model_path,
                                      load_in_4bit=True,
                                      trust_remote_code=True)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path,
                                              trust_remote_code=True)

    chat_loop(model, tokenizer)


 if __name__ == '__main__':
    main()
	# Deps:
	# pip install bigdl-llm[all]
	import torch
	import time
	import argparse
	# import numpy as np
	import readline

	from bigdl.llm.transformers import AutoModel
	from transformers import AutoTokenizer

	# Disable stdout buffering: rebind the module-level ``print`` so that every
	# call passes ``flush=True``. The chat output below is printed piece by piece
	# with ``end=''``, so each piece should reach the terminal immediately.
	import functools
	print = functools.partial(print, flush=True)


	def stream_delta(previous, current):
	    """Return the part of ``current`` that has not been printed yet.

	    The chat loop treats every streamed value as the full response produced
	    so far and prints only what follows the previously printed text. A plain
	    ``current.replace(previous, '')`` also deletes later repeats of
	    ``previous`` (going from "哈" to "哈哈" would print nothing), so the
	    difference is taken from the shared prefix instead.

	    Args:
	        previous: Text that was already written to the terminal.
	        current: Latest full response text.

	    Returns:
	        The suffix of ``current`` after its longest common prefix with
	        ``previous``; an empty string when nothing new was produced.
	    """
	    if current.startswith(previous):
	        return current[len(previous):]
	    # Earlier text was rewritten: resume after the longest shared prefix.
	    shared = 0
	    for old_char, new_char in zip(previous, current):
	        if old_char != new_char:
	            break
	        shared += 1
	    return current[shared:]


	def chat_loop(model, tokenizer):
	    """Run the interactive prompt until end-of-file is read from stdin.

	    Typing ``reset`` clears the conversation history. Ctrl-C aborts the
	    current prompt or answer and returns to a fresh prompt.

	    Args:
	        model: Object whose ``stream_chat(tokenizer, question, history)``
	            yields ``(response, history)`` pairs.
	        tokenizer: Tokenizer handed through to ``model.stream_chat``.
	    """
	    print("输入 reset 重置对话。")
	    history = []
	    with torch.inference_mode():
	        while True:
	            try:
	                try:
	                    question = input("> ")
	                except EOFError:
	                    # End of input: stop normally. ``exit()`` is injected by
	                    # the ``site`` module for interactive use and is not
	                    # meant to be called from programs.
	                    return
	                if question == "reset":
	                    history = []
	                    print("对话已重置。")
	                    continue
	                printed = ""
	                # perf_counter is monotonic, so the reported duration cannot
	                # be skewed by system clock adjustments.
	                started = time.perf_counter()
	                # The multiplication operators were missing in this copy.
	                print('-'*20, 'Stream Chat Output', '-'*20)
	                for response, updated_history in model.stream_chat(tokenizer, question, history):
	                    history = updated_history
	                    # Flush explicitly so streaming does not depend on the
	                    # module-level ``print`` rebind.
	                    print(stream_delta(printed, response), end='', flush=True)
	                    printed = response
	                print("")
	                print(f"对话耗时 {time.perf_counter() - started:.2f} 秒。")
	            except KeyboardInterrupt:
	                print("(Interrupted)")


	def main():
	    """Parse the command line, load the INT4 model and tokenizer, then chat."""
	    parser = argparse.ArgumentParser(description='Stream Chat for ChatGLM2 model')
	    parser.add_argument('--repo-id-or-model-path', type=str, default="THUDM/chatglm2-6b",
	                        help='The huggingface repo id for the ChatGLM2 model to be downloaded'
	                             ', or the path to the huggingface checkpoint folder')

	    args = parser.parse_args()
	    model_path = args.repo_id_or_model_path

	    # Load model in 4 bit,
	    # which converts the relevant layers in the model into INT4 format
	    model = AutoModel.from_pretrained(model_path,
	                                      load_in_4bit=True,
	                                      trust_remote_code=True)

	    # Load tokenizer
	    tokenizer = AutoTokenizer.from_pretrained(model_path,
	                                              trust_remote_code=True)

	    chat_loop(model, tokenizer)


	if __name__ == '__main__':
	    main()