Last active
August 1, 2023 13:36
-
-
Save John-Lin/05e2e9e36c692e8ab9c10b413dfd9cae to your computer and use it in GitHub Desktop.
LangChain with Llama2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Run a local Llama 2 model via LangChain's LlamaCpp wrapper.

Streams the model's tokens to stdout while it answers a small arithmetic
word problem. Swapping ``template_zero_shot`` for ``template_cot`` in the
PromptTemplate demonstrates Chain-of-Thought prompting.
"""
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Chain-of-Thought prompt: nudges the model to reason step by step.
template_cot = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""

# Zero-shot prompt: forwards the question verbatim.
template_zero_shot = """{question}"""

prompt = PromptTemplate(template=template_zero_shot, input_variables=["question"])

# Callbacks support token-wise streaming (tokens printed as they are generated).
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx; consider the amount of RAM of your Apple Silicon chip.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="./models/7B/ggml-model-q4_0.bin",
    temperature=0.75,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problems after a couple of calls.
    max_tokens=1024,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager.
)  # type: ignore

llm_chain = LLMChain(prompt=prompt, llm=llm)

question = "I went to the market and bought 10 apples. I gave 2 apples to the neighbor and 2 to the repairman. I then went and bought 5 more apples and ate 1. How many apples did I remain with?"
print(llm_chain.run(question))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When switching the PromptTemplate's template from `template_zero_shot` to `template_cot`, the CoT (Chain-of-Thought) capability will be demonstrated.