conda create -n llama2 python=3.9
conda activate llama2

# langchain
pip install langchain

# llama-cpp-python
FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir

Below are some links to the models with 4-bit quantization.
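As a sketch of how to fetch one (assuming you use a community GGUF conversion such as TheBloke's Hugging Face repos, which host 4-bit files like llama-2-7b-chat.Q4_K_M.gguf; the exact repo and filename here are illustrative):

# Illustrative download of a 4-bit quantized Llama 2 chat model
pip install -U "huggingface_hub[cli]"
huggingface-cli download TheBloke/Llama-2-7B-Chat-GGUF \
    llama-2-7b-chat.Q4_K_M.gguf --local-dir ./models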
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Stream generated tokens to stdout as they are produced
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(
    model_path="{MODEL_FILE_PATH}",  # replace with the path to your quantized model file
    n_gpu_layers=0,                  # 0 = run entirely on CPU; raise to offload layers to GPU
    temperature=0.1,
    top_p=0.95,
    repeat_penalty=1.0,              # LlamaCpp expects repeat_penalty, not repetition_penalty
    f16_kv=True,
    callback_manager=callback_manager,
    verbose=True,
)
prompt_template = """\
You are a helpful assistant.
You do not respond as 'User' or pretend to be 'User'.
You only respond once as Assistant.
User: {query}
"""

def llama2(query):
    prompt = prompt_template.format(query=query)
    response = llm(prompt)
    return response

# Use this function to get a response from the Llama 2 model
llama2("Hello")