llama2-langchain
# this should run on a GPU Colab notebook
# pip install langchain xformers transformers datasets bitsandbytes accelerate --quiet
# get access to the meta-llama models on Hugging Face, accept the license, and create a read token
hf_auth = '######'
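# optional: instead of passing use_auth_token to every call below, you can authenticate once
# with huggingface_hub (installed alongside transformers) -- a sketch, assuming hf_auth holds
# a valid read token:
#   from huggingface_hub import login
#   login(token=hf_auth)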
from langchain.chains import ConversationChain
from langchain.llms import HuggingFacePipeline
from langchain.memory import ConversationSummaryBufferMemory
from langchain.prompts.prompt import PromptTemplate
from torch import cuda, bfloat16
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
import transformers
model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load a large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
# stop generation when the model starts a new 'Human:' turn or opens a code fence
stop_list = ['\nHuman:', '\n```\n']
# add_special_tokens=False keeps the BOS token out of the stop sequences so they can actually match
stop_token_ids = [tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model generation parameters here too
    stopping_criteria=stopping_criteria,  # without this the model rambles during chat
    temperature=0.1,  # 'randomness' of outputs; lower values are more deterministic
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this the output begins repeating
)
# wrap the pipeline so LangChain can use it as an LLM
llm = HuggingFacePipeline(pipeline=generate_text)
DEFAULT_TEMPLATE = """<s>[INST] <<SYS>>
The following is a friendly conversation between a human and an AI on a serious space mission. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.
Current conversation:
{history}
<</SYS>>
{input} [/INST]"""
PROMPT = PromptTemplate(input_variables=["history", "input"], template=DEFAULT_TEMPLATE)
chain = ConversationChain(llm=llm, memory=ConversationSummaryBufferMemory(llm=llm, max_token_limit=100), prompt=PROMPT)
# quick sanity check that the model, memory, and prompt work together
response = chain.predict(input="Explain the difference between someone's competence and someone's competency.")
print(response)
chain.predict(input="What did I just ask about?")