llama2-langchain
# this should run on a GPU Colab notebook
# pip install langchain xformers transformers datasets bitsandbytes accelerate --quiet
# get access to the meta-llama models on Hugging Face, accept the license, and create a read token
hf_auth = '######'
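# optional: instead of passing use_auth_token to every call below, you can authenticate once
# with huggingface_hub (installed alongside transformers) -- a sketch, assuming hf_auth holds
# a valid read token:
#   from huggingface_hub import login
#   login(token=hf_auth)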
from langchain.chains import ConversationChain
from langchain.llms import HuggingFacePipeline
from langchain.memory import ConversationSummaryBufferMemory
from langchain.prompts.prompt import PromptTemplate
from torch import cuda, bfloat16
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
import transformers
model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load a large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
# stop generation when the model starts a new 'Human:' turn or opens a code fence
stop_list = ['\nHuman:', '\n```\n']
# add_special_tokens=False keeps the BOS token out of the stop sequences so they can actually match
stop_token_ids = [tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model generation parameters here too
    stopping_criteria=stopping_criteria,  # without this the model rambles during chat
    temperature=0.1,  # 'randomness' of outputs; lower values are more deterministic
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this the output begins repeating
)
# wrap the pipeline so LangChain can use it as an LLM
llm = HuggingFacePipeline(pipeline=generate_text)
DEFAULT_TEMPLATE = """<s>[INST] <<SYS>>
The following is a friendly conversation between a human and an AI on a serious space mission. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.
Current conversation:
{history}
<</SYS>>
{input} [/INST]"""
PROMPT = PromptTemplate(input_variables=["history", "input"], template=DEFAULT_TEMPLATE)
chain = ConversationChain(llm=llm, memory=ConversationSummaryBufferMemory(llm=llm, max_token_limit=100), prompt=PROMPT)
# quick sanity check that the model, memory, and prompt work together
response = chain.predict(input="Explain the difference between someone's competence and someone's competency.")
print(response)
chain.predict(input="What did I just ask about?")