# autogptq + llamaindex + axolotl synthetic data generation snippet
## WIP: synthetic question/answer pair generation using a local LLM, based on llamaindex
### Trying to get the autogptq + llamaindex + transformers wrapper fix for the broken tokenizer working
### Next: add axolotl integration for prompting strategies and finetuning
Edit: I think I just got it working with AutoGPTQ. I had to manually set stop tokens and edit transformers/generation/utils.py (https://github.com/jerryjliu/llama_index/issues/3501).
For future reference, if anyone needs the code pattern for using AutoGPTQ with llama_index, this is confirmed working on my side.
Step 1. Hack transformers (this sucks, but I couldn't find any other way; if anyone knows a better one, let me know):
https://github.com/jerryjliu/llama_index/issues/3501
Quote from the issue:

> "You can temporarily make it work this way:
> open site-packages/transformers/generation/utils.py
> this will be located in the folder wherever your python interpreter is
> delete lines 1139, 1140, 1141
> Remember to keep those lines somewhere, for when you are done with this project and have to restore the package as it was."
Step 2. Load your model and tokenizer like this:
# V2
import os
import json
import torch
from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# llm_model_path is the base directory holding your local models (defined elsewhere)
quantized_model_dir = os.path.join(llm_model_path, "TheBloke_WizardLM-30B-GPTQ")
model_basename = "wizardlm-30b-GPTQ-4bit.act.order"
use_triton = False

tokenizer_config_path = os.path.join(quantized_model_dir, "tokenizer_config.json")

# Load the tokenizer config as a dict
with open(tokenizer_config_path, "r") as f:
    tokenizer_config = json.load(f)

# Now initialize the tokenizer with the config
tokenizer = AutoTokenizer.from_pretrained(
    quantized_model_dir, use_fast=True, return_token_type_ids=False, **tokenizer_config
)

# Verify the start and stop tokens
print(f"Start token: {tokenizer.bos_token}, ID: {tokenizer.bos_token_id}")
print(f"End token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")

model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=False,
    device="cuda:0",
    use_triton=use_triton,
    quantize_config=None,
)

# Note: check that the prompt template is correct for this model.
prompt = "Tell me about AI"
print("\n\n*** Generate:")
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Set the bos_token_id and eos_token_id so generation stops where expected
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

# Prevent printing spurious transformers errors when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)
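The StoppingCriteria / StoppingCriteriaList imports above aren't used in the snippet itself; a minimal sketch of what manually stopping on the eos token could look like (the StopOnTokens class is illustrative, not from the original gist):

# Illustrative only: stop generation as soon as one of the given token IDs is produced.
class StopOnTokens(StoppingCriteria):
    def __init__(self, stop_token_ids):
        self.stop_token_ids = stop_token_ids

    def __call__(self, input_ids, scores, **kwargs):
        # input_ids is (batch, seq_len); check the most recently generated token
        return input_ids[0][-1].item() in self.stop_token_ids

stopping_criteria = StoppingCriteriaList([StopOnTokens([tokenizer.eos_token_id])])
output = model.generate(
    inputs=input_ids,
    temperature=0.7,
    max_new_tokens=50,
    stopping_criteria=stopping_criteria,
)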
Step 3. Set up your template prompts correctly:
# setup prompts
from llama_index.prompts.prompts import SimpleInputPrompt

# System prompt only; the USER/ASSISTANT turns are added by the query wrapper below,
# so they shouldn't be duplicated here.
system_prompt = """\
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = SimpleInputPrompt("USER: {query_str}\nASSISTANT: ")
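Step 4 below references an hf_predictor that the gist doesn't define. A minimal sketch of one way to wire the prompts plus the pre-loaded AutoGPTQ model and tokenizer into llama_index, assuming a mid-2023 llama_index with HuggingFaceLLMPredictor; the parameter values are illustrative, and if your version doesn't accept model/tokenizer objects directly, use model_name/tokenizer_name instead:

# Sketch only: hf_predictor is used in Step 4 but not defined in the gist.
# Assumes HuggingFaceLLMPredictor accepts a pre-loaded model/tokenizer; adjust for your version.
from llama_index.llm_predictor import HuggingFaceLLMPredictor

hf_predictor = HuggingFaceLLMPredictor(
    max_input_size=2048,        # illustrative context window for the model
    max_new_tokens=256,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    model=model,                # AutoGPTQ model loaded in Step 2
    tokenizer=tokenizer,        # tokenizer loaded in Step 2
)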
Step 4. Set up your service context like this, using the embedding model of your choice (a sketch of one possible embed_model is included below):
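The gist doesn't define embed_model; a minimal sketch using a local sentence-transformers model wrapped for llama_index (the model name is illustrative, and it assumes langchain is installed):

# Sketch only: embed_model is referenced below but not defined in the gist.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)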
from llama_index import ServiceContext

service_context = ServiceContext.from_defaults(
    llm_predictor=hf_predictor, embed_model=embed_model
)
Step 5. (WIP) Generate your Q&A pairs per your instruction template of choice (see https://github.com/OpenAccess-AI-Collective/axolotl/tree/9492d4ebb718568305a7402150733c9617bfc29f/src/axolotl/prompt_strategies for different prompt strategies depending on your finetuning goals), then export them as JSON or JSONL.
Integrate with axolotl: https://github.com/OpenAccess-AI-Collective/axolotl/blob/9492d4ebb718568305a7402150733c9617bfc29f/README.md?plain=1#L233
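A rough sketch of the generation/export step using llama_index's DatasetGenerator over your own documents (the data directory, the answering loop, and the instruction/output JSONL schema are illustrative; map the fields to whichever axolotl prompt strategy you pick):

# Sketch only: generate synthetic Q&A pairs from your own documents with the local
# model, then dump them as JSONL. Paths and the output schema are illustrative.
import json
from llama_index import SimpleDirectoryReader, GPTVectorStoreIndex
from llama_index.evaluation import DatasetGenerator

documents = SimpleDirectoryReader("./my_private_data").load_data()

# Use the same local LLM (via service_context) to write candidate questions
data_generator = DatasetGenerator.from_documents(documents, service_context=service_context)
questions = data_generator.generate_questions_from_nodes()

# Answer each question against an index over the same documents
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()

with open("synthetic_qa.jsonl", "w") as f:
    for q in questions:
        response = query_engine.query(q)
        # instruction/output keys line up with a simple alpaca-style axolotl dataset
        f.write(json.dumps({"instruction": q, "output": str(response)}) + "\n")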
fblissjr — Today at 1:35 PM
Basically a $0 synthetic data generator off your own private data.
If you use the built-in response evaluator, you could even have it classify/label for you whether or not the retrieved result was helpful (see SAIL for reference/inspiration: https://openlsr.org/sail-7b).
Then you pretty much have a finetuned model whose sole task is to classify a given retrieved node as helpful/not helpful.
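A hedged sketch of that labeling loop with llama_index's ResponseEvaluator (which, in mid-2023 versions, returns a YES/NO judgment of whether the response is supported by the retrieved context; the label names and output file are illustrative):

# Sketch only: label each retrieved answer as helpful / not helpful using the
# built-in response evaluator, producing classification-style training data.
import json
from llama_index.evaluation import ResponseEvaluator

evaluator = ResponseEvaluator(service_context=service_context)

with open("helpfulness_labels.jsonl", "w") as f:
    for q in questions:
        response = query_engine.query(q)
        verdict = str(evaluator.evaluate(response)).strip().upper()
        label = "helpful" if verdict == "YES" else "not helpful"
        f.write(json.dumps({"question": q, "answer": str(response), "label": label}) + "\n")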