'''
Steps for building a RAG system with the LangChain framework
1. Prepare the data (knowledge base)
2. Use a Loader object to load the data in the proper format
3. Chunk the data into appropriately sized pieces
4. Create the embeddings and the retriever
   4.1. Use an embedding model such as BAAI/bge-base-en-v1.5 from Hugging Face
   4.2. Create a vector database such as FAISS (Facebook AI Similarity Search). The retriever is an object
        derived from the vector store, e.g. `retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})`
5. Load a quantized model
6. Set up the LLM chain
   6.1. Create a text-generation pipeline using the loaded model and its tokenizer
   6.2. Create a prompt template - this should follow the format of the model, so if you substitute
        the model checkpoint, make sure to use the appropriate formatting
   6.3. Combine the llm_chain with the retriever to create a RAG chain
'''
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
# Step 1: Creating the pdf file ✓
# Step 2: Loading the pdf file using the LangChain PDF loader ✓
loader = PyPDFLoader("./rag.pdf")
docs = loader.load()
print(f"Number of documents before chunking: {len(docs)}")

# Step 3: Create chunks from the previously loaded documents ✓
splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=30)
chunked_docs = splitter.split_documents(docs)
print(f"Number of chunks after chunking: {len(chunked_docs)}")
# Step 4: Create embeddings for the chunked documents
# 4.1. Load the BAAI/bge-base-en-v1.5 embedding model from Hugging Face and create the vector DB using the FAISS library ✓
db = FAISS.from_documents(chunked_docs, HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"))

# 4.2. Derive the retriever from the vector DB ✓
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
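# Optional sanity check (an assumption, not part of the original steps): run a sample
# similarity query and print what would later be injected into the prompt as context.
# In recent LangChain versions a retriever is a Runnable, so .invoke() works here.
sample_hits = retriever.invoke("What is this document about?")
for i, hit in enumerate(sample_hits):
    print(f"Hit {i}: {hit.page_content[:80]!r}")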
# Step 5: Load the quantized model (the LLM that will be used for text generation) ✓
model_name = "HuggingFaceH4/zephyr-7b-beta"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
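# Note: 4-bit loading through bitsandbytes requires a CUDA-capable GPU. Depending on the
# transformers version, you may also want to pass device_map="auto" to from_pretrained
# so the quantized weights are placed on the GPU automatically.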
# Step 6: Set up the LLM chain
# 6.1. Create a text-generation pipeline using the loaded model and its tokenizer ✓
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
# 6.2. Create a prompt template ✓
prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:
{context}
</s>
<|user|>
{question}
</s>
<|assistant|>
"""
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
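# Alternative (a sketch, not used below): the same zephyr-formatted prompt string can be
# derived from the model's own chat template, which keeps the special tags correct if you
# swap the checkpoint. The {context}/{question} placeholders are left for PromptTemplate to fill.
chat_derived_template = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "Answer the question based on your knowledge. Use the following context to help:\n{context}"},
        {"role": "user", "content": "{question}"},
    ],
    tokenize=False,
    add_generation_prompt=True,
)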
llm_chain = prompt | llm | StrOutputParser()

# 6.3. Combine the llm_chain with the retriever to create a RAG chain ✓
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain
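# The dict above feeds the raw Document list into {context}, which gets rendered via str()
# including metadata. A common refinement (an optional variant, not part of the original gist)
# is to join only the page contents so the prompt stays clean:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain_clean = {"context": retriever | format_docs, "question": RunnablePassthrough()} | llm_chain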
# Test ✓
question = "What is the author's name?"
without_rag_result = llm_chain.invoke({"context": "", "question": question})
with_rag_result = rag_chain.invoke(question)
print(f"Without RAG:\t {without_rag_result}\nWith RAG:\t {with_rag_result}")