RAG example with LlamaIndex
import os

import fitz  # PyMuPDF
from llama_index.core import Document, PromptTemplate, Settings, VectorStoreIndex
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama

class PDFDirectoryReader:
    """Load every PDF in a directory into llama-index Documents, extracting text with PyMuPDF."""

    def __init__(self, directory_path):
        self.directory_path = directory_path

    def load_data(self):
        documents = []
        for filename in os.listdir(self.directory_path):
            if filename.endswith('.pdf'):
                file_path = os.path.join(self.directory_path, filename)
                text = self._extract_text_from_pdf(file_path)
                documents.append(Document(text=text, metadata={'source': filename}))
        return documents

    def _extract_text_from_pdf(self, file_path):
        text = ""
        with fitz.open(file_path) as pdf:
            for page in pdf:
                text += page.get_text()
        return text

# Define your prompt
prompt = "How has Berkshire Hathaway's investment in Coca-Cola grown?"

# Set up the Llama 3 LLM served by a local Ollama instance (Ollama needs no API key)
llm = Ollama(model="llama3")
Settings.llm = llm
# Embeddings must also come from a local model; this assumes `ollama pull nomic-embed-text` has been run
Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text")

# Load your documents from a directory containing PDFs
documents = PDFDirectoryReader('path/to/your/documents').load_data()

# Create the vector index over the documents
index = VectorStoreIndex.from_documents(documents)

# Querying the vector store for the most "relevant" chunks
retriever = index.as_retriever(similarity_top_k=3)
relevant_nodes = retriever.retrieve(prompt)
context = "\n".join(node.node.get_content() for node in relevant_nodes)
for node in relevant_nodes:
    print(f"Source: {node.node.metadata.get('source', 'unknown')}\nContent: {node.node.get_content()}\n")
    print("__________________________")

# Adding the retrieved context to our prompt
template = PromptTemplate("{query} Context: {context}")
prompt_with_context = template.format(query=prompt, context=context)

# Asking the LLM for a response from our prompt with the provided context
results = llm.complete(prompt_with_context)
print(results.text)
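
For comparison, llama-index can also run the same retrieve-then-generate loop through its built-in query engine, which handles retrieval, prompt assembly, and the LLM call in one step, and persisting the index avoids re-embedding the PDFs on every run. A minimal sketch, assuming the `index`, `prompt`, and `Settings` from the script above and a writable local `./storage` directory (an arbitrary choice for this example):

from llama_index.core import StorageContext, load_index_from_storage

# One-step alternative to the manual retrieve/format/complete flow above:
# the query engine retrieves the top-k chunks, inserts them into a default
# RAG prompt, and calls the configured LLM.
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query(prompt)
print(response)

# Persist the vector index so later runs can skip re-parsing and
# re-embedding the PDFs.
index.storage_context.persist(persist_dir="./storage")

# Reload it in a later session:
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)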