Skip to content

Instantly share code, notes, and snippets.

@ranfysvalle02
Created May 19, 2023 03:44
Show Gist options
  • Save ranfysvalle02/1c441cebed6ca4aadc3812bb5e5899f2 to your computer and use it in GitHub Desktop.
Save ranfysvalle02/1c441cebed6ca4aadc3812bb5e5899f2 to your computer and use it in GitHub Desktop.
Build a ChatGPT with your Private Data using LlamaIndex and MongoDB
import logging
import sys
import nest_asyncio
# Patch asyncio via nest_asyncio before any library code runs; presumably
# needed so the script also works inside an already-running event loop
# (e.g. a notebook) -- TODO confirm it is still required.
nest_asyncio.apply()
# Send INFO-level (and above) records from the root logger to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): basicConfig() above already attaches a stdout StreamHandler
# when the root logger has no handlers, so this second handler makes each
# record appear twice in that case. It only adds value when basicConfig()
# was a no-op because handlers were configured earlier.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index import (
LLMPredictor,
GPTVectorStoreIndex,
GPTListIndex,
GPTSimpleKeywordTableIndex,
download_loader,
load_index_from_storage
)
import pymongo
from langchain.chat_models import ChatOpenAI
from llama_index.response.notebook_utils import display_response
import requests
from pathlib import Path
import os
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.docstore import MongoDocumentStore
from llama_index.storage.index_store import MongoIndexStore
from llama_index.storage.storage_context import StorageContext
# Connection string handed to MongoDocumentStore / MongoIndexStore in main().
# The <username>, <password> and <atlas-cluster> parts are placeholders and
# must be replaced before running.
# NOTE(review): avoid committing real credentials here; consider reading the
# URI from the environment instead.
MONGO_URI = "mongodb+srv://<username>:<password>@<atlas-cluster>.mongodb.net/?retryWrites=true&w=majority&tls=true"
# Name of the database passed as db_name to both Mongo-backed stores.
MONGODB_DATABASE="llamatest"
def download_pdf(url, out_dir, timeout=30):
    """Download a PDF from ``url`` into ``out_dir`` unless it is already cached.

    The file is always stored as ``paper.pdf``. If that file already exists,
    no request is made and the existing path is returned.

    Args:
        url: The URL of the PDF file to download.
        out_dir: The directory to save the PDF file to (``str`` or ``Path``).
            It is created, including missing parents, if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``Path`` of the PDF file inside ``out_dir``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    out_dir = Path(out_dir)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    out_dir.mkdir(parents=True, exist_ok=True)

    out_path = out_dir / "paper.pdf"
    if out_path.exists():
        return out_path

    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx; otherwise the error page would be saved as
    # paper.pdf and be treated as a valid cached download on every later run.
    response.raise_for_status()

    # Write to a sibling file first and rename afterwards, so an interrupted
    # write never leaves a truncated paper.pdf behind.
    partial_path = out_path.with_name(out_path.name + ".part")
    partial_path.write_bytes(response.content)
    partial_path.replace(out_path)
    return out_path
def load_data(out_path):
    """Read a PDF from disk and return the first document the loader yields.

    Args:
        out_path: The path to the PDF file to load data from.

    Returns:
        Element 0 of the list returned by the ``PDFReader`` loader.
    """
    reader_cls = download_loader("PDFReader")
    documents = reader_cls().load_data(file=Path(out_path))
    # NOTE(review): only the first element is kept. If the loader returns one
    # document per page, everything after page one is dropped -- confirm
    # against the PDFReader version in use.
    return documents[0]
def create_nodes(doc):
    """Split a single document into nodes with a default ``SimpleNodeParser``.

    Args:
        doc: The document to create nodes from.

    Returns:
        Whatever ``get_nodes_from_documents`` produces for ``[doc]``.
    """
    parser = SimpleNodeParser()
    return parser.get_nodes_from_documents([doc])
def create_index(nodes, storage_context):
    """Build list, keyword-table and vector indexes over ``nodes``.

    Args:
        nodes: The nodes to create the indexes from.
        storage_context: The storage context every index is created with.

    Returns:
        The vector index, with its index id set to ``"vectorz"``.
    """
    # These two indexes are not returned. They are still constructed because
    # they receive the storage context -- presumably so that they end up in
    # the backing stores as well (verify before removing).
    for index_cls in (GPTListIndex, GPTSimpleKeywordTableIndex):
        index_cls(nodes, storage_context=storage_context)

    vector_index = GPTVectorStoreIndex(nodes, storage_context=storage_context)
    vector_index.set_index_id("vectorz")
    return vector_index
def query_index(vector_index, query):
    """Run ``query`` against ``vector_index`` through its query engine.

    Args:
        vector_index: The index to query; must provide ``as_query_engine()``.
        query: The query to search for.

    Returns:
        Whatever the query engine's ``query`` call returns.
    """
    return vector_index.as_query_engine().query(query)
def main():
    """Run the demo end to end.

    Downloads the whitepaper PDF into ``data/``, turns it into nodes, builds
    the indexes against MongoDB-backed document and index stores, then asks
    the vector index for a summary and prints the response.
    """
    pdf_path = download_pdf(
        'https://webassets.mongodb.com/MongoDB_Supply_Chain_Security_whitepaper_Jun.pdf',
        Path("data"),
    )
    parsed_nodes = create_nodes(load_data(pdf_path))

    # Both stores use the same cluster and database.
    mongo_target = {"uri": MONGO_URI, "db_name": MONGODB_DATABASE}
    context = StorageContext.from_defaults(
        docstore=MongoDocumentStore.from_uri(**mongo_target),
        index_store=MongoIndexStore.from_uri(**mongo_target),
    )

    index = create_index(parsed_nodes, context)
    print(query_index(index, "What is a summary of this document?"))
# Run the pipeline only when executed as a script, so that importing this
# module does not trigger the download, the MongoDB writes and the query.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment