Created
May 19, 2023 03:44
-
-
Save ranfysvalle02/1c441cebed6ca4aadc3812bb5e5899f2 to your computer and use it in GitHub Desktop.
Build a ChatGPT with your Private Data using LlamaIndex and MongoDB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import sys | |
import nest_asyncio | |
# Patch asyncio before anything else runs.
# NOTE(review): presumably needed so the script works inside an already-running
# event loop (e.g. a notebook) — confirm.
nest_asyncio.apply()

# Route INFO-level (and above) log records to stdout.
# NOTE(review): basicConfig may already attach a stdout handler to the root
# logger, in which case the extra handler below makes every record print
# twice — confirm this is intended.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
from llama_index import ( | |
LLMPredictor, | |
GPTVectorStoreIndex, | |
GPTListIndex, | |
GPTSimpleKeywordTableIndex, | |
download_loader, | |
load_index_from_storage | |
) | |
import pymongo | |
from langchain.chat_models import ChatOpenAI | |
from llama_index.response.notebook_utils import display_response | |
import requests | |
from pathlib import Path | |
import os | |
from llama_index.node_parser import SimpleNodeParser | |
from llama_index.storage.docstore import MongoDocumentStore | |
from llama_index.storage.index_store import MongoIndexStore | |
from llama_index.storage.storage_context import StorageContext | |
# MongoDB connection settings.
# Each value can be overridden with an environment variable of the same name,
# so real credentials never have to be written into this file. When the
# variable is unset, the original placeholder/default literal is used.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb+srv://<username>:<password>@<atlas-cluster>.mongodb.net/?retryWrites=true&w=majority&tls=true",
)
MONGODB_DATABASE = os.environ.get("MONGODB_DATABASE", "llamatest")
def download_pdf(url, out_dir):
    """Download a PDF from ``url`` into ``out_dir`` unless it is already there.

    The file is always stored as ``paper.pdf`` inside ``out_dir``. If that file
    already exists, no network request is made and its path is returned as-is.

    Args:
        url: The URL of the PDF file to download.
        out_dir: The directory to save the PDF file to (``str`` or
            ``pathlib.Path``). It is created, including parents, if missing.

    Returns:
        The ``pathlib.Path`` of the downloaded (or previously saved) PDF file.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    out_dir = Path(out_dir)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    out_dir.mkdir(parents=True, exist_ok=True)

    out_path = out_dir / "paper.pdf"
    if out_path.exists():
        return out_path

    # A timeout keeps a stalled server from hanging the script forever.
    r = requests.get(url, timeout=60)
    # Fail before touching the disk: otherwise an HTML error page would be
    # saved as paper.pdf and then be treated as a valid cached download.
    r.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(r.content)
    return out_path
def load_data(out_path):
    """Parse a PDF file and return the first document the reader produces.

    Args:
        out_path: The path to the PDF file to load data from.

    Returns:
        The first element of the list returned by the PDF reader's
        ``load_data`` call.
    """
    reader_cls = download_loader("PDFReader")
    documents = reader_cls().load_data(file=Path(out_path))
    # Only the first entry is kept.
    # NOTE(review): if the reader emits one document per page, the remaining
    # pages are silently dropped here — confirm against the reader in use.
    return documents[0]
def create_nodes(doc):
    """Split a single document into nodes.

    Args:
        doc: The document to create nodes from.

    Returns:
        The nodes produced by ``SimpleNodeParser`` for this one document.
    """
    parser = SimpleNodeParser()
    # The parser works on a list of documents, so wrap the single one.
    return parser.get_nodes_from_documents([doc])
def create_index(nodes, storage_context):
    """Build the indexes over ``nodes`` and return the vector index.

    A list index and a keyword-table index are constructed first, then the
    vector index, all against the same storage context. Only the vector index
    is returned, after its id is set to ``"vectorz"``.

    Args:
        nodes: The nodes to create an index from.
        storage_context: The storage context to create the indexes in.

    Returns:
        The vector store index.
    """
    # These two indexes are built but not returned.
    # NOTE(review): presumably constructed so they get registered in the
    # storage context's stores — confirm before removing.
    for index_cls in (GPTListIndex, GPTSimpleKeywordTableIndex):
        index_cls(nodes, storage_context=storage_context)

    index = GPTVectorStoreIndex(nodes, storage_context=storage_context)
    index.set_index_id("vectorz")
    return index
def query_index(vector_index, query):
    """Run ``query`` against ``vector_index`` through its query engine.

    Args:
        vector_index: The index to query; must provide ``as_query_engine()``.
        query: The query to search for.

    Returns:
        Whatever the query engine's ``query`` call returns for ``query``.
    """
    return vector_index.as_query_engine().query(query)
def main():
    """Run the whole demo end to end.

    Downloads the whitepaper PDF into ``data/``, parses it into nodes, builds
    the indexes in a MongoDB-backed storage context, asks the vector index for
    a summary of the document, and prints the response.
    """
    pdf_path = download_pdf(
        'https://webassets.mongodb.com/MongoDB_Supply_Chain_Security_whitepaper_Jun.pdf',
        Path("data"),
    )
    nodes = create_nodes(load_data(pdf_path))

    # Both stores point at the same MongoDB deployment and database.
    mongo_target = {"uri": MONGO_URI, "db_name": MONGODB_DATABASE}
    storage_context = StorageContext.from_defaults(
        docstore=MongoDocumentStore.from_uri(**mongo_target),
        index_store=MongoIndexStore.from_uri(**mongo_target),
    )

    index = create_index(nodes, storage_context)
    answer = query_index(index, "What is a summary of this document?")
    print(answer)


# Run the demo as soon as the file is executed.
main()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.