Created
June 7, 2023 06:05
-
-
Save ranfysvalle02/88e78e8fad977528c6016bb91db19218 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://www.mongodb.com/docs/atlas/atlas-search/knn-beta/ | |
import pymongo | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.llms import AzureOpenAI | |
import os | |
import glob | |
import time | |
from multiprocessing import Pool | |
from tqdm import tqdm | |
from rake_nltk import Rake | |
from langchain.tools import DuckDuckGoSearchRun | |
from langchain.agents import initialize_agent, Tool | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import MongoDBAtlasVectorSearch | |
from langchain.document_loaders import TextLoader | |
from pymongo import MongoClient | |
from typing import Any, Dict, List | |
from langchain.docstore.document import Document | |
from langchain.document_loaders import ( | |
CSVLoader, | |
EverNoteLoader, | |
PDFMinerLoader, | |
TextLoader, | |
UnstructuredEmailLoader, | |
UnstructuredEPubLoader, | |
UnstructuredHTMLLoader, | |
UnstructuredMarkdownLoader, | |
UnstructuredODTLoader, | |
UnstructuredPowerPointLoader, | |
UnstructuredWordDocumentLoader, | |
) | |
# --- MongoDB Atlas target --------------------------------------------------
# The user/password/cluster parts of the URI are blank placeholders and must
# be filled in before the script can connect.
MONGO_URI = "mongodb+srv://:@cluster1..mongodb.net/?retryWrites=true&w=majority&tls=true"
MONGODB_DATABASE = "spotlight"
MONGODB_COLLECTION = "demo"

# --- Azure OpenAI settings -------------------------------------------------
# Exported through the process environment before the clients below are
# constructed. The key and the resource name in the URL are blank placeholders.
os.environ.update(
    {
        "OPENAI_API_TYPE": "azure",
        "OPENAI_API_KEY": "",
        "OPENAI_API_BASE": "https://.openai.azure.com/",
    }
)

# Embedding client handed to the vector store in main().
azureEmbeddings = OpenAIEmbeddings(
    deployment="",
    model="text-embedding-ada-002",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    openai_api_type="azure",
    chunk_size=1,
)

# Completion model client.
llm = AzureOpenAI(
    deployment_name="",
    model_name="gpt-35-turbo",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    temperature=0.7,
)

# Open the MongoDB connection and resolve the collection the demo works on.
client = MongoClient(MONGO_URI)
collection = client.get_database(MONGODB_DATABASE).get_collection(MONGODB_COLLECTION)

# File extension -> (loader class, keyword arguments passed to its constructor).
LOADER_MAPPING = {
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Register additional extensions and their loaders here as needed.
}
def process_documents(
    file_path: str = "./docs/state_of_the_union.txt",
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """Load one document and split it into overlapping chunks.

    Args:
        file_path: Path of the document to load. The default is the sample
            file this script has always used.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The chunk documents returned by the splitter's ``split_documents``.

    Raises:
        SystemExit: With status 0 when the loader yields no documents.
        ValueError: Propagated from ``load_single_document`` when the file
            extension has no registered loader.
    """
    print("Loading single document")
    documents = load_single_document(file_path)
    if not documents:
        print("No new documents to load")
        # The ``exit`` builtin is injected by the ``site`` module for
        # interactive sessions and can be absent (e.g. ``python -S``);
        # raising SystemExit directly gives the same exit status.
        raise SystemExit(0)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
def load_single_document(file_path: str) -> List[Document]:
    """Load a file with the loader registered for its extension.

    The extension is taken from the final path component with
    ``os.path.splitext`` and lower-cased before the lookup, so ``FILE.TXT``
    resolves to the ``.txt`` loader. A path with no extension yields an
    empty extension rather than a fragment of the path.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected loader's ``load()`` returns.

    Raises:
        ValueError: If no loader is registered in ``LOADER_MAPPING`` for the
            file's extension.
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        loader_class, loader_args = LOADER_MAPPING[ext]
    except KeyError:
        # Suppress the KeyError context: callers only need the ValueError.
        raise ValueError(f"Unsupported file extension '{ext}'") from None
    loader = loader_class(file_path, **loader_args)
    return loader.load()
def main():
    """Run the end-to-end demo: reset, ingest, query, print, exit.

    Steps, in order:
      1. Delete every document in the module-level ``collection``.
      2. Load and chunk the sample document via ``process_documents``.
      3. Add the chunks to a ``MongoDBAtlasVectorSearch`` store built on
         ``collection`` and ``azureEmbeddings``.
      4. Sleep briefly, then run a similarity search for a fixed question
         and print the result.

    Raises:
        SystemExit: Always, once the result has been printed (no exit code
            is supplied, matching the previous bare ``exit()`` call).
    """
    print("First let's clean the database")
    # Destructive: an empty filter matches every document in the collection.
    collection.delete_many({})

    q = "What did the president say about Ketanji Brown Jackson"
    vectorstore = MongoDBAtlasVectorSearch(collection, azureEmbeddings)

    texts = process_documents()
    vectorstore.add_documents(texts)

    # Fixed wait before querying, as announced by the message below.
    index_update_wait_seconds = 5
    print("Lets pause for 5 seconds for Search Index Update...")
    time.sleep(index_update_wait_seconds)

    result = vectorstore.similarity_search(q)
    print(result)

    # The ``exit`` builtin comes from the ``site`` module and is intended for
    # interactive use; raising SystemExit directly is the equivalent that
    # does not depend on it.
    raise SystemExit
# Run the demo only when executed as a script. main() wipes the collection
# and calls external services, so it must not fire on a plain import (which
# also happens when multiprocessing spawns worker processes).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment