@ranfysvalle02
Created June 7, 2023 06:05
# https://www.mongodb.com/docs/atlas/atlas-search/knn-beta/
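# The collection needs an Atlas Search index before similarity_search will return
# results. A typical knnBeta definition for this setup (an assumption based on the
# docs linked above: the field name "embedding", index name "default", and 1536
# dimensions for text-embedding-ada-002 follow the langchain MongoDBAtlasVectorSearch
# defaults) looks like:
# {
#   "mappings": {
#     "dynamic": true,
#     "fields": {
#       "embedding": {
#         "type": "knnVector",
#         "dimensions": 1536,
#         "similarity": "cosine"
#       }
#     }
#   }
# }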
import os
import time
from typing import List

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.docstore.document import Document
from pymongo import MongoClient
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PDFMinerLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)
# Placeholder connection string and database names: fill in your own values before running.
MONGO_URI = "mongodb+srv://:@cluster1..mongodb.net/?retryWrites=true&w=majority&tls=true"
MONGODB_DATABASE = "spotlight"
MONGODB_COLLECTION = "demo"

# Azure OpenAI settings (placeholders: supply your key and resource endpoint).
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_KEY"] = ""
os.environ["OPENAI_API_BASE"] = "https://.openai.azure.com/"
azureEmbeddings = OpenAIEmbeddings(
    deployment="",  # name of your Azure embedding deployment (placeholder)
    model="text-embedding-ada-002",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    openai_api_type="azure",
    chunk_size=1,
)
llm = AzureOpenAI(
    deployment_name="",  # name of your Azure completions deployment (placeholder)
    model_name="gpt-35-turbo",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    temperature=0.7,
)
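# Optional smoke test (commented out; not part of the original flow): a hedged sketch
# to confirm the Azure deployments respond before indexing anything.
# vec = azureEmbeddings.embed_query("hello world")  # should be a 1536-dim list for ada-002
# print(len(vec))
# print(llm("Say hi in one word."))  # llm is otherwise unused in this script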
# Connect to the MongoDB server
client = MongoClient(MONGO_URI)
# Get the collection
collection = client[MONGODB_DATABASE][MONGODB_COLLECTION]
# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}
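# Illustrative extension (an assumption, not part of the original mapping): any of the
# loaders imported above can be registered the same way, for example:
# LOADER_MAPPING.update({
#     ".pdf": (PDFMinerLoader, {}),
#     ".md": (UnstructuredMarkdownLoader, {}),
#     ".csv": (CSVLoader, {}),
# })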
def process_documents() -> List[Document]:
    """
    Load documents and split them into chunks.
    """
    print("Loading single document")
    documents = load_single_document("./docs/state_of_the_union.txt")
    if not documents:
        print("No new documents to load")
        exit(0)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    return texts


def load_single_document(file_path: str) -> List[Document]:
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()
    raise ValueError(f"Unsupported file extension '{ext}'")
def main():
    print("First let's clean the database")
    collection.delete_many({})
    q = "What did the president say about Ketanji Brown Jackson"
    vectorstore = MongoDBAtlasVectorSearch(collection, azureEmbeddings)
    texts = process_documents()
    vectorstore.add_documents(texts)
    print("Let's pause for 5 seconds so the search index can update...")
    time.sleep(5)
    result = vectorstore.similarity_search(q)
    print(result)
    exit()


main()
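# A possible next step (hedged sketch, not part of the original script): the llm
# configured above is never used here, but inside main(), before the exit() call,
# it could answer the question over the retrieved chunks. This assumes the langchain
# version in use exposes RetrievalQA.from_chain_type:
# from langchain.chains import RetrievalQA
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectorstore.as_retriever(),
# )
# print(qa.run(q))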