Last active
October 24, 2023 03:29
-
-
Save ranfysvalle02/a5aaa4f070ffd72b8fd62cf09ea7409a to your computer and use it in GitHub Desktop.
Python script that uses the langchain library to load web pages, extract their content, and store it in a MongoDB Atlas database along with its embeddings.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import os

from langchain.document_loaders import PlaywrightURLLoader
from langchain.embeddings import GPT4AllEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
# Embedding object shared by the script; still constructed at import time so
# the module-level name ``gpt4all_embd`` stays available.
gpt4all_embd = GPT4AllEmbeddings()

LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Pages to load and index.
URLS = ["https://www.mongodb.com/customers/forbes"]

# Connection settings.  Each value can be overridden through the environment
# so credentials and target names need not be edited into the source; the
# fallbacks are the original hard-coded values.
# NOTE(review): the fallback URI looks like a redacted placeholder
# ("[email protected]") -- confirm and supply a real URI via MONGODB_URI.
MONGODB_URI = os.environ.get(
    "MONGODB_URI", "mongodb+srv://abc-dev:[email protected]/test"
)
# Empty by default, as before; main() cannot insert anything until both are set.
DATABASE_NAME = os.environ.get("MONGODB_DATABASE", "")
COLLECTION_NAME = os.environ.get("MONGODB_COLLECTION", "")
def main():
    """Load ``URLS`` and store the documents with embeddings in MongoDB Atlas.

    The pages are loaded with ``PlaywrightURLLoader`` (configured with
    ``remove_selectors=["header", "footer"]``) and the resulting documents
    are written to ``DATABASE_NAME``/``COLLECTION_NAME`` through
    ``MongoDBAtlasVectorSearch.from_documents`` using the module-level
    ``gpt4all_embd`` embedding object.

    Returns:
        None.  Any exception is logged with its traceback and swallowed, so
        the function never raises to its caller.
    """
    try:
        # Fail fast: without this check an empty name is only rejected after
        # the (slow) page load has already run.
        if not DATABASE_NAME or not COLLECTION_NAME:
            raise ValueError(
                "DATABASE_NAME and COLLECTION_NAME must be set to non-empty values."
            )

        loader = PlaywrightURLLoader(urls=URLS, remove_selectors=["header", "footer"])
        documents = loader.load()
        if not documents:
            # Avoid reporting success when there was nothing to insert.
            LOGGER.warning("No documents were loaded from %s; nothing to insert.", URLS)
            return

        with MongoClient(MONGODB_URI) as client:
            collection = client[DATABASE_NAME][COLLECTION_NAME]
            # from_documents is a classmethod, so no throwaway vector-store
            # instance is needed; the module-level embedding object is reused
            # instead of constructing a second one here.
            MongoDBAtlasVectorSearch.from_documents(
                documents, gpt4all_embd, collection=collection
            )
        LOGGER.info("Documents inserted successfully.")
    except Exception as exc:
        # Top-level boundary of the script: keep the best-effort behaviour but
        # record the traceback rather than only the message.
        LOGGER.exception("An error occurred: %s", exc)


if __name__ == "__main__":
    main()
To use the PlaywrightURLLoader, you will need to install playwright and unstructured. Additionally, you will need to install the Playwright Chromium browser:
Install playwright, unstructured, and the Playwright browsers:
pip install "playwright"
pip install "unstructured"
playwright install
from langchain.document_loaders import PlaywrightURLLoader
# Example pages handed to the loader.
urls = [
"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
"https://goo.gl/maps/NDSHwePEyaHMFGwh8",
]
# Build the loader for those URLs; "header" and "footer" are passed as the
# selectors to remove.
loader = PlaywrightURLLoader(urls=urls, remove_selectors=["header", "footer"])
# Run the loader and keep what it returns.
data = loader.load()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://python.langchain.com/docs/integrations/document_loaders/url#setup-1