Skip to content

Instantly share code, notes, and snippets.

@ranfysvalle02
Last active October 24, 2023 03:29
Show Gist options
  • Save ranfysvalle02/a5aaa4f070ffd72b8fd62cf09ea7409a to your computer and use it in GitHub Desktop.
Save ranfysvalle02/a5aaa4f070ffd72b8fd62cf09ea7409a to your computer and use it in GitHub Desktop.
Python script that uses the langchain library to load web pages, extract their content, and store each page in a MongoDB Atlas database along with its embedding.
import logging
import os

from langchain.document_loaders import PlaywrightURLLoader
from langchain.embeddings import GPT4AllEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
# Module-level embedding model instance, created once when the module loads.
gpt4all_embd = GPT4AllEmbeddings()

LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Pages whose content is loaded and stored.
URLS = ["https://www.mongodb.com/customers/forbes"]

# Connection settings. Each can be overridden through an environment variable
# so that real credentials and names do not have to be written into the
# source; when a variable is unset the original hard-coded value is used.
MONGODB_URI = os.environ.get(
    "MONGODB_URI", "mongodb+srv://abc-dev:[email protected]/test"
)
DATABASE_NAME = os.environ.get("MONGODB_DATABASE", "")
COLLECTION_NAME = os.environ.get("MONGODB_COLLECTION", "")
def main():
    """Load the pages in URLS and insert them, with embeddings, into MongoDB.

    Reads the module-level ``URLS``, ``MONGODB_URI``, ``DATABASE_NAME``,
    ``COLLECTION_NAME`` and ``gpt4all_embd``. Pages are fetched with
    ``PlaywrightURLLoader`` (``header`` and ``footer`` elements removed) and
    written to the target collection through ``MongoDBAtlasVectorSearch``.

    Failures are logged (with traceback) rather than raised, so the function
    always returns ``None``.
    """
    if not DATABASE_NAME or not COLLECTION_NAME:
        # pymongo rejects empty database/collection names; report that up
        # front instead of after the slow browser-based page load.
        LOGGER.error(
            "An error occurred: %s",
            "DATABASE_NAME and COLLECTION_NAME must be set",
        )
        return

    try:
        loader = PlaywrightURLLoader(
            urls=URLS, remove_selectors=["header", "footer"]
        )
        data = loader.load()
        with MongoClient(MONGODB_URI) as client:
            collection = client[DATABASE_NAME][COLLECTION_NAME]
            # from_documents is a classmethod that builds the store and
            # inserts the documents with their embeddings, so no separate
            # vector-store instance is needed. The module-level embedding
            # model is reused rather than instantiating a second one.
            MongoDBAtlasVectorSearch.from_documents(
                data, gpt4all_embd, collection=collection
            )
        LOGGER.info("Documents inserted successfully.")
    except Exception as e:
        # Top-level boundary: keep the original message but also record the
        # traceback so the failure can be diagnosed.
        LOGGER.exception("An error occurred: %s", str(e))


if __name__ == "__main__":
    main()
@ranfysvalle02
Copy link
Author

To use the PlaywrightURLLoader, you will need to install playwright and unstructured. Additionally, you will need to install the Playwright Chromium browser:

Install playwright, unstructured, and the Playwright browsers

pip install "playwright"
pip install "unstructured"
playwright install
from langchain.document_loaders import PlaywrightURLLoader

# Pages to fetch with the Playwright-based loader.
urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
]

# Elements matching these selectors are removed from each page.
loader = PlaywrightURLLoader(
    urls=urls,
    remove_selectors=["header", "footer"],
)
data = loader.load()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment