Created
May 18, 2023 04:29
-
-
Save Haste171/bbaf03d85e541ab338f4b7ba4d6350d4 to your computer and use it in GitHub Desktop.
Ingest readthedocs.io documentation websites with LangChain and upsert the resulting embeddings into Pinecone.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import os | |
import tempfile | |
from urllib.parse import urljoin | |
from langchain.document_loaders import ReadTheDocsLoader | |
import aiohttp | |
from bs4 import BeautifulSoup | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import Pinecone | |
import pinecone | |
# Credentials and index settings are read from the environment so that real
# secrets never need to be written into this file. The '...' placeholders are
# kept as fallbacks for when a variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '...')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '...')
PINECONE_ENV = os.environ.get('PINECONE_ENV', '...')
PINECONE_INDEX = os.environ.get('PINECONE_INDEX', '...')
# Pinecone namespace passed to Pinecone.from_documents in main().
namespace = os.environ.get('PINECONE_NAMESPACE', '...')
async def download_file(session, url, output_directory):
    """Download ``url`` and store it beneath ``output_directory``.

    The local file path mirrors the path component of the URL, so pages that
    share a file name in different folders (for example several
    ``index.html`` files) do not overwrite one another.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with a ``status`` attribute and an awaitable
            ``read()`` (``main`` passes an ``aiohttp.ClientSession``).
        url: Absolute URL of the page to fetch.
        output_directory: Existing directory that receives the file.

    Returns:
        The path of the written file, or ``None`` when the download failed.
    """
    from urllib.parse import urlsplit

    url_path = urlsplit(url).path
    # Dropping empty, '.' and '..' segments keeps the target inside
    # output_directory regardless of how the URL is written.
    segments = [part for part in url_path.split('/') if part not in ('', '.', '..')]
    if not segments or url_path.endswith('/'):
        # A directory-style URL has no file name of its own.
        segments.append('index.html')
    file_name = os.path.join(output_directory, *segments)

    try:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Failed to download: {url}")
                return None
            file_content = await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError) as error:
        # Report a broken connection like a bad status instead of letting it
        # propagate and abort every other download in the batch.
        print(f"Failed to download: {url} ({error})")
        return None

    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'wb') as file:
        file.write(file_content)
    print(f"Downloaded: {url}")
    return file_name
async def main(base_url="https://langchain.readthedocs.io/en/latest/"):
    """Download the pages linked from ``base_url`` and upsert them to Pinecone.

    The index page is fetched, every linked ``.html`` page is downloaded into
    a temporary directory, the directory is loaded with ``ReadTheDocsLoader``,
    split into chunks, embedded with OpenAI and written to the Pinecone index
    named by ``PINECONE_INDEX`` under ``namespace``.

    Args:
        base_url: Index page whose links are followed. The default is the
            example URL the script was written against.
    """
    from urllib.parse import urldefrag

    # The temporary directory must stay alive until the loader has read it.
    with tempfile.TemporaryDirectory() as temp_dir:
        async with aiohttp.ClientSession() as session:
            async with session.get(base_url) as response:
                if response.status != 200:
                    print("Failed to retrieve the page.")
                    # Nothing was downloaded, so there is nothing to ingest.
                    return
                soup = BeautifulSoup(await response.text(), "html.parser")

            # Strip '#fragment' anchors before the '.html' check and keep each
            # page once, so the same file is never fetched or written twice.
            page_urls = []
            seen = set()
            for link in soup.find_all("a", href=True):
                file_url = urldefrag(urljoin(base_url, link['href'])).url
                if file_url.endswith('.html') and file_url not in seen:
                    seen.add(file_url)
                    page_urls.append(file_url)

            # return_exceptions keeps one broken page from cancelling the rest.
            results = await asyncio.gather(
                *(download_file(session, file_url, temp_dir) for file_url in page_urls),
                return_exceptions=True,
            )
            for file_url, result in zip(page_urls, results):
                if isinstance(result, Exception):
                    print(f"Failed to download: {file_url} ({result})")

        loader = ReadTheDocsLoader(temp_dir, features='html.parser', encoding='utf-8')
        docs = loader.load()
        if not docs:
            print("No documents were loaded; nothing to upsert.")
            return

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        texts = text_splitter.split_documents(docs)

        pinecone.init(
            api_key=PINECONE_API_KEY,
            environment=PINECONE_ENV,
        )
        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=OPENAI_API_KEY)
        Pinecone.from_documents(texts, embeddings, index_name=PINECONE_INDEX, namespace=namespace)
        print('loaded!')
if __name__ == "__main__":
    # Run the ingestion pipeline only when the file is executed as a script.
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment