Skip to content

Instantly share code, notes, and snippets.

@Haste171
Created May 18, 2023 04:29
Show Gist options
  • Save Haste171/bbaf03d85e541ab338f4b7ba4d6350d4 to your computer and use it in GitHub Desktop.
Ingest readthedocs.io documentation websites with LangChain and upsert the resulting embeddings to Pinecone.
import asyncio
import os
import tempfile
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup
from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
import pinecone
OPENAI_API_KEY='...'
PINECONE_API_KEY='...'
PINECONE_ENV='...'
PINECONE_INDEX='...'
namespace = '...'
async def download_file(session, url, output_directory):
    """Fetch *url* and save its body into *output_directory*.

    The local filename is derived from the full URL path (with ``/``
    flattened to ``_``) instead of just ``os.path.basename(url)``.
    ReadTheDocs sites serve many pages named ``index.html``; using only
    the basename made every such page overwrite the previous one.

    Args:
        session: An open ``aiohttp.ClientSession`` to reuse.
        url: Absolute URL of the page to download.
        output_directory: Existing local directory to write the file into.
    """
    async with session.get(url) as response:
        if response.status == 200:
            # Flatten the URL path into a unique, filesystem-safe name so
            # pages from different directories cannot collide.
            url_path = urlparse(url).path.lstrip('/')
            safe_name = url_path.replace('/', '_') or os.path.basename(url)
            file_name = os.path.join(output_directory, safe_name)
            file_content = await response.read()
            with open(file_name, 'wb') as file:
                file.write(file_content)
            print(f"Downloaded: {url}")
        else:
            print(f"Failed to download: {url}")
async def main():
    """Crawl a ReadTheDocs site, embed its pages, and upsert them to Pinecone.

    Steps: download every linked ``.html`` page from ``base_url`` into a
    temporary directory, parse them with ``ReadTheDocsLoader``, split into
    overlapping chunks, embed with OpenAI, and upsert into the configured
    Pinecone index/namespace.
    """
    base_url = "https://langchain.readthedocs.io/en/latest/"  # example url for testing (compatible)

    # Temporary directory holds the downloaded pages; it is removed on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        async with aiohttp.ClientSession() as session:
            async with session.get(base_url) as response:
                if response.status == 200:
                    soup = BeautifulSoup(await response.text(), "html.parser")
                    # Collect one download task per .html link, then run
                    # them concurrently on the shared session.
                    tasks = []
                    for link in soup.find_all("a", href=True):
                        file_url = urljoin(base_url, link['href'])
                        if file_url.endswith('.html'):
                            tasks.append(download_file(session, file_url, temp_dir))
                    await asyncio.gather(*tasks)
                else:
                    print("Failed to retrieve the page.")

        loader = ReadTheDocsLoader(temp_dir, features='html.parser', encoding='utf-8')
        docs = loader.load()
        # Guard: if the crawl produced nothing (base page failed or no .html
        # links), skip the costly embedding call and the pointless upsert.
        if not docs:
            print("No documents ingested; skipping embedding and upsert.")
            return

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        texts = text_splitter.split_documents(docs)

        pinecone.init(
            api_key=PINECONE_API_KEY,
            environment=PINECONE_ENV
        )
        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=OPENAI_API_KEY)
        Pinecone.from_documents(texts, embeddings, index_name=PINECONE_INDEX, namespace=namespace)
        print('loaded!')
# Script entry point: drive the async pipeline with asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment