Haste171 · May 18, 2023 04:29
diff --git a/langchain_ingest_docs.py b/langchain_ingest_docs.py
 import asyncio
 import os
 import tempfile
 from urllib.parse import urljoin
 from langchain.document_loaders import ReadTheDocsLoader

 import aiohttp
 from bs4 import BeautifulSoup

 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Pinecone
 import pinecone

 # Service settings used by the ingestion script. The values are
 # placeholders and must be filled in before running.
 OPENAI_API_KEY = "..."
 PINECONE_API_KEY = "..."
 PINECONE_ENV = "..."
 PINECONE_INDEX = "..."

 # Namespace handed to Pinecone.from_documents when the chunks are stored.
 namespace = "..."



 async def download_file(session, url, output_directory):
    """Fetch ``url`` with ``session`` and save the body in ``output_directory``.

    The local file name is the last segment of the URL *path*, so a query
    string or fragment never ends up in the name. When the path ends with
    ``/`` (no last segment) the file is saved as ``index.html``.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``
            (the caller in this file passes an ``aiohttp.ClientSession``).
        url: Absolute URL of the page to download.
        output_directory: Existing directory that receives the file.

    A status line is printed for both outcomes. On a non-200 response
    nothing is written.
    """
    # Function-scope import: the module header only imports ``urljoin``.
    from urllib.parse import urlsplit

    async with session.get(url) as response:
        if response.status != 200:
            print(f"Failed to download: {url}")
            return
        file_content = await response.read()

    # basename() of the raw URL would keep "?query"/"#fragment" in the name
    # and would be empty for ".../", making open() target the directory.
    # NOTE: pages that share a last path segment still map to the same file.
    file_name = os.path.basename(urlsplit(url).path) or "index.html"
    file_path = os.path.join(output_directory, file_name)
    with open(file_path, 'wb') as file:
        file.write(file_content)
    print(f"Downloaded: {url}")


 async def main(base_url="https://langchain.readthedocs.io/en/latest/"):
    """Download the HTML pages linked from ``base_url`` and index them in Pinecone.

    The page at ``base_url`` is fetched and every ``<a href>`` that resolves
    to a URL ending in ``.html`` is downloaded into a temporary directory
    with ``download_file``. Only links on that one page are followed. The
    files are then loaded with ``ReadTheDocsLoader``, split with
    ``RecursiveCharacterTextSplitter`` (``chunk_size=2000``,
    ``chunk_overlap=100``), embedded with ``OpenAIEmbeddings`` and stored in
    the Pinecone index ``PINECONE_INDEX`` under ``namespace``.

    Args:
        base_url: Index page whose links are followed. Defaults to the
            example URL that used to be hard-coded ("example url for
            testing (compatible)").
    """
    # The temporary directory must stay alive until the loader has read it.
    with tempfile.TemporaryDirectory() as temp_dir:
        async with aiohttp.ClientSession() as session:
            async with session.get(base_url) as response:
                if response.status != 200:
                    # Without the index page there is nothing to ingest, so
                    # stop instead of embedding an empty directory.
                    print("Failed to retrieve the page.")
                    return
                soup = BeautifulSoup(await response.text(), "html.parser")

            # Collect each target once, in document order. Fragments are
            # dropped first so "page.html#section" counts as "page.html".
            page_urls = []
            seen = set()
            for link in soup.find_all("a", href=True):
                file_url = urljoin(base_url, link['href']).partition('#')[0]
                if file_url.endswith('.html') and file_url not in seen:
                    seen.add(file_url)
                    page_urls.append(file_url)

            # return_exceptions=True keeps one broken link from aborting
            # the remaining downloads.
            results = await asyncio.gather(
                *(download_file(session, page_url, temp_dir) for page_url in page_urls),
                return_exceptions=True,
            )
            for page_url, result in zip(page_urls, results):
                if isinstance(result, BaseException):
                    print(f"Failed to download: {page_url} ({result!r})")

        loader = ReadTheDocsLoader(temp_dir, features='html.parser', encoding='utf-8')
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        texts = text_splitter.split_documents(docs)
        if not texts:
            # Nothing was downloaded or parsed; skip the embedding/upsert calls.
            print("No documents to index.")
            return

        pinecone.init(
            api_key=PINECONE_API_KEY,
            environment=PINECONE_ENV
        )
        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=OPENAI_API_KEY)
        Pinecone.from_documents(texts, embeddings, index_name=PINECONE_INDEX, namespace=namespace)
        print('loaded!')

 # Script entry point: run the ingestion coroutine to completion only when
 # this file is executed directly, not when it is imported.
 if __name__ == "__main__":
    asyncio.run(main())
	import asyncio
	import os
	import tempfile
	from urllib.parse import urljoin
	from langchain.document_loaders import ReadTheDocsLoader

	import aiohttp
	from bs4 import BeautifulSoup

	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.vectorstores import Pinecone
	import pinecone

	# Service settings used by the ingestion script. The values are
	# placeholders and must be filled in before running.
	OPENAI_API_KEY = "..."
	PINECONE_API_KEY = "..."
	PINECONE_ENV = "..."
	PINECONE_INDEX = "..."

	# Namespace handed to Pinecone.from_documents when the chunks are stored.
	namespace = "..."



	async def download_file(session, url, output_directory):
	    """Fetch ``url`` with ``session`` and save the body in ``output_directory``.

	    The local file name is the last segment of the URL *path*, so a query
	    string or fragment never ends up in the name. When the path ends with
	    ``/`` (no last segment) the file is saved as ``index.html``.

	    Args:
	        session: Object whose ``get(url)`` returns an async context manager
	            yielding a response with ``status`` and an awaitable ``read()``
	            (the caller in this file passes an ``aiohttp.ClientSession``).
	        url: Absolute URL of the page to download.
	        output_directory: Existing directory that receives the file.

	    A status line is printed for both outcomes. On a non-200 response
	    nothing is written.
	    """
	    # Function-scope import: the module header only imports ``urljoin``.
	    from urllib.parse import urlsplit

	    async with session.get(url) as response:
	        if response.status != 200:
	            print(f"Failed to download: {url}")
	            return
	        file_content = await response.read()

	    # basename() of the raw URL would keep "?query"/"#fragment" in the name
	    # and would be empty for ".../", making open() target the directory.
	    # NOTE: pages that share a last path segment still map to the same file.
	    file_name = os.path.basename(urlsplit(url).path) or "index.html"
	    file_path = os.path.join(output_directory, file_name)
	    with open(file_path, 'wb') as file:
	        file.write(file_content)
	    print(f"Downloaded: {url}")


	async def main(base_url="https://langchain.readthedocs.io/en/latest/"):
	    """Download the HTML pages linked from ``base_url`` and index them in Pinecone.

	    The page at ``base_url`` is fetched and every ``<a href>`` that resolves
	    to a URL ending in ``.html`` is downloaded into a temporary directory
	    with ``download_file``. Only links on that one page are followed. The
	    files are then loaded with ``ReadTheDocsLoader``, split with
	    ``RecursiveCharacterTextSplitter`` (``chunk_size=2000``,
	    ``chunk_overlap=100``), embedded with ``OpenAIEmbeddings`` and stored in
	    the Pinecone index ``PINECONE_INDEX`` under ``namespace``.

	    Args:
	        base_url: Index page whose links are followed. Defaults to the
	            example URL that used to be hard-coded ("example url for
	            testing (compatible)").
	    """
	    # The temporary directory must stay alive until the loader has read it.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        async with aiohttp.ClientSession() as session:
	            async with session.get(base_url) as response:
	                if response.status != 200:
	                    # Without the index page there is nothing to ingest, so
	                    # stop instead of embedding an empty directory.
	                    print("Failed to retrieve the page.")
	                    return
	                soup = BeautifulSoup(await response.text(), "html.parser")

	            # Collect each target once, in document order. Fragments are
	            # dropped first so "page.html#section" counts as "page.html".
	            page_urls = []
	            seen = set()
	            for link in soup.find_all("a", href=True):
	                file_url = urljoin(base_url, link['href']).partition('#')[0]
	                if file_url.endswith('.html') and file_url not in seen:
	                    seen.add(file_url)
	                    page_urls.append(file_url)

	            # return_exceptions=True keeps one broken link from aborting
	            # the remaining downloads.
	            results = await asyncio.gather(
	                *(download_file(session, page_url, temp_dir) for page_url in page_urls),
	                return_exceptions=True,
	            )
	            for page_url, result in zip(page_urls, results):
	                if isinstance(result, BaseException):
	                    print(f"Failed to download: {page_url} ({result!r})")

	        loader = ReadTheDocsLoader(temp_dir, features='html.parser', encoding='utf-8')
	        docs = loader.load()
	        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
	        texts = text_splitter.split_documents(docs)
	        if not texts:
	            # Nothing was downloaded or parsed; skip the embedding/upsert calls.
	            print("No documents to index.")
	            return

	        pinecone.init(
	            api_key=PINECONE_API_KEY,
	            environment=PINECONE_ENV
	        )
	        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=OPENAI_API_KEY)
	        Pinecone.from_documents(texts, embeddings, index_name=PINECONE_INDEX, namespace=namespace)
	        print('loaded!')

	# Script entry point: run the ingestion coroutine to completion only when
	# this file is executed directly, not when it is imported.
	if __name__ == "__main__":
	    asyncio.run(main())