Created
October 30, 2023 00:22
-
-
Save amotl/f8a62404e23a172f0671842e965dae48 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
pip install requests 'requests-cache<2' | |
""" | |
import os | |
import requests_cache | |
import typing as t | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.schema import Document | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.vectorstores.cratedb import CrateDBVectorSearch | |
from unstructured.partition.html import partition_html | |
# CrateDB connection URL. The `CRATEDB_CONNECTION_STRING` environment
# variable takes precedence; otherwise a local instance is assumed.
CONNECTION_STRING = os.getenv(
    "CRATEDB_CONNECTION_STRING",
    default="crate://crate@localhost/?schema=notebook",
)

# Name of the vector store collection the demo writes to and queries.
COLLECTION_NAME = "state_of_the_union_test"

# Embedding backend shared by the whole script, built with library defaults.
embeddings = OpenAIEmbeddings()

# HTTP session with response caching, using the cache name `.httpcache`.
http = requests_cache.CachedSession(cache_name=".httpcache")
def document_from_url(url: str) -> Document:
    """
    Convert URL resource into LangChain Document.

    The resource is fetched through the module-level cached `http` session,
    partitioned into elements with `partition_html`, and the string forms of
    those elements are joined with blank lines to form the page content.

    :param url: Address of the resource to download.
    :return: A `Document` whose `page_content` is the joined element text and
        whose metadata records the originating URL under the `source` key.
    :raises requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = http.get(url)
    # Fail loudly on HTTP errors; otherwise the body of an error page would
    # be partitioned and indexed as if it were the requested document.
    response.raise_for_status()
    elements = partition_html(text=response.text)
    text = "\n\n".join(str(el) for el in elements)
    return Document(page_content=text, metadata={"source": url})
def load_documents(url: str) -> t.List[Document]:
    """
    Fetch the resource at `url` and break it up into smaller documents.

    The resource is first turned into a single document by
    `document_from_url`, then handed to a `CharacterTextSplitter` configured
    with a chunk size of 1000 and no overlap between chunks.

    :param url: Address of the resource to download and split.
    :return: The documents produced by the splitter.
    """
    source_document = document_from_url(url)
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return splitter.split_documents([source_document])
def main():
    """
    Run the demo: acquire the text, load it into CrateDB, and query it.

    Progress is reported on stdout. The target collection is deleted and
    re-created on every run (`pre_delete_collection=True`), and the final
    line prints how many results the similarity search returned.
    """
    # Step 1: download the source text and split it into documents.
    print("Acquiring")
    source_url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
    chunks = load_documents(source_url)

    # Step 2: embed the documents and store them, replacing any previous
    # content of the collection.
    print("Loading")
    store = CrateDBVectorSearch.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )

    # Step 3: run a similarity search and report the number of hits.
    print("Querying")
    docs_with_score = store.similarity_search_with_score("foo", k=10)
    print(f"Result count: {len(docs_with_score)}")


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment