Created
June 11, 2024 03:11
-
-
Save bclavie/f7b041328615d52cf5c0a9caaf03fd5e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch some text content in two different categories
from wikipediaapi import Wikipedia

wiki = Wikipedia('RAGBot/0.0', 'en')

# Each Wikipedia page is split on blank lines into paragraph-sized chunks, and
# every chunk is tagged with a category that can be filtered on at query time.
docs = []
for page_title, category in (("Hayao_Miyazaki", "person"), ("Spirited_Away", "film")):
    docs += [
        {"text": paragraph, "category": category}
        for paragraph in wiki.page(page_title).text.split('\n\n')
        if paragraph.strip()  # skip blank chunks: there is nothing to embed or index
    ]
# Enter LanceDB
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry

# Initialise the embedding model: look up the sentence-transformers provider
# in LanceDB's embedding registry, then instantiate it for a specific checkpoint.
model_registry = get_registry().get("sentence-transformers")
model = model_registry.create(name="BAAI/bge-small-en-v1.5")
# Create a Model to store attributes for filtering
class Document(LanceModel):
    """Row schema for the table: a text chunk, its embedding, and a category tag."""

    # Source field of the embedding model: the text the vector is computed from.
    text: str = model.SourceField()
    # Vector field of the embedding model. The width is taken from the model
    # itself (384 for BAAI/bge-small-en-v1.5) so it stays in sync if the
    # checkpoint is changed.
    vector: Vector(model.ndims()) = model.VectorField()
    # Plain metadata column, used to filter results by document category.
    category: str
db = lancedb.connect(".my_db")
# mode="overwrite" replaces a table left over from a previous run; the default
# mode raises if "my_table" already exists, which breaks re-running the script.
tbl = db.create_table("my_table", schema=Document, mode="overwrite")
# Embed the documents and store them in the database
tbl.add(docs)
# Generate the full-text (tf-idf) search index
# replace=True rebuilds the index if one is already present from an earlier run.
tbl.create_fts_index("text", replace=True)
# Initialise a reranker -- here, Cohere's API one
# NOTE(review): presumably needs a Cohere API key configured in the environment -- confirm.
from lancedb.rerankers import CohereReranker

reranker = CohereReranker()

query = "What is Chihiro's new name given to her by the witch?"

results = (
    tbl.search(query, query_type="hybrid")  # Hybrid means text + vector
    .where("category = 'film'", prefilter=True)  # Restrict to only docs in the 'film' category
    .limit(10)  # Get 10 results from first-pass retrieval
    .rerank(reranker=reranker)  # For the reranker to compute the final ranking
)
# Materialise the reranked results as a pandas DataFrame (shown when this is
# the last expression of a notebook cell; the value is discarded in a script).
results.to_pandas()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment