Last active
February 28, 2025 11:10
-
-
Save davidberenstein1957/f0157a471ec59d9dd44ae6957f1d52ec to your computer and use it in GitHub Desktop.
vector search on the Hugging Face Hub
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script
# requires-python = ">=3.11,<3.12"
# dependencies = [
#     "duckdb", "sentence-transformers", "huggingface_hub",
# ]
# ///
from sentence_transformers import SentenceTransformer | |
import duckdb | |
from huggingface_hub import get_token | |
# Sentence-embedding model, loaded once at module level and reused by
# similarity_search() to encode each query.
# NOTE(review): presumably the same model that produced the dataset's
# embedding column -- confirm before pointing at a different dataset.
model = SentenceTransformer("TaylorAI/bge-micro-v2")
def similarity_search(
    query: str,
    k: int = 5,
    dataset_name: str = "smol-blueprint/hf-blogs-text-embeddings",
    embedding_column: str = "embedding",
):
    """Return the ``k`` dataset rows nearest to ``query`` by cosine distance.

    The query text is encoded with the module-level ``model``; DuckDB then
    computes ``array_cosine_distance`` between that vector and
    ``embedding_column`` for every row of the dataset's Parquet files, read
    through an ``hf://datasets/...`` path.

    Args:
        query: Text to search for.
        k: Maximum number of rows to return.
        dataset_name: Hub dataset repository id whose Parquet files are scanned.
        embedding_column: Name of the column holding the stored embeddings.

    Returns:
        The query result converted with DuckDB's ``to_df()``: all original
        columns plus a ``distance`` column, sorted by ascending distance.

    Note:
        ``k``, ``dataset_name`` and ``embedding_column`` are interpolated
        directly into the SQL text, so only pass trusted values.
    """
    # The list repr (e.g. "[0.1, 0.2, ...]") is embedded in the SQL as a
    # list literal.
    query_values = model.encode(query).tolist()
    # Both operands are cast to the same fixed-size array type, sized to the
    # model's embedding dimension.
    vector_type = f"float[{model.get_sentence_embedding_dimension()}]"
    statement = f"""
    SELECT
        *,
        array_cosine_distance(
            {embedding_column}::{vector_type},
            {query_values}::{vector_type}
        ) as distance
    FROM 'hf://datasets/{dataset_name}/**/*.parquet'
    ORDER BY distance
    LIMIT {k}
    """
    relation = duckdb.sql(statement)
    return relation.to_df()
if __name__ == "__main__":
    # Example query. The result is printed so that running this file as a
    # script shows the nearest rows instead of silently discarding them; the
    # guard keeps an import of this module from triggering the remote query.
    print(similarity_search("How can I use the Hub for vector search?"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Now that ibis supports fixed-length arrays, this operation can also be done without writing any SQL :)