@davidberenstein1957
Last active February 28, 2025 11:10
vector search on the Hugging Face Hub
# /// script
# requires-python = ">=3.11,<3.12"
# dependencies = [
#     "duckdb", "sentence-transformers", "huggingface_hub"
# ]
# ///
import duckdb
from huggingface_hub import get_token
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("TaylorAI/bge-micro-v2")


def similarity_search(
    query: str,
    k: int = 5,
    dataset_name: str = "smol-blueprint/hf-blogs-text-embeddings",
    embedding_column: str = "embedding",
):
    # Embed the query with the same model that produced the dataset's embeddings.
    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    # Query the dataset's Parquet files directly on the Hub via DuckDB's hf:// protocol,
    # ranking rows by cosine distance to the query embedding.
    sql = f"""
        SELECT
            *,
            array_cosine_distance(
                {embedding_column}::float[{embedding_dim}],
                {query_vector.tolist()}::float[{embedding_dim}]
            ) AS distance
        FROM 'hf://datasets/{dataset_name}/**/*.parquet'
        ORDER BY distance
        LIMIT {k}
    """
    return duckdb.sql(sql).to_df()


similarity_search("How can I use the Hub for vector search?")
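
Because the header carries inline dependency metadata (PEP 723), the file can be run as-is with a runner that understands that format, such as uv (uv run script.py, filename assumed). A quick sketch of inspecting what it returns; every column except distance depends on the dataset's schema, so treat those as assumptions:

# Sketch (not part of the original gist): inspect the nearest neighbours.
# "distance" is added by the query above; other columns depend on the dataset schema.
results = similarity_search("How can I use the Hub for vector search?", k=3)
print(results.columns)       # which text/metadata columns the dataset exposes
print(results["distance"])   # lower cosine distance = closer match
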
Riezebos commented Feb 28, 2025

Now that ibis supports fixed-length arrays, this operation can also be done without writing any SQL :)

# /// script
# requires-python = ">=3.11,<3.12"
# dependencies = [
#     "ibis-framework[duckdb]", sentence-transformers, huggingface_hub
# ]
# ///
import ibis
import ibis.expr.datatypes as dt
from ibis import _
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("TaylorAI/bge-micro-v2")


@ibis.udf.scalar.builtin
def array_cosine_distance(x, y) -> float:
    """Compute cosine similarity between two vectors."""


def similarity_search(
    query: str,
    k: int = 5,
    dataset_name: str = "smol-blueprint/hf-blogs-text-embeddings",
    embedding_column: str = "embedding",
):
    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    return (
        ibis.read_parquet(f"hf://datasets/{dataset_name}/**/*.parquet")
        .mutate(
            distance=array_cosine_distance(
                _[embedding_column].cast(dt.Array(dt.Float32(), length=embedding_dim)),
                ibis.array(query_vector).cast(dt.Array(dt.Float32(), length=embedding_dim)),
            )
        )
        .order_by(_.distance)
        .limit(k)
    )


similarity_search("How can I use the Hub for vector search?")
