Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jkaunert/871fdf991d7bd9dd19e4fddaaabba7ca to your computer and use it in GitHub Desktop.
Save jkaunert/871fdf991d7bd9dd19e4fddaaabba7ca to your computer and use it in GitHub Desktop.
vector search on the Hugging Face Hub
from sentence_transformers import SentenceTransformer
import duckdb
from huggingface_hub import get_token
# Sentence-embedding model shared by similarity_search(): it encodes the query
# text and reports the embedding dimension used in the SQL casts. Created once
# at module level so repeated searches reuse the same instance.
# NOTE(review): presumably the dataset's stored embeddings were produced with
# this same model (same dimension) — confirm against the dataset card.
model = SentenceTransformer("TaylorAI/bge-micro-v2")
def similarity_search(
    query: str,
    k: int = 5,
    dataset_name: str = "smol-blueprint/hf-blogs-text-embeddings",
    embedding_column: str = "embedding",
):
    """Return the ``k`` dataset rows whose embeddings are closest to ``query``.

    The query text is encoded with the module-level ``model``; DuckDB then
    scans every Parquet file under ``hf://datasets/<dataset_name>/`` and ranks
    rows by cosine distance between the stored embedding and the query vector.

    Args:
        query: Free text to search for.
        k: Maximum number of rows to return; coerced with ``int()``.
        dataset_name: Hub dataset repository id (``"owner/name"``).
        embedding_column: Name of the column holding the stored embeddings.

    Returns:
        The DataFrame produced by DuckDB's ``to_df()``: all original columns
        plus a ``distance`` column, ordered by ascending distance (closest
        match first).

    Raises:
        TypeError, ValueError: If ``k`` cannot be converted to an integer.
    """
    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    # These values are spliced into the SQL text, so neutralise anything that
    # could terminate the literal/identifier early or inject extra SQL.
    limit = int(k)
    safe_dataset = dataset_name.replace("'", "''")
    # Double-quoting lets column names containing spaces or reserved words
    # work; embedded double quotes are escaped by doubling them.
    safe_column = '"' + embedding_column.replace('"', '""') + '"'

    # Both operands are cast to a fixed-size float array of the model's
    # dimension, which is the form array_cosine_distance operates on.
    sql = f"""
        SELECT
            *,
            array_cosine_distance(
                {safe_column}::float[{embedding_dim}],
                {query_vector.tolist()}::float[{embedding_dim}]
            ) AS distance
        FROM 'hf://datasets/{safe_dataset}/**/*.parquet'
        ORDER BY distance
        LIMIT {limit}
    """
    return duckdb.sql(sql).to_df()
# Example usage: run a search with the default k, dataset and embedding column.
# The returned DataFrame is not assigned here (presumably meant to be displayed
# by a notebook/REPL — TODO confirm).
similarity_search("How can I use the Hub for vector search?")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment