Forked from davidberenstein1957/vector_search_hub_datasets.py
Created
January 16, 2025 04:39
-
-
Save jkaunert/871fdf991d7bd9dd19e4fddaaabba7ca to your computer and use it in GitHub Desktop.
vector search on the Hugging Face Hub
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import duckdb
from huggingface_hub import get_token
from sentence_transformers import SentenceTransformer
# Embedding model shared by every similarity_search() call; loaded once at
# import time so repeated queries reuse the same instance.
model = SentenceTransformer("TaylorAI/bge-micro-v2")
def similarity_search(
    query: str,
    k: int = 5,
    dataset_name: str = "smol-blueprint/hf-blogs-text-embeddings",
    embedding_column: str = "embedding",
):
    """Return the ``k`` rows of a Hub dataset closest to ``query``.

    The query text is embedded with the module-level ``model``; DuckDB then
    scans the dataset's Parquet files over ``hf://`` and ranks every row by
    cosine distance between its stored embedding and the query embedding.

    Args:
        query: Free text to search for.
        k: Maximum number of rows to return.
        dataset_name: Hub dataset repository id, e.g. ``"user/dataset"``.
        embedding_column: Name of the column that holds the stored
            embeddings. It is cast to a fixed-size float array whose length
            is the model's embedding dimension.

    Returns:
        The result of ``duckdb.sql(...).to_df()``: all dataset columns plus a
        ``distance`` column, ordered by ascending distance.

    Note:
        ``dataset_name``, ``embedding_column`` and ``k`` are interpolated
        into the SQL text verbatim, so they must come from trusted input.
    """
    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    # Both operands are cast to float[embedding_dim] because
    # array_cosine_distance is applied to fixed-size arrays of equal length.
    sql = f"""
        SELECT
            *,
            array_cosine_distance(
                {embedding_column}::float[{embedding_dim}],
                {query_vector.tolist()}::float[{embedding_dim}]
            ) as distance
        FROM 'hf://datasets/{dataset_name}/**/*.parquet'
        ORDER BY distance
        LIMIT {k}
    """
    return duckdb.sql(sql).to_df()
# Example query against the default dataset.
similarity_search("How can I use the Hub for vector search?")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment