Skip to content

Instantly share code, notes, and snippets.

@davidmezzetti
Created July 1, 2024 13:01
Show Gist options
  • Save davidmezzetti/43a60af6e135a031c0bfb4de02bc11f4 to your computer and use it in GitHub Desktop.
Save davidmezzetti/43a60af6e135a031c0bfb4de02bc11f4 to your computer and use it in GitHub Desktop.
import polars as pl
import faiss
from llama_index.core.schema import TextNode
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
# Sample headlines used as the corpus for the indexing examples in this file
data = [
    "US tops 5 million confirmed virus cases",
    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
    "The National Park Service warns against sacrificing slower friends in a bear attack",
    "Maine man wins $1M from $25 lottery ticket",
    "Make huge profits without work, earn up to $100,000 a day",
]
# Create vector store
embeddings = HuggingFaceEmbedding(
    model_name="sentence-transformers/nli-mpnet-base-v2",
)

# The flat L2 index is created with a fixed dimension; 768 has to match the
# length of the vectors produced by the embedding model configured above.
store = FaissVectorStore(faiss_index=faiss.IndexFlatL2(768))

# TextNode's identifier field is `id_` and holds a string. An `id=` keyword
# is not a declared field, so the position is passed as `id_=str(i)` to make
# sure each node actually carries the intended identifier.
store.add(
    [
        TextNode(
            id_=str(i),
            text=x,
            embedding=embeddings.get_text_embedding(x),
        )
        for i, x in enumerate(data)
    ]
)
# Embed the query text, then run it against the vector store
query = embeddings.get_text_embedding("feel good story")
request = VectorStoreQuery(query_embedding=query)
results = store.query(request)

# Wrap the single result object in a list so it is rendered as one row.
# NOTE(review): with the IndexFlatL2 index, the "similarities" value is
# presumably a distance rather than a normalized score - confirm.
pl.from_dicts([results])
# ┌───────┬──────────────┬───────────┐
# │ nodes ┆ similarities ┆ ids │
# │ --- ┆ --- ┆ --- │
# │ null ┆ list[f32] ┆ list[str] │
# ╞═══════╪══════════════╪═══════════╡
# │ null ┆ [1.83342] ┆ ["4"] │
# └───────┴──────────────┴───────────┘
#################################################
import polars as pl
import txtai
# Build the txtai embeddings index with content storage enabled.
# Uses SQLite + Faiss.
embeddings = txtai.Embeddings(
    path="sentence-transformers/nli-mpnet-base-v2",
    content=True,
)

# Each record carries the headline text plus its character count as an
# extra "length" field; records are produced lazily while indexing.
records = ({"text": x, "length": len(x)} for x in data)
embeddings.index(records)
# SQL query selecting the stored columns (including the custom "length"
# field) for rows matched by the similar() clause
sql = """SELECT id, text, length, score
FROM txtai
WHERE similar('feel good story')"""

# Run the search, capped at a single result
results = embeddings.search(sql, limit=1)

# Polars and Pandas DataFrame support
pl.from_dicts(results)
# ┌─────┬─────────────────────────────────┬────────┬──────────┐
# │ id ┆ text ┆ length ┆ score │
# │ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ i64 ┆ f64 │
# ╞═════╪═════════════════════════════════╪════════╪══════════╡
# │ 4 ┆ Maine man wins $1M from $25 lo… ┆ 42 ┆ 0.08329 │
# └─────┴─────────────────────────────────┴────────┴──────────┘
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment