Skip to content

Instantly share code, notes, and snippets.

@markwatson
Created February 28, 2025 22:51
Show Gist options
  • Save markwatson/9b4740c53a5f800125373a36d47d652a to your computer and use it in GitHub Desktop.
Save markwatson/9b4740c53a5f800125373a36d47d652a to your computer and use it in GitHub Desktop.
Text similarity using Linq-Embed-Mistral
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
_EMBEDDING_MODEL_NAME = "Linq-AI-Research/Linq-Embed-Mistral"

# Process-wide cache so the (large) model is constructed once instead of on
# every call to find_similar_texts.
_embedding_model_cache = {}


def _get_embedding_model():
    """Return the shared SentenceTransformer, loading it on first use."""
    model = _embedding_model_cache.get(_EMBEDDING_MODEL_NAME)
    if model is None:
        model = SentenceTransformer(
            _EMBEDDING_MODEL_NAME,
            # Request half-precision weights, as the original code did.
            model_kwargs={'torch_dtype': torch.float16},
        )
        _embedding_model_cache[_EMBEDDING_MODEL_NAME] = model
    return model


def find_similar_texts(query_text, texts, k=3):
    """
    Find the texts most similar to a query using Sentence Transformers.

    The query and the candidate texts are embedded with the
    Linq-Embed-Mistral model and ranked by cosine similarity.

    Args:
        query_text (str): The text to find similar texts to.
        texts (list[str]): The texts to search within. Any iterable of
            strings is accepted; it is materialised into a list.
        k (int): The number of most similar texts to return. ``None``
            returns every text; zero or a negative value returns ``[]``.

    Returns:
        list[tuple[str, float]]: Up to ``k`` ``(text, similarity)`` tuples,
        ordered from most to least similar. Texts with equal scores keep
        their input order.
    """
    texts = list(texts)

    # Nothing to rank: return before paying for the model load / encode.
    # (A negative k previously fell through to slice semantics and returned
    # "all but the last |k|" results, which is never what a caller wants.)
    if not texts or (k is not None and k <= 0):
        return []

    model = _get_embedding_model()
    query_embedding = model.encode([query_text])[0]  # encode() takes a batch
    text_embeddings = model.encode(texts)

    similarities = [
        cosine_similarity(query_embedding, text_embedding)
        for text_embedding in text_embeddings
    ]

    # Stable sort on the negated scores gives descending order while keeping
    # ties in input order; NaN scores sort to the end rather than the front.
    descending = np.argsort(
        -np.asarray(similarities, dtype=np.float64), kind="stable"
    )
    return [(texts[i], similarities[i]) for i in descending[:k]]
def cosine_similarity(v1, v2):
    """Calculate the cosine similarity between two vectors.

    Args:
        v1: First vector (any 1-D array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        float: The cosine of the angle between ``v1`` and ``v2``, or ``0.0``
        if either vector has zero magnitude.
    """
    # Promote to float64 before any arithmetic: with low-precision inputs
    # such as float16, the dot product and norms can overflow to inf (giving
    # NaN) or lose most of their precision.
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)

    magnitude_a = np.linalg.norm(a)
    magnitude_b = np.linalg.norm(b)
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0  # Handle zero vectors: the angle is undefined

    return float(np.dot(a, b) / (magnitude_a * magnitude_b))
def main():
    """Run the example: rank a few sample sentences against a query and print them."""
    texts = [
        "The quick brown fox jumps over the lazy dog.",
        "A fast brown fox leaps over a sleeping dog.",
        "The cat sat on the mat.",
        "Dogs and cats are different animals.",
        "A speedy brown fox jumped over the tired dog.",
    ]
    query_text = "A rapid brown fox jumped over a sleepy dog."

    similar_texts = find_similar_texts(query_text, texts)

    print("Similar texts:")
    for text, score in similar_texts:
        print(f"- {text} (Similarity: {score})")


# Example usage. Guarded so that importing this module does not load the
# embedding model or print anything; the demo only runs as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment