Created
February 28, 2025 22:51
-
-
Save markwatson/9b4740c53a5f800125373a36d47d652a to your computer and use it in GitHub Desktop.
Text similarity using Linq-Embed-Mistral
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sentence_transformers import SentenceTransformer | |
import numpy as np | |
import torch | |
def find_similar_texts(query_text, texts, k=3):
    """
    Finds the texts most similar to a query using Sentence Transformers.

    The query and every candidate are embedded with the
    ``Linq-AI-Research/Linq-Embed-Mistral`` model (requested in float16) and
    the candidates are ranked by cosine similarity to the query.

    Args:
        query_text (str): The text to find similar texts to.
        texts (list[str]): A list of texts to search within.
        k (int): The maximum number of most similar texts to return.

    Returns:
        list[tuple[str, float]]: Up to ``k`` ``(text, score)`` tuples ordered
            from most to least similar. Empty when ``texts`` is empty or
            ``k`` is not positive.
    """
    # Nothing to rank: bail out before paying for the model load. This also
    # stops a negative k from slicing items off the wrong end of the ranking.
    if k <= 0 or len(texts) == 0:
        return []

    # NOTE: the model is constructed on every call; callers issuing many
    # queries may want to hoist this out and reuse a single instance.
    model = SentenceTransformer(
        "Linq-AI-Research/Linq-Embed-Mistral",
        model_kwargs={'torch_dtype': torch.float16},
    )
    query_embedding = model.encode([query_text])[0]  # single query vector
    text_embeddings = model.encode(texts)  # one embedding per candidate

    # Convert each score to a plain Python float so the return value matches
    # the documented type regardless of the embedding dtype.
    similarities = [
        float(cosine_similarity(query_embedding, text_embedding))
        for text_embedding in text_embeddings
    ]

    # argsort is ascending; reverse it for most-similar-first, then keep k.
    sorted_indices = np.argsort(similarities)[::-1]
    return [(texts[i], similarities[i]) for i in sorted_indices[:k]]
def cosine_similarity(v1, v2):
    """Calculates cosine similarity between two vectors.

    Args:
        v1: First vector (any 1-D array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        float: ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` if either vector
            has zero magnitude.
    """
    # Upcast before doing any arithmetic: half-precision inputs can overflow
    # to inf in the dot product / norm, which would turn the result into nan.
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)

    magnitude_v1 = np.linalg.norm(a)
    magnitude_v2 = np.linalg.norm(b)
    if magnitude_v1 == 0 or magnitude_v2 == 0:
        return 0.0  # Handle zero vectors (similarity is undefined)

    return float(np.dot(a, b) / (magnitude_v1 * magnitude_v2))
# Demo: rank a few sample sentences against a paraphrased query and print
# the best matches together with their similarity scores.
query_text = "A rapid brown fox jumped over a sleepy dog."
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast brown fox leaps over a sleeping dog.",
    "The cat sat on the mat.",
    "Dogs and cats are different animals.",
    "A speedy brown fox jumped over the tired dog.",
]

# Uses the function's default k, so at most three results come back.
similar_texts = find_similar_texts(query_text=query_text, texts=texts)

print("Similar texts:")
for text, score in similar_texts:
    print(f"- {text} (Similarity: {score})")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment