Skip to content

Instantly share code, notes, and snippets.

@Slater-Victoroff
Created May 17, 2019 01:09
Show Gist options
  • Save Slater-Victoroff/4210107ce444bf6c1f550f4be7d13857 to your computer and use it in GitHub Desktop.
Save Slater-Victoroff/4210107ce444bf6c1f550f4be7d13857 to your computer and use it in GitHub Desktop.
Demo script for needle-in-a-haystack problems
"""
Demo script for Needle-in-a-haystack problems
"""
from functools import partial
import numpy as np
from indicoio.custom import vectorize
from scipy.spatial.distance import cdist
from scipy.stats import gmean
class SimilarityCalculator():
    """
    Class that takes in a set of query_docs and then allows the user to find
    similar documents to seed a downstream teach task.

    The query documents are vectorized once at construction time; candidate
    documents are vectorized on every call and compared against them with
    scipy's ``cdist`` (default metric).
    """

    # Offset added to every pairwise distance before the geometric mean is
    # taken, so a single exact match (distance 0) cannot zero out the score.
    DISTANCE_OFFSET = 0.01

    def __init__(self, query_docs, vectorizer=None):
        """
        Vectorize query_docs and prepare the distance function.

        query_docs: the documents every candidate will be compared against.
        vectorizer: optional callable turning a list of documents into
            something ``cdist`` accepts (one row vector per document).
            Defaults to ``indicoio.custom.vectorize``.
        """
        self._vectorizer = vectorize if vectorizer is None else vectorizer
        # Bind the query vectors as cdist's first argument so later calls
        # only need to supply the candidate vectors.
        self.distance_function = partial(cdist, self._vectorizer(query_docs))

    def most_similar(self, documents, top_n=100):
        """
        Return the top_n most similar documents to query_docs.

        Documents are returned best match first (ascending score). Fewer
        than top_n documents are returned when fewer are supplied.
        """
        scores = self.get_scores(documents)
        return [documents[i] for i in np.argsort(scores)[:top_n]]

    def get_scores(self, documents):
        """
        Get raw scores for distance between documents and query_docs.
        Smaller scores are better.

        Returns one score per document: the geometric mean, over all query
        documents, of the offset distance to that document. An empty
        ``documents`` yields an empty array without calling the vectorizer.
        """
        if len(documents) == 0:
            return np.empty(0)
        # Rows correspond to query docs, columns to candidate documents.
        distances = self.distance_function(self._vectorizer(documents))
        # np.add returns a new array; the result must be used, otherwise the
        # offset is silently dropped.
        shifted = distances + self.DISTANCE_OFFSET
        # Collapse the query axis so each candidate gets a single score.
        return gmean(shifted, axis=0)
if __name__ == "__main__":
    # Small demo: rank a handful of animal words against three query terms.
    TEST_DOCS = ["kitten", "fish", "puppy", "eagle", "bear", "koala"]
    TESTER = SimilarityCalculator(["furry", "rabbit", "dog"])
    RANKED = TESTER.most_similar(TEST_DOCS)
    print(RANKED)
    # Output recorded by the original author:
    # ['puppy', 'kitten', 'bear', 'koala', 'eagle', 'fish']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment