Created
May 17, 2019 01:09
-
-
Save Slater-Victoroff/4210107ce444bf6c1f550f4be7d13857 to your computer and use it in GitHub Desktop.
Demo script for needle-in-a-haystack problems
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Demo script for Needle-in-a-haystack problems | |
""" | |
from functools import partial | |
import numpy as np | |
from indicoio.custom import vectorize | |
from scipy.spatial.distance import cdist | |
from scipy.stats import gmean | |
class SimilarityCalculator():
    """
    Rank candidate documents by similarity to a fixed set of query documents.

    The query documents are vectorized once, at construction time. Candidate
    documents are then scored by the geometric mean of their distances to
    every query vector, so the results can seed a downstream teach task.
    """

    # Added to every pairwise distance before taking the geometric mean.
    # Without it, a single exact match (distance 0) forces the whole
    # geometric mean to 0 no matter how far the other queries are.
    DISTANCE_OFFSET = 0.01

    def __init__(self, query_docs, vectorizer=None):
        """
        Vectorize ``query_docs`` and keep a distance function bound to them.

        Args:
            query_docs: The documents every candidate is compared against.
            vectorizer: Optional callable mapping a sequence of documents to
                a 2-D array-like of vectors (one row per document). Defaults
                to the module-level ``vectorize``.
        """
        self._vectorize = vectorize if vectorizer is None else vectorizer
        # cdist(query_vectors, X) -> matrix with one row per query document
        # and one column per row of X (cdist's default metric is used).
        self.distance_function = partial(cdist, self._vectorize(query_docs))

    def most_similar(self, documents, top_n=100):
        """
        Return the top_n most similar documents to query_docs.

        Documents are returned best match first. An empty ``documents``
        yields an empty list without calling the vectorizer.
        """
        if len(documents) == 0:
            return []
        scores = self.get_scores(documents)
        # argsort is ascending, and smaller scores are better.
        return [documents[i] for i in np.argsort(scores)[:top_n]]

    def get_scores(self, documents):
        """
        Get raw scores for distance between documents and query_docs.

        Smaller scores are better. Returns one score per document: the
        geometric mean, across all query documents, of the offset distances.
        """
        distances = self.distance_function(self._vectorize(documents))
        # np.add returns a new array; the result must be kept, otherwise the
        # offset is silently dropped and zero distances reach gmean.
        distances = np.add(distances, self.DISTANCE_OFFSET)
        # Axis 0 runs over the query documents, leaving one value per document.
        return gmean(distances, axis=0)
if __name__ == "__main__":
    # Small demo: rank a handful of animal words against three query words.
    QUERY_TERMS = ["furry", "rabbit", "dog"]
    CANDIDATES = ["kitten", "fish", "puppy", "eagle", "bear", "koala"]

    RANKER = SimilarityCalculator(QUERY_TERMS)
    RANKED = RANKER.most_similar(CANDIDATES)
    print(RANKED)
    # Output recorded by the original author:
    # ['puppy', 'kitten', 'bear', 'koala', 'eagle', 'fish']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment