20 questions with word embeddings
import json
from collections import OrderedDict

import numpy as np
from scipy.spatial.distance import cdist
from indicoio.custom import vectorize

from nouns import NOUNS

# features.json is generated by featurize_nouns() below; fall back to an empty mapping until it exists
try:
    FEATURES = json.load(open("features.json"), object_pairs_hook=OrderedDict)
except FileNotFoundError:
    FEATURES = OrderedDict()

def featurize_nouns(outfile="features.json"):
    """
    Turn the noun list into text features.

    Creates a json file that maps each word to the features for that word.
    """
    # To speed up, increase batch_size; to avoid Gateway Timeout errors, reduce batch_size
    features = vectorize(NOUNS, batch_size=200)
    result = {word: feature for word, feature in zip(NOUNS, features)}
    json.dump(result, open(outfile, 'w'))
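
# Illustrative sketch, not part of the original gist: the word -> vector mapping that
# featurize_nouns() writes out, round-tripped through json with made-up words and a made-up
# embedding width (the real width is whatever the indico vectorize endpoint returns).
# Downstream code only assumes every value is a fixed-length numeric vector, so
# list(FEATURES.values()) can be handed straight to scipy's cdist.
def _demo_feature_file_roundtrip(path="demo_features.json"):
    """Write and reload a tiny word -> vector mapping the way featurize_nouns/FEATURES do."""
    toy = {"aardvark": [0.12, -0.03, 0.44], "abacus": [0.07, 0.21, -0.10]}
    json.dump(toy, open(path, 'w'))
    return json.load(open(path), object_pairs_hook=OrderedDict)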

def _get_top_n_words_from_indices(scores, top_n, reverse=True):
    """Turn a list of scores into a list of words."""
    if reverse:
        return [list(FEATURES.keys())[index] for index in np.argsort(scores)[0][::-1][:top_n]]
    else:
        return [list(FEATURES.keys())[index] for index in np.argsort(scores)[0][:top_n]]
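
# Illustrative sketch, not part of the original gist: how the argsort-based lookup above behaves
# on toy data. The words and scores here are invented purely for demonstration.
def _demo_top_n_lookup():
    """Show that higher scores map to earlier words when reverse=True."""
    toy_features = OrderedDict([("cat", None), ("dog", None), ("fish", None)])
    toy_scores = np.array([[0.2, 0.9, 0.5]])  # One row, one score per word, same order as toy_features
    order = np.argsort(toy_scores)[0][::-1]   # Highest score first: dog, fish, cat
    return [list(toy_features.keys())[index] for index in order[:2]]  # -> ["dog", "fish"]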

def ask_question(options):
    """
    Pose a question to the user.

    options should be the list of answer choices.
    """
    option_text = "".join(["\t%s: %s\n" % _ for _ in enumerate(options)])  # Format options for prompting
    response = input("Which is closest to your word?\n" + option_text)  # Prompt for input
    answer_features = vectorize(options[int(response)])  # Vectorize the selected response
    # Compute distances between the selected answer and all featurized nouns
    distances = cdist([answer_features], list(FEATURES.values()), metric="cosine")
    # Squeeze the 0 - 1 distances into a 0.5 - 1 space
    distances /= 2
    distances += 0.5
    return distances
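
# Illustrative sketch, not part of the original gist: a worked example of the squash above with
# invented distances. Dividing the running scores by squashed distances (see twenty_questions)
# boosts words close to the chosen answer the most, since the divisor bottoms out at 0.5.
def _demo_distance_squash():
    """Show how raw cosine distances are squashed before they divide the running scores."""
    raw = np.array([[0.0, 0.4, 1.0]])      # Hypothetical cosine distances to three candidate nouns
    squashed = raw / 2 + 0.5               # -> [[0.5, 0.7, 1.0]]
    scores = np.ones((1, 3)) / squashed    # -> [[2.0, ~1.43, 1.0]]; the closest word ranks first
    return squashed, scores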

def closest_difference(*args, top_n):  # Keyword-only arg after *args means this is Python 3 only
    """
    Find question options likely to help us choose between the most likely answers.

    Input is an arbitrary-length list of terms to consider. Complexity is ~O(n^2), and generally
    this approach will get fuzzier and less effective with too many considered terms.
    """
    deltas = []
    features = vectorize(list(args))  # Vectorize all args
    for i, outer_arg in enumerate(args):
        for j, inner_arg in enumerate(args):
            if j > i:  # Make sure that we're only computing novel combinations of terms
                # For each combination of terms, append the difference of those terms to deltas
                deltas.append(np.subtract(features[i], features[j]))
    # Goal is to pick terms that are close to the entries in the deltas array.
    # The idea is that embeddings close to A - B will be most helpful in
    # learning to differentiate between A and B.
    distances = cdist(deltas, list(FEATURES.values()), metric="cosine")  # Compute distances
    # Reduce to optimize for overall distance to all entries in the deltas array.
    # Multiplication is used to bias selection toward candidates where any single distance is close to zero.
    distances = np.multiply.reduce(distances, axis=0).reshape(-1, 1)
    # Once we've found the best candidates, map them back to words so we can actually ask the question
    return [list(FEATURES.keys())[i[0]] for i in np.argsort(distances, axis=0)[:top_n]]
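
# Illustrative sketch, not part of the original gist: the A - B delta trick above on tiny, made-up
# 2-D "embeddings". Candidates near the difference vector between two contenders get the smallest
# product of distances, so they come out first in the ranking as the most discriminative options.
def _demo_delta_trick():
    """Rank toy candidate vectors by how close they sit to the A - B difference vector."""
    a = np.array([1.0, 0.0])                       # Contender A
    b = np.array([0.0, 1.0])                       # Contender B
    deltas = [a - b]                               # Single pairwise difference, as in closest_difference
    candidates = np.array([[1.0, -1.0],            # Aligned with A - B: best discriminator
                           [1.0, 1.0],             # Orthogonal to A - B: useless for telling A from B
                           [-1.0, 1.0]])           # Anti-aligned with A - B: largest distance, ranked last
    distances = cdist(deltas, candidates, metric="cosine")
    combined = np.multiply.reduce(distances, axis=0).reshape(-1, 1)
    return np.argsort(combined, axis=0)[:2]        # Indices of the two best candidates: [[0], [1]]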

def twenty_questions(base_questions, print_n=5, questions=5, answer_ply=10):
    """
    Start prompting the user for input and try to determine the word they're thinking of.

    base_questions is a series of lists laying out some initialization questions to help the model along.
    Good base_questions are high-level features like object color, size, texture, etc.

    print_n is how many of the top-confidence results should be printed out at the end of the routine.
    If things work perfectly, a print_n of 1 should be perfectly acceptable.

    questions is the number of questions to ask the user. It makes sense to increase this as the number
    of options increases. 5 was plenty for the standard list of 1000 terms.

    answer_ply is the number of answers to consider when creating the next question. Generally, increasing
    this number will lead to better performance. In earlier iterations it may also make sense to increase
    this number, as confidence that one of the top n guesses is actually correct will increase with time.
    """
    scores = np.ones((1, len(FEATURES)))  # One running score per noun, kept as a single row
    for question in base_questions:
        # For each question, divide the running scores by the distances returned by ask_question.
        # No need to normalize these values as we only ever use them for ranking,
        # which is why a straight divide is acceptable.
        scores = np.divide(scores, ask_question(question))
    current_top = _get_top_n_words_from_indices(scores, answer_ply)
    for _ in range(questions):  # Run the same process iteratively as confidences are updated
        distances = ask_question(closest_difference(*current_top, top_n=5))
        scores = np.divide(scores, distances)
        # Build the next candidate set from the words closest to the most recent answer (smallest distances first)
        current_top = _get_top_n_words_from_indices(distances, answer_ply, False)
    # Print the current most confident results once all questions have been asked.
    print(_get_top_n_words_from_indices(scores, print_n))
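
# Illustrative sketch, not part of the original gist: the score-update loop inside twenty_questions,
# run on made-up (already squashed) distances instead of real answers from ask_question, to show how
# repeated divisions sharpen the ranking over a toy three-word vocabulary without any user input.
def _demo_score_updates():
    """Simulate two rounds of answers and return the word indices ranked best-first."""
    fake_answer_distances = [
        np.array([[0.6, 0.9, 1.0]]),    # After the first answer, word 0 looks closest
        np.array([[0.55, 0.7, 0.95]]),  # The second answer agrees, so word 0 pulls further ahead
    ]
    scores = np.ones((1, 3))
    for distances in fake_answer_distances:
        scores = np.divide(scores, distances)  # Same straight divide used in twenty_questions
    return np.argsort(scores)[0][::-1]         # -> [0, 1, 2]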

if __name__ == "__main__":
    # Run featurize_nouns() once to build features.json, then switch to the twenty_questions call to play
    # featurize_nouns()
    # twenty_questions([
    #     ["animal", "mineral", "vegetable"],
    #     ["red", "blue", "green", "black", "white"],
    #     ["big", "medium", "small"],
    #     ["smooth", "rough"],
    # ])