Created
August 31, 2022 15:19
-
-
Save zachschillaci27/32b1f821f7d595a9be7e855c6c6d68af to your computer and use it in GitHub Desktop.
Get the most similar word in a spaCy model's vocabulary, ranked by a caller-supplied word-vector similarity metric (the highest score wins)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import spacy | |
from spacy.language import Language | |
from typing import Callable | |
def get_most_similar_word(
    nlp: "Language",
    word: str,
    metric: Callable[[np.ndarray, np.ndarray], float],
) -> str:
    """Return the vocabulary word whose vector scores highest against *word*.

    Every row of the pipeline's word-vector table is scored with
    ``metric(query_vector, row_vector)`` and the row with the largest score
    wins, so ``metric`` must be a similarity (larger means closer), not a
    distance.

    No row is excluded from the search: if the query word itself has a row
    in the table, that row competes too and may be the one returned.

    Args:
        nlp: Pipeline whose ``vocab`` provides ``get_vector``, the
            ``vectors`` table (``data`` and ``key2row``) and ``strings``.
        word: Query word to look up.
        metric: Callable scoring two vectors; higher means more similar.

    Returns:
        The string form of the key mapped to the best-scoring row.

    Raises:
        ValueError: If the vector table has no rows.
        KeyError: If the best-scoring row has no key in ``key2row``.
    """
    # Get the vector representation of the query word
    query_vector = nlp.vocab.get_vector(word)

    rows = nlp.vocab.vectors.data
    if len(rows) == 0:
        raise ValueError("The vocabulary has no word vectors to compare against")

    # Get the most similar vector by row index in the word vector matrix
    scores = [metric(query_vector, row) for row in rows]
    best_row = int(np.argmax(scores))

    # Mapping of row index -> string hash. When several keys share a row,
    # the last one seen in key2row is the one kept.
    row_to_key = {row: key for key, row in nlp.vocab.vectors.key2row.items()}

    # Now get the string representation of this hash
    return nlp.vocab.strings.as_string(row_to_key[best_row])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment