Created March 14, 2021 at 09:25. Save Witiko/6d2e94fae4ae194e6374c07f321d7a05 to your computer and use it in GitHub Desktop.
Interprets the soft cosine measure in Gensim 4 as a sum of word pair similarities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def interpret_soft_cosine_measure(doc1, doc2, dictionary, similarity_matrix):
    """Decompose the soft cosine measure between two documents into word-pair terms.

    Prints the total similarity followed by the per-word-pair contributions,
    sorted from largest to smallest, e.g. ``0.50 = 0.30 (press) + 0.20 (obama:president)``.

    Parameters
    ----------
    doc1, doc2 : list of (int, float)
        Documents in bag-of-words format: (term id, term weight) pairs.
    dictionary : gensim.corpora.Dictionary
        Used only to map term ids back to tokens via ``id2token``.
    similarity_matrix : gensim.similarities.SparseTermSimilarityMatrix
        Provides ``.matrix[i, j]`` term-similarity lookups and
        ``.inner_product(x, y)`` for the normalization terms.
    """
    word_pair_importances = dict()
    for word1_id, word1_weight in doc1:
        for word2_id, word2_weight in doc2:
            word_similarity = similarity_matrix.matrix[word1_id, word2_id]
            word_pair_importance = word1_weight * word_similarity * word2_weight
            if word_pair_importance == 0:
                continue  # zero contributions would only clutter the output
            word1 = dictionary.id2token[word1_id]
            word2 = dictionary.id2token[word2_id]
            word_pair_importances[word1, word2] = (
                word_pair_importances.get((word1, word2), 0.0) + word_pair_importance
            )
    # The soft cosine measure divides the inner product <d1, d2> by the
    # *square roots* of the self inner products, sqrt(<d1,d1>) * sqrt(<d2,d2>).
    # The original code multiplied the raw self inner products, which
    # over-normalizes for any document whose soft norm differs from 1.
    # The `or 1.0` guard avoids division by zero for empty documents.
    norm = (similarity_matrix.inner_product(doc1, doc1) or 1.0) ** 0.5
    norm *= (similarity_matrix.inner_product(doc2, doc2) or 1.0) ** 0.5
    normalized_word_pair_importances = {
        word_pair: word_pair_importance / norm
        for word_pair, word_pair_importance
        in word_pair_importances.items()
    }
    similarity = sum(normalized_word_pair_importances.values())
    # Largest contributions first, formatted as "weight (word)" for diagonal
    # pairs and "weight (word1:word2)" for off-diagonal pairs.
    sorted_pairs = sorted(normalized_word_pair_importances.items(), key=lambda x: x[1], reverse=True)
    formatted_pairs = ' + '.join(
        '{:.02f} ({}:{})'.format(word_pair_importance, word1, word2) if word1 != word2 else '{:.02f} ({})'.format(word_pair_importance, word1)
        for (word1, word2), word_pair_importance
        in sorted_pairs
    )
    print('{:.02f} = {}'.format(similarity, formatted_pairs))
import gensim.downloader as api | |
from gensim.corpora import Dictionary | |
from gensim.models import TfidfModel | |
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex | |
import nltk | |
from nltk.corpus import stopwords | |
# Demo: explain the soft cosine similarities between three short sentences.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def _preprocess(text):
    # Lowercase, split on whitespace, and drop English stop words.
    return [token for token in text.lower().split() if token not in stop_words]

sentence_obama = _preprocess('Obama speaks to the media in Illinois')
sentence_president = _preprocess('The president greets the press in Chicago')
sentence_orange = _preprocess('Having a tough time finding an orange juice press machine?')

corpus = [sentence_obama, sentence_president, sentence_orange]
dictionary = Dictionary(corpus)
tfidf = TfidfModel(dictionary=dictionary)

# Convert each token list into a tf-idf-weighted bag-of-words vector.
sentence_obama, sentence_president, sentence_orange = (
    tfidf[dictionary.doc2bow(sentence)]
    for sentence in (sentence_obama, sentence_president, sentence_orange)
)

# Build the sparse term-similarity matrix from pretrained GloVe embeddings.
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

interpret_soft_cosine_measure(sentence_obama, sentence_president, dictionary, similarity_matrix)
interpret_soft_cosine_measure(sentence_obama, sentence_orange, dictionary, similarity_matrix)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.