Skip to content

Instantly share code, notes, and snippets.

@Witiko
Created March 14, 2021 09:25
Show Gist options
  • Save Witiko/6d2e94fae4ae194e6374c07f321d7a05 to your computer and use it in GitHub Desktop.
Save Witiko/6d2e94fae4ae194e6374c07f321d7a05 to your computer and use it in GitHub Desktop.
Interprets the soft cosine measure in Gensim 4 as a sum of word pair similarities
def interpret_soft_cosine_measure(doc1, doc2, dictionary, similarity_matrix):
    """Print the soft cosine measure of two documents as a sum over word pairs.

    Every pair made of one term from ``doc1`` and one term from ``doc2``
    contributes ``weight1 * similarity * weight2``.  Each contribution is
    divided by ``sqrt(<doc1, doc1>) * sqrt(<doc2, doc2>)``, where ``<., .>``
    is ``similarity_matrix.inner_product``, so that the contributions add up
    to the soft cosine measure.  One line of the form
    ``total = c1 (word1:word2) + c2 (word) + ...`` is printed, largest
    contribution first; a pair of identical words is shown as ``(word)``.
    Pairs whose contribution is exactly zero are omitted.

    Args:
        doc1: Re-iterable sequence of ``(term_id, weight)`` pairs.
        doc2: Re-iterable sequence of ``(term_id, weight)`` pairs.
        dictionary: Maps a term id to its token via ``dictionary[term_id]``
            (the calls in this file pass a gensim ``Dictionary``).
        similarity_matrix: Object exposing ``matrix[id1, id2]`` and
            ``inner_product(doc, doc)`` (the calls in this file pass a
            gensim ``SparseTermSimilarityMatrix``).

    Returns:
        None.  The interpretation is written to standard output.

    Raises:
        ValueError: If the inner product of a document with itself is
            negative, in which case the measure is undefined.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from math import sqrt

    word_pair_importances = {}
    for word1_id, word1_weight in doc1:
        for word2_id, word2_weight in doc2:
            word_similarity = similarity_matrix.matrix[word1_id, word2_id]
            word_pair_importance = word1_weight * word_similarity * word2_weight
            if word_pair_importance == 0:
                # Unrelated words add nothing to the sum; keep the output short.
                continue
            # Index the dictionary itself instead of ``dictionary.id2token``:
            # gensim only fills ``id2token`` lazily inside ``__getitem__``, so
            # reading the attribute directly can fail on a fresh dictionary.
            word_pair = (dictionary[word1_id], dictionary[word2_id])
            # Different ids may map to one token pair, hence the accumulation.
            word_pair_importances[word_pair] = (
                word_pair_importances.get(word_pair, 0.0) + word_pair_importance
            )

    # The soft cosine measure divides by the product of the two soft norms,
    # i.e. by the square roots of the self inner products.
    norm = 1.0
    for doc in (doc1, doc2):
        # ``or 1.0`` leaves an all-zero document unscaled rather than
        # dividing by zero.
        norm *= sqrt(similarity_matrix.inner_product(doc, doc) or 1.0)

    normalized_word_pair_importances = {
        word_pair: word_pair_importance / norm
        for word_pair, word_pair_importance in word_pair_importances.items()
    }
    similarity = sum(normalized_word_pair_importances.values())

    ranked_word_pairs = sorted(
        normalized_word_pair_importances.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    summands = ' + '.join(
        '{:.02f} ({}:{})'.format(word_pair_importance, word1, word2)
        if word1 != word2
        else '{:.02f} ({})'.format(word_pair_importance, word1)
        for (word1, word2), word_pair_importance in ranked_word_pairs
    )
    print('{:.02f} = {}'.format(similarity, summands))
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
import nltk
from nltk.corpus import stopwords
# Demo: compare one sentence against two others and print, for each pair,
# how the soft cosine measure decomposes into word-pair contributions.

# Tokenise the example sentences by lower-casing and splitting on whitespace.
sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
sentence_president = 'The president greets the press in Chicago'.lower().split()
sentence_orange = 'Having a tough time finding an orange juice press machine?'.lower().split()

# Fetch the NLTK stop-word corpus and drop English stop words from each sentence.
nltk.download('stopwords', quiet=True)
stop_words = stopwords.words('english')
sentence_obama, sentence_president, sentence_orange = (
    [token for token in sentence if token not in stop_words]
    for sentence in (sentence_obama, sentence_president, sentence_orange)
)

# Build the vocabulary and the TF-IDF model from the three filtered sentences.
corpus = [sentence_obama, sentence_president, sentence_orange]
dictionary = Dictionary(corpus)
tfidf = TfidfModel(dictionary=dictionary)

# Rebind each sentence name to its TF-IDF weighted bag-of-words vector.
sentence_obama, sentence_president, sentence_orange = (
    tfidf[dictionary.doc2bow(tokens)] for tokens in corpus
)

# Term similarities come from the "glove-wiki-gigaword-50" embeddings loaded
# through the gensim downloader.
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

# Print the word-pair breakdown of the first sentence against each of the others.
interpret_soft_cosine_measure(
    sentence_obama, sentence_president, dictionary, similarity_matrix,
)
interpret_soft_cosine_measure(
    sentence_obama, sentence_orange, dictionary, similarity_matrix,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment