Created March 14, 2021 at 09:25. Save Witiko/6d2e94fae4ae194e6374c07f321d7a05 to your computer and use it in GitHub Desktop.
Interprets the soft cosine measure in Gensim 4 as a sum of word pair similarities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def interpret_soft_cosine_measure(doc1, doc2, dictionary, similarity_matrix):
    """Decompose the soft cosine measure between two documents into word-pair terms.

    Prints the total similarity followed by the per-word-pair contributions,
    sorted from largest to smallest, e.g. ``0.50 = 0.30 (press) + 0.20 (obama:president)``.

    Parameters
    ----------
    doc1, doc2 : list of (int, float)
        Documents in bag-of-words format: (term id, term weight) pairs.
    dictionary : gensim.corpora.Dictionary
        Used only to map term ids back to tokens via ``id2token``.
    similarity_matrix : gensim.similarities.SparseTermSimilarityMatrix
        Provides ``.matrix[i, j]`` term-similarity lookups and
        ``.inner_product(x, y)`` for the normalization terms.
    """
    word_pair_importances = dict()
    for word1_id, word1_weight in doc1:
        for word2_id, word2_weight in doc2:
            word_similarity = similarity_matrix.matrix[word1_id, word2_id]
            word_pair_importance = word1_weight * word_similarity * word2_weight
            if word_pair_importance == 0:
                continue  # zero contributions would only clutter the output
            word1 = dictionary.id2token[word1_id]
            word2 = dictionary.id2token[word2_id]
            word_pair_importances[word1, word2] = (
                word_pair_importances.get((word1, word2), 0.0) + word_pair_importance
            )
    # The soft cosine measure divides the inner product <d1, d2> by the
    # *square roots* of the self inner products, sqrt(<d1,d1>) * sqrt(<d2,d2>).
    # The original code multiplied the raw self inner products, which
    # over-normalizes for any document whose soft norm differs from 1.
    # The `or 1.0` guard avoids division by zero for empty documents.
    norm = (similarity_matrix.inner_product(doc1, doc1) or 1.0) ** 0.5
    norm *= (similarity_matrix.inner_product(doc2, doc2) or 1.0) ** 0.5
    normalized_word_pair_importances = {
        word_pair: word_pair_importance / norm
        for word_pair, word_pair_importance
        in word_pair_importances.items()
    }
    similarity = sum(normalized_word_pair_importances.values())
    # Largest contributions first, formatted as "weight (word)" for diagonal
    # pairs and "weight (word1:word2)" for off-diagonal pairs.
    sorted_pairs = sorted(normalized_word_pair_importances.items(), key=lambda x: x[1], reverse=True)
    formatted_pairs = ' + '.join(
        '{:.02f} ({}:{})'.format(word_pair_importance, word1, word2) if word1 != word2 else '{:.02f} ({})'.format(word_pair_importance, word1)
        for (word1, word2), word_pair_importance
        in sorted_pairs
    )
    print('{:.02f} = {}'.format(similarity, formatted_pairs))
import gensim.downloader as api | |
from gensim.corpora import Dictionary | |
from gensim.models import TfidfModel | |
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex | |
import nltk | |
from nltk.corpus import stopwords | |
# Demo: explain the soft cosine similarities between three short sentences.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def _preprocess(text):
    # Lowercase, split on whitespace, and drop English stop words.
    return [token for token in text.lower().split() if token not in stop_words]

sentence_obama = _preprocess('Obama speaks to the media in Illinois')
sentence_president = _preprocess('The president greets the press in Chicago')
sentence_orange = _preprocess('Having a tough time finding an orange juice press machine?')

corpus = [sentence_obama, sentence_president, sentence_orange]
dictionary = Dictionary(corpus)
tfidf = TfidfModel(dictionary=dictionary)

# Convert each token list into a tf-idf-weighted bag-of-words vector.
sentence_obama, sentence_president, sentence_orange = (
    tfidf[dictionary.doc2bow(sentence)]
    for sentence in (sentence_obama, sentence_president, sentence_orange)
)

# Build the sparse term-similarity matrix from pretrained GloVe embeddings.
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

interpret_soft_cosine_measure(sentence_obama, sentence_president, dictionary, similarity_matrix)
interpret_soft_cosine_measure(sentence_obama, sentence_orange, dictionary, similarity_matrix)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.