Last active
April 24, 2017 17:30
-
-
Save jbencina/7f156fb4fc7504ef4e4343615d0c1d8b to your computer and use it in GitHub Desktop.
Scikit Topic Cohesion
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def topic_coherence(lda_model, corpus, num_words, sort_topics=True, avg_per_word=False):
    """Score each topic by the co-occurrence of its top terms in ``corpus``.

    For every topic the ``num_words`` highest-weighted terms are taken, and
    for each ordered pair (l ranked before m) the score accumulates
    ``log((docs containing both m and l + 1) / (docs containing l))``.

    Parameters
    ----------
    lda_model
        Object exposing ``components_``: an iterable of per-topic weight
        arrays supporting ``argsort()`` (presumably a fitted scikit-learn
        LDA model -- confirm against the caller).
    corpus
        Sparse document-term matrix supporting ``tocsc()``; rows are
        documents and columns are term indices matching ``components_``.
    num_words
        Number of top terms per topic to include in the score.
    sort_topics
        If true, return results ordered from highest to lowest coherence;
        otherwise keep topic order.
    avg_per_word
        If true, divide each topic's score by its number of terms so that
        scores obtained with different ``num_words`` can be compared.

    Returns
    -------
    list of (topic_index, coherence) tuples.

    Notes
    -----
    A pair whose leading term ``l`` occurs in no document has an undefined
    ratio; it contributes 0 to the score instead of raising
    ``ZeroDivisionError``.
    """
    # Convert to CSC for efficient column slicing
    doc_term = corpus.tocsc()

    # Top num_words term indices of every topic, highest weight first
    top_terms = [
        [int(i) for i in weights.argsort()[:-num_words - 1:-1]]
        for weights in lda_model.components_
    ]

    # Row indices of the documents containing each term, extracted once per
    # term instead of once per (m, l) pair.
    docs_by_term = {}
    for terms in top_terms:
        for term in terms:
            if term not in docs_by_term:
                docs_by_term[term] = doc_term[:, term].nonzero()[0]

    coherence_scores = []
    for topic_idx, terms in enumerate(top_terms):
        coherence = 0.0
        # Pair every term m with each term l ranked ahead of it
        for m_pos in range(1, len(terms)):
            m_docs = docs_by_term[terms[m_pos]]
            for l_term in terms[:m_pos]:
                l_docs = docs_by_term[l_term]
                l_count = len(l_docs)
                if l_count == 0:
                    # Term l never occurs in this corpus: ratio undefined
                    continue
                # +1 smoothing keeps the log finite when m and l never co-occur
                co_count = len(np.intersect1d(m_docs, l_docs)) + 1
                # float() forces true division on Python 2 as well
                coherence += np.log(float(co_count) / l_count)

        # Average per word to compare across different num_words; a topic
        # with no terms keeps its score of 0.0
        if avg_per_word and terms:
            coherence = coherence / len(terms)

        coherence_scores.append((topic_idx, coherence))

    if sort_topics:
        return sorted(coherence_scores, key=lambda t: t[1], reverse=True)
    return coherence_scores
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment