Last active
January 30, 2023 15:16
-
-
Save avriiil/c5e72bd0654d10a875ebeb6715869a95 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import the coherence model from gensim
from gensim.models import CoherenceModel | |
# define function to get the top words of each topic
def get_topics_lists(model, top_clusters, n_words):
    '''
    Gets lists of words in topics as a list of lists.

    model: gsdmm instance; must expose `cluster_word_distribution`, which is
        indexed by cluster and yields a mapping of word -> score
        (presumably occurrence counts -- confirm against the gsdmm version used)
    top_clusters: numpy array containing indices of top_clusters
    n_words: top n number of words to include

    Returns one list of words per entry of top_clusters, in the same order.
    Each inner list holds at most n_words words, ordered from highest to
    lowest score; a cluster with no words yields an empty list.
    '''
    topics = []
    for cluster in top_clusters:
        word_scores = model.cluster_word_distribution[cluster]
        # sorted() is stable, so words with equal scores keep the mapping's
        # own iteration order even with reverse=True.
        ranked = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
        topics.append([word for word, _ in ranked[:n_words]])
    return topics
# Build the topic word lists to feed to the coherence model: the 20
# highest-scoring words of each cluster listed in `top_index`.
# `gsdmm` and `top_index` are expected to be defined earlier in the script.
topics = get_topics_lists(gsdmm, top_index, 20)

# Evaluate the model using the Topic Coherence score ('c_v' measure).
# `dictionary`, `bow_corpus` and `docs` are expected to be defined earlier;
# presumably the gensim Dictionary, the bag-of-words corpus and the
# tokenized documents the model was fitted on -- confirm against the caller.
cm_gsdmm = CoherenceModel(
    topics=topics,
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=docs,
    coherence='c_v',
)

# get coherence value
coherence_gsdmm = cm_gsdmm.get_coherence()
print(coherence_gsdmm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment