Calculate the Kullback-Leibler Divergence of a Given Corpus
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Calculates symmetric Kullback-Leibler divergence.
@author Shingo OKAWA
"""
import numpy
import sys
import scipy.stats as stats
import matplotlib.pyplot as plotter
from gensim import corpora, models, matutils
# Builds a dictionary from the specified corpus.
dictionary = corpora.Dictionary(
    line.lower().split() for line in open('corpus_train.txt', 'r')
)
# Collects the ids of tokens that appear only once.
unique_ids = [
    token_id for token_id, frequency in dictionary.dfs.items() if frequency == 1
]
# Filters out tokens that appear only once.
dictionary.filter_tokens(unique_ids)
# Filters out tokens that appear in more than no_above (a fraction of the
# corpus, so 0.5 means half of the documents) and keeps only the first
# keep_n tokens.
dictionary.filter_extremes(no_above=0.5, keep_n=100000)
# Removes gaps in token ids left behind by the filtering.
dictionary.compactify()
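
# A quick illustration of what the dictionary now provides: doc2bow(), used
# by the Corpus class below, turns a tokenized document into a sparse list
# of (token_id, count) pairs. The ids shown are made up for illustration;
# the real values depend on the corpus:
#
#     dictionary.doc2bow('the cat sat on the mat'.lower().split())
#     # e.g. -> [(0, 2), (1, 1), (2, 1), (3, 1), (4, 1)]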
class Corpus(object):
    """ Represents the training corpus.
    """
    def __iter__(self):
        """ Iterates over the corpus, yielding each document as a bag-of-words.
        """
        for line in open('corpus_train.txt', 'r'):
            yield dictionary.doc2bow(line.lower().split())

# Instantiates the corpus.
my_corpus = Corpus()
# Generates the vector of document lengths, i.e. token counts per document.
corpus_length_vector = numpy.array(
    [sum(frequency for _, frequency in document) for document in my_corpus]
)
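
# Background for the metric below: Arun et al. ("On Finding the Natural
# Number of Topics with Latent Dirichlet Allocation", PAKDD 2010) compare
# two views of one LDA model. The singular values of the topic-word matrix
# capture how topics are spread over the vocabulary, while the document
# lengths (corpus_length_vector) dotted with the document-topic matrix
# capture how topics are spread over the corpus. When the number of topics
# fits the corpus, the two distributions agree and their symmetric KL
# divergence is small.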
def symmetric_kl_divergence(p, q):
    """ Calculates symmetric Kullback-Leibler divergence.
    """
    return numpy.sum([stats.entropy(p, q), stats.entropy(q, p)])
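
# Sanity check for the helper above (values rounded; stats.entropy
# normalizes its inputs and uses the natural logarithm):
#
#     symmetric_kl_divergence([0.5, 0.5], [0.5, 0.5])  # -> 0.0
#     symmetric_kl_divergence([0.5, 0.5], [0.9, 0.1])  # -> ~0.88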
def arun_metric(corpus, dictionary, min_topics=1, max_topics=1, iteration=1):
    """ Calculates the Arun et al. metric.
    """
    result = []
    for i in range(min_topics, max_topics, iteration):
        # Instantiates an LDA model with i topics.
        lda = models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=i
        )
        # Calculates the raw topic-word LDA matrix.
        matrix = lda.expElogbeta
        # Calculates the SVD of the LDA matrix; the singular values describe
        # how the topics are distributed over words.
        U, document_word_vector, V = numpy.linalg.svd(matrix)
        # Gets the LDA topic proportions for every document.
        lda_topics = lda[corpus]
        # Calculates the document-topic matrix.
        term_document_matrix = matutils.corpus2dense(
            lda_topics, lda.num_topics
        ).transpose()
        # Weights the topic proportions by document length, then smooths and
        # normalizes the result before taking the KL divergence.
        document_topic_vector = corpus_length_vector.dot(term_document_matrix)
        document_topic_vector = document_topic_vector + 0.0001
        document_topic_norm = numpy.linalg.norm(corpus_length_vector)
        document_topic_vector = document_topic_vector / document_topic_norm
        result.append(symmetric_kl_divergence(
            document_word_vector,
            document_topic_vector
        ))
    return result
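
# Interpreting the output: each entry of result is the divergence for one
# candidate topic count, so the natural number of topics is read off where
# the curve plotted in main() reaches its minimum.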
def main(argv=None):
    # Calculates the symmetric KL divergence for each number of topics.
    kl_divergence = arun_metric(my_corpus, dictionary, max_topics=200)
    # Plots KL divergence against the number of topics.
    plotter.plot(kl_divergence)
    plotter.ylabel('Symmetric KL Divergence')
    plotter.xlabel('Number of Topics')
    plotter.savefig('kl_topics.png', bbox_inches='tight')

if __name__ == '__main__':
    sys.exit(main())
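
# Usage sketch (assumes this script is saved as, say, kl_divergence.py, a
# plain-text corpus with one document per line exists as corpus_train.txt
# in the working directory, and numpy, scipy, gensim, and matplotlib are
# installed):
#
#     python kl_divergence.py
#
# The divergence-vs-topics plot is written to kl_topics.png.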