Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Created May 16, 2014 17:07
Show Gist options
  • Select an option

  • Save cigrainger/8eb4f4cb4fb7288a2ff4 to your computer and use it in GitHub Desktop.

Select an option

Save cigrainger/8eb4f4cb4fb7288a2ff4 to your computer and use it in GitHub Desktop.
# Imports and housekeeping
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
from gensim import corpora, models, similarities
import numpy as np
import matplotlib.pyplot as plt
# Define KL functions
def kl(p, q):
    """Return the Kullback-Leibler divergence ``sum(p * log(p / q))``.

    Both arguments are converted to float arrays and broadcast against each
    other. Entries where ``p`` is zero contribute nothing to the sum (the
    0 * log(0) = 0 convention). The inputs are used as given; no
    normalisation is applied here.
    """
    # ``np.float`` was only an alias for the builtin and has been removed
    # from NumPy; the builtin ``float`` yields the same float64 dtype.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    p, q = np.broadcast_arrays(p, q)
    # Select the contributing entries up front. ``np.where`` would still
    # evaluate log(p / q) at every position and warn wherever p == 0.
    support = p != 0
    return np.sum(p[support] * np.log(p[support] / q[support]))
def sym_kl(p, q):
    """Return the symmetrised divergence ``kl(p, q) + kl(q, p)``."""
    forward = kl(p, q)
    backward = kl(q, p)
    return forward + backward
# Generate corpus
# Stop words: whitespace-separated tokens read from stoplist.txt.
with open('stoplist.txt', 'r') as stop_file:
    stoplist = set(stop_file.read().split())

# Build the vocabulary from one lower-cased, whitespace-tokenised document
# per line. The raw string spells the same path without the invalid '\d'
# escape sequence, and the ``with`` block closes the file once the
# dictionary has consumed it.
with open(r'.\data\abstracts.txt', 'rb') as abstracts:
    dictionary = corpora.Dictionary(line.lower().split()
                                    for line in abstracts)

# Ids of stop words that actually occur in the vocabulary.
stop_ids = [dictionary.token2id[stopword] for
            stopword in stoplist if stopword in dictionary.token2id]
# Ids of tokens that appear in exactly one document. ``items()`` works on
# both Python 2 and 3, unlike ``iteritems()``.
once_ids = [tokenid for tokenid, docfreq in
            dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# NOTE(review): gensim documents ``no_above`` as a fraction of the corpus
# size, so a value of 5 looks like it disables the upper document-frequency
# cut-off entirely -- confirm whether something like 0.5 was intended.
dictionary.filter_extremes(no_above=5, keep_n=100000)
dictionary.compactify()
class MyCorpus(object):
    """Streamed corpus that yields one bag-of-words vector per line.

    Each iteration re-reads the abstracts file, so the corpus can be
    traversed repeatedly without holding all documents in memory. The
    module-level ``dictionary`` is used to convert tokens to ids.
    """

    def __iter__(self):
        # The raw string is the same path without the invalid '\d' escape.
        # The ``with`` block closes the file when iteration finishes or
        # the generator is discarded.
        with open(r'.\data\abstracts.txt', 'rb') as abstracts:
            for line in abstracts:
                yield dictionary.doc2bow(line.lower().split())
# Run models to find natural number of topics
# The sweep needs a corpus instance; nothing else in this script creates
# one, so instantiate the streaming corpus here.
my_corpus = MyCorpus()
kl_num = []
# Start at 10 rather than 0: a model with zero topics is not meaningful.
for i in range(10, 250000, 10):
    lda = models.ldamodel.LdaModel(corpus=my_corpus,
                                   id2word=dictionary, num_topics=i)
    # TODO: the divergence computation was left as a placeholder in the
    # original script. ``div`` has to be derived from ``lda`` here
    # (presumably via sym_kl) -- until then the append below raises
    # NameError.
    kl_num.append([div, i])
# Plot kl divergence against number of topics -- line and bins
# kl_num holds [divergence, num_topics] pairs. Split it into two columns;
# indexing kl_num[0] / kl_num[1] would pick the first two pairs instead.
divergences = [pair[0] for pair in kl_num]
topic_counts = [pair[1] for pair in kl_num]
plt.subplot(211)
# x is the number of topics and y the divergence, matching the labels.
plt.plot(topic_counts, divergences)
plt.ylabel('Symmetric KL Divergence')
plt.xlabel('Number of Topics')
plt.subplot(212)
# TODO: histogram ("bins") panel not implemented yet.
# plt.hist()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment