Skip to content

Instantly share code, notes, and snippets.

@cigrainger
Last active August 29, 2015 14:01
Show Gist options
  • Select an option

  • Save cigrainger/dac4cb8ff3821951f5e6 to your computer and use it in GitHub Desktop.

Select an option

Save cigrainger/dac4cb8ff3821951f5e6 to your computer and use it in GitHub Desktop.
# Imports and housekeeping
import logging
import os
import subprocess
import sys
from subprocess import call

# Configure logging before the third-party imports, as the original did.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

import matplotlib.pyplot as plt
import numpy as np
from gensim import corpora, models, similarities
# Initialise distributed workers
# Each helper is a long-running server, so it is started as its own
# background process. Passing all five shell lines to a single call() made
# the first string the program name (which does not exist), and `export` /
# `&` are shell syntax that never reached a shell.
pyro_env = dict(os.environ,
                PYRO_SERIALIZERS_ACCEPTED='pickle',
                PYRO_SERIALIZER='pickle')
# NOTE(review): the Pyro variables are passed to the child processes only;
# confirm whether this process needs them set as well.
# NOTE(review): the workers presumably need the name server to be up before
# they register -- confirm whether a short wait is required between launches.
pyro_processes = [
    subprocess.Popen([sys.executable, '-m'] + module_args, env=pyro_env)
    for module_args in (
        ['Pyro4.naming', '-n', '0.0.0.0'],
        ['gensim.models.lda_worker'],
        ['gensim.models.lda_dispatcher'],
    )
]
# Define KL functions
def kl(p, q):
    """Return the Kullback-Leibler divergence ``sum(p * log(p / q))``.

    Both arguments are converted to float arrays and broadcast against each
    other. Entries where ``p`` is zero contribute nothing to the sum.

    Args:
        p: array-like of values for the first distribution.
        q: array-like of values for the second distribution.

    Returns:
        The summed divergence as a NumPy float scalar.
    """
    # The builtin ``float`` replaces the ``np.float`` alias, which was
    # removed from NumPy and raised AttributeError.
    p, q = np.broadcast_arrays(np.asarray(p, dtype=float),
                               np.asarray(q, dtype=float))
    # Only evaluate the log where p is non-zero, so log(0) is never computed
    # for terms that are discarded anyway.
    support = p != 0
    return np.sum(p[support] * np.log(p[support] / q[support]))
def sym_kl(p, q):
    """Return the symmetric KL divergence ``kl(p, q) + kl(q, p)``."""
    forward = kl(p, q)
    backward = kl(q, p)
    return np.sum((forward, backward))
# Generate corpus
# Files are opened with context managers so the handles are closed as soon
# as their contents have been consumed.
with open('stoplist.txt', 'r') as stoplist_file:
    stoplist = set(stoplist_file.read().split())
# Raw string: the same Windows-style path as before, without the invalid
# '\d' escape sequence.
with open(r'.\data\abstracts.txt', 'rb') as abstracts:
    # One document per line, tokenised by lower-casing and splitting on
    # whitespace.
    dictionary = corpora.Dictionary(line.lower().split()
                                    for line in abstracts)
# Ids of stop words that actually occur in the dictionary.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
# Ids of tokens that appear in exactly one document.
# .items() runs on both Python 2 and 3; .iteritems() is Python 2 only.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# NOTE(review): gensim documents no_above as a fraction of the corpus, so a
# value of 5 looks like it disables the upper bound -- confirm the intent
# (0.5, or no_below=5?). Left unchanged here.
dictionary.filter_extremes(no_above=5, keep_n=100000)
dictionary.compactify()
class MyCorpus(object):
    """Stream the abstracts file as bag-of-words vectors, one per line.

    The file is reopened on every iteration and read line by line, so the
    corpus is never loaded into memory as a whole.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow`` for the tokens of each line.

        Each line is lower-cased and split on whitespace before being
        converted with the module-level ``dictionary``.
        """
        # Raw string keeps the original Windows-style path while avoiding
        # the invalid '\d' escape; the context manager closes the handle
        # once iteration finishes.
        with open(r'.\data\abstracts.txt', 'rb') as abstracts:
            for line in abstracts:
                yield dictionary.doc2bow(line.lower().split())
# Run models to find natural number of topics
def _model_divergence(lda_model, corpus):
    """Placeholder for the per-model divergence that was never written.

    The original script only contained a ``Divergence`` marker here and then
    appended an undefined ``div``. The computation must be supplied before
    the sweep can run.
    """
    # TODO: implement the divergence measure (presumably built on sym_kl).
    raise NotImplementedError(
        'divergence computation for the topic sweep is not implemented')


# The streaming corpus has to be instantiated before it can be passed on.
my_corpus = MyCorpus()
# Named kl_divs rather than kl so the kl() function is not overwritten and
# sym_kl() keeps working.
kl_divs = []
# Start at 10 instead of 0: a model with zero topics is not meaningful.
num = range(10, 25000, 10)
for i in num:
    lda = models.ldamodel.LdaModel(corpus=my_corpus, id2word=dictionary,
                                   num_topics=i, distributed=True)
    kl_divs.append(_model_divergence(lda, my_corpus))
# Plot kl divergence against number of topics -- line and bins
plt.subplot(211)
# Topic counts go on the x axis and divergences on the y axis, matching the
# axis labels below.
plt.plot(num, kl_divs)
plt.ylabel('Symmetric KL Divergence')
plt.xlabel('Number of Topics')
plt.subplot(212)
# TODO: the histogram panel (plt.hist) was left unimplemented.
plt.savefig('kldiv.png', bbox_inches='tight')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment