@balamuru
Gensim Clustering attempt
import logging
import os
import os.path

import gensim
from gensim.models import TfidfModel, LsiModel
test_data_dir = "/home/vinayb/data/reuters-21578-subset-4315"
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def iter_documents(top_directory):
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for fname in filter(lambda f: f.endswith('.txt'), files):
            document = open(os.path.join(root, fname)).read()  # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True)  # or whatever tokenization suits you
class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=1, keep_n=30000)  # check API docs for pruning params

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)
corpus = MyCorpus(test_data_dir) # builds the dictionary and a streamed bag-of-words corpus
#for vector in corpus: # convert each document to a bag-of-word vector
# print vector
print "Create models"
tfidf_model = TfidfModel(corpus)
lsi_model = LsiModel(corpus)
#topic_id = 0
#for topic in lsi_model.show_topics():
# topic_id+=1
# print "TOPIC (LSI) " + str(topic_id) + " : " + topic
#lsi_model.print_topic(20, topn=10)
corpus_tfidf = tfidf_model[corpus]
corpus_lsi = lsi_model[corpus]
lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
corpus_lsi_2 = lsi_model_2[corpus_tfidf]  # the model was trained on tf-idf vectors, so transform tf-idf, not raw bow
print "Done creating models"
#lsi_model_2.print_topics(5)
topic_id = 0
for topic in lsi_model_2.show_topics():
    print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
    # each transformed doc is a sparse list of (topic_id, weight) tuples,
    # so look the topic up via dict instead of indexing the list directly
    group_topic = [doc for doc in corpus_lsi_2 if dict(doc).get(topic_id, 0.0) > 0.5]
    print str(group_topic)
    topic_id += 1
print "Docs Processed " + str(lsi_model_2.docs_processed)
#for doc in corpus_lsi_2: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
# print "Doc " + str(doc)
#
#
#corpus.dictionary.save("dictionary.dump")
#
#tfidf_model.save("model_tfidf.dump")
#corpus_tfidf.save("corpus_tfidf.dump")
#
#lsi_model.save("model_lsi.dump")
#corpus_lsi.save("corpus_lsi.dump")
#
#
#lsi_model_2.save("model_lsi_2.dump")
#corpus_lsi_2.save("corpus_lsi_2.dump")
#for doc in corpus_tfidf:
# print doc
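
# If hard clusters are the goal, a more robust route than thresholding topic
# weights is to densify the LSI vectors and hand them to an off-the-shelf
# clusterer. A minimal sketch, assuming scikit-learn is installed and reusing
# lsi_model_2 and corpus_tfidf from above; n_clusters=10 is an arbitrary guess,
# not a value from the original gist:
#
#from gensim import matutils
#from sklearn.cluster import KMeans
#
## corpus2dense returns a (num_topics, num_docs) array, so transpose to get
## one row per document
#lsi_vectors = matutils.corpus2dense(lsi_model_2[corpus_tfidf],
#                                    num_terms=lsi_model_2.num_topics).T
#kmeans = KMeans(n_clusters=10)
#labels = kmeans.fit_predict(lsi_vectors)
#for doc_id, label in enumerate(labels):
#    print "Doc %d -> cluster %d" % (doc_id, label)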
mendynew commented Jun 8, 2013

This cannot work well, because some docs have no topic weight larger than 0.5, so those docs are never assigned to any topic.
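
A possible workaround (a sketch, not part of the original gist): assign each document to its single strongest topic by absolute weight, so every document lands in exactly one cluster regardless of the 0.5 threshold:

# assign every doc to its strongest topic, so none are dropped
for doc_id, doc in enumerate(corpus_lsi_2):
    if doc:  # skip documents with an empty LSI projection
        best_topic, weight = max(doc, key=lambda item: abs(item[1]))
        print "Doc %d -> topic %d (weight %.3f)" % (doc_id, best_topic, weight)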
