Gensim Clustering attempt
import logging
import os
import os.path

import gensim
from gensim.models import TfidfModel, LsiModel

test_data_dir = "/home/vinayb/data/reuters-21578-subset-4315"

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def iter_documents(top_directory):
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for fname in filter(lambda f: f.endswith('.txt'), files):
            document = open(os.path.join(root, fname)).read()  # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True)  # or whatever tokenization suits you
class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=1, keep_n=30000)  # check API docs for pruning params

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)
corpus = MyCorpus(test_data_dir)  # building the corpus also builds the dictionary

# for vector in corpus:  # convert each document to a bag-of-words vector
#     print(vector)

print("Create models")
tfidf_model = TfidfModel(corpus)
lsi_model = LsiModel(corpus)
# topic_id = 0
# for topic in lsi_model.show_topics():
#     topic_id += 1
#     print("TOPIC (LSI) " + str(topic_id) + " : " + str(topic))
# lsi_model.print_topic(20, topn=10)
corpus_tfidf = tfidf_model[corpus]
corpus_lsi = lsi_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
corpus_lsi_2 = lsi_model_2[corpus_tfidf]  # feed the tf-idf corpus, not raw bow, into the tf-idf-trained LSI model

print("Done creating models")
# lsi_model_2.print_topics(5)
topic_id = 0
for topic in lsi_model_2.show_topics():
    print("TOPIC (LSI2) " + str(topic_id) + " : " + str(topic))
    # each LSI doc is a sparse list of (topic_id, weight) pairs, so look the
    # current topic up by id instead of indexing into the list positionally
    group_topic = [doc for doc in corpus_lsi_2 if dict(doc).get(topic_id, 0.0) > 0.5]
    print(str(group_topic))
    topic_id += 1

print("Docs Processed " + str(lsi_model_2.docs_processed))
# for doc in corpus_lsi_2:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#     print("Doc " + str(doc))

# corpus.dictionary.save("dictionary.dump")

# tfidf_model.save("model_tfidf.dump")
# corpus_tfidf.save("corpus_tfidf.dump")

# lsi_model.save("model_lsi.dump")
# corpus_lsi.save("corpus_lsi.dump")

# lsi_model_2.save("model_lsi_2.dump")
# corpus_lsi_2.save("corpus_lsi_2.dump")

# for doc in corpus_tfidf:
#     print(doc)
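The gist builds the LSI space but stops short of an actual clustering step. A minimal sketch of one possible next step, not part of the original gist: project the documents into dense LSI vectors and hand them to an off-the-shelf clusterer. This assumes scikit-learn is available, reuses lsi_model_2 and corpus_lsi_2 from above, and the cluster count of 10 is an arbitrary illustration.

# Possible clustering step (assumption: scikit-learn is installed).
# gensim.matutils.corpus2dense returns a (num_topics x num_docs) numpy
# array; transpose it so each row is one document's LSI vector.
from gensim import matutils
from sklearn.cluster import KMeans

lsi_vectors = matutils.corpus2dense(corpus_lsi_2, num_terms=lsi_model_2.num_topics).T

kmeans = KMeans(n_clusters=10, random_state=0)  # 10 clusters is an arbitrary choice
labels = kmeans.fit_predict(lsi_vectors)
for doc_id, label in enumerate(labels):
    print("doc %d -> cluster %d" % (doc_id, label))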
This cannot work well: some docs have no topic weight larger than 0.5, so those docs never get assigned to any topic.
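One way around the threshold problem this comment raises (a sketch, not from the original gist) is to assign each document to its dominant topic by absolute weight, so every non-empty document lands in exactly one group even when no weight exceeds 0.5. LSI weights can be negative, hence the abs().

# Sketch: group by dominant topic instead of a fixed 0.5 threshold.
# Each doc in corpus_lsi_2 is a sparse list of (topic_id, weight) pairs.
from collections import defaultdict

groups = defaultdict(list)
for doc_id, doc in enumerate(corpus_lsi_2):
    if doc:  # skip empty documents, which carry no topic weights at all
        best_topic, _ = max(doc, key=lambda item: abs(item[1]))
        groups[best_topic].append(doc_id)

for tid, doc_ids in sorted(groups.items()):
    print("topic %d: %d docs" % (tid, len(doc_ids)))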