Analyzing books in Project Gutenberg with LDA
import logging
import os
import sys
import zipfile
import multiprocessing
from subprocess import call

from gensim.corpora.textcorpus import TextCorpus
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel
from gensim import utils


def get_list_of_files(root=None, file_ext=None):
    """
    a. traverse directories under root
    b. build a list of file paths that end with the given extension
    c. return the list
    """
    filename_list = []
    for root, dirs, files in os.walk(root):
        for f in files:
            if f.endswith(file_ext):
                filename_list.append(os.path.join(root, f))
    return filename_list


def get_zip_file_size(file_path=None):
    """Return the uncompressed size of the .txt file inside a Gutenberg zip."""
    file_name = os.path.basename(file_path)
    file_name, file_extension = os.path.splitext(file_name)
    return_value = None
    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            for i in zf.infolist():
                if i.filename == file_name + ".txt":
                    return_value = i.file_size
    except Exception:
        e = sys.exc_info()[0]
        print "ERROR:", e, file_path
    return return_value


def get_filtered_zip_files(root=None,
                           file_ext=None,
                           number_of_files=None,
                           min_text_size=None,
                           max_text_size=None):
    """Collect up to number_of_files zip paths whose text size is within range."""
    files_list = get_list_of_files(root, file_ext)
    filtered_list = []
    for file_path in files_list:
        if len(filtered_list) >= number_of_files:
            break
        file_size = get_zip_file_size(file_path)
        if file_size > min_text_size and file_size < max_text_size:
            filtered_list.append(file_path)
    print ">>> number of files:", len(filtered_list)
    return filtered_list


def read_zip_file(file_path=None):
    """
    a. unzip a zip file containing a text file
    b. return the text
    """
    try:
        # FIXME: workaround -- shell out to unzip instead of using the zipfile module
        call(['unzip', '-o', file_path])
    except Exception:
        e = sys.exc_info()[0]
        print ">>> ERROR:", e
    file_name = os.path.basename(file_path)
    file_name, ext = os.path.splitext(file_name)
    unzipped_text_file_name = file_name + ".txt"
    with open(unzipped_text_file_name, 'rb') as fp:
        text = fp.read()
    os.remove(unzipped_text_file_name)
    return text


def process_text(filename):
    """Read one zipped book, decode it, and lemmatize it with gensim."""
    text = read_zip_file(filename)
    if text is not None:
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.lemmatize(text)
    else:
        text = []
    return [filename, text]


class GutenbergCorpus(TextCorpus):
    def __init__(self, input=None):
        self.processes = max(1, multiprocessing.cpu_count())
        self.iteration = 0
        self.filenames = []
        super(GutenbergCorpus, self).__init__(input)

    def get_texts(self):
        self.iteration += 1
        pool = multiprocessing.Pool(self.processes)
        file_names = get_filtered_zip_files(self.input, '.zip', 20000, 100000, 700000)
        for index, item in enumerate(pool.imap(process_text, file_names)):
            print ">> processing", index + 1, "/", len(file_names)
            # TextCorpus iterates get_texts() once to build the dictionary and
            # again to build the BOW corpus; record filenames on the second pass only.
            if self.iteration >= 2:
                self.filenames.append(item[0])
            yield item[1]


DEFAULT_DICT_SIZE = 100000

if __name__ == '__main__':
    import gensim
    import bz2

    # configure logging before the long-running corpus and TF-IDF steps
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    root = '../www.gutenberg.lib.md.us'
    prefix = 'gutenberg'
    gutenberg = GutenbergCorpus(root)
    """
    gutenberg.dictionary.filter_extremes(no_below=10,
                                         no_above=0.2,
                                         keep_n=DEFAULT_DICT_SIZE)
    """
    MmCorpus.serialize(prefix + '_bow.mm', gutenberg, progress_cnt=10000)
    gutenberg.dictionary.save_as_text(prefix + '_wordids.txt.bz2')
    with open('gutenberg_filename.txt', 'wb') as f:
        for filename in gutenberg.filenames:
            print >> f, filename

    dictionary = Dictionary.load_from_text(prefix + '_wordids.txt.bz2')
    del gutenberg

    mm = MmCorpus(prefix + '_bow.mm')
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize(prefix + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
    mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=20, chunksize=100)
    lda.save('gutenberg_idf.model')
import gensim


def get_top_most_topic(topic_rate_pair):
    """Return the (topic id, probability) pair with the highest probability."""
    trp = sorted(topic_rate_pair, key=lambda item: item[1], reverse=True)
    top_most_topic = trp[0]
    return top_most_topic


mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
lda = gensim.models.ldamodel.LdaModel.load('gutenberg_tfidf_lda.model')

doc2topic = lda[mm]
doc_topic_list = []
for topic_rate_pair in doc2topic:
    top_most_topic = get_top_most_topic(topic_rate_pair)
    doc_topic_list.append(top_most_topic)
    print top_most_topic[0]
import logging, gensim, bz2

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')

lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=100,
                                      passes=10)
lda.print_topics()
lda.save('gutenberg_tfidf_lda.model')
Training model
import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=1,
                                      passes=10)
lda.print_topics(20)
lda.save('gutenberg_tfidf_lda.model')
How long does it take to generate tf-idf and topics for 20,000 documents?
The experiment based on revision 18 (20,000 documents) took about 4,500 minutes (roughly 3 days) to build the dictionary and tf-idf, and another 704 minutes to generate the topics.
import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=100,
                                      passes=10)
lda.print_topics(20)
lda.save('gutenberg_tfidf_lda.model')
Challenge
- remove punctuation
- use a lemmatizer or a stemmer (see the preprocessing sketch below)
- size of each text
- tf vs. tf-idf
- processing time (3 days on 4 cores to build tfidf.mm for 20,000 texts)
- number of topics (considering HDP)
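The first two items would go into process_text(). A minimal sketch of an alternative preprocessing step, assuming NLTK is available; the function name alternative_process_text is hypothetical and not part of the gist:

from gensim import utils
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def alternative_process_text(text):
    # simple_preprocess lowercases, tokenizes, and drops punctuation and very short tokens
    tokens = utils.simple_preprocess(text, deacc=True)
    # stem instead of lemmatizing (hypothetical alternative to utils.lemmatize)
    return [stemmer.stem(token) for token in tokens]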
Deciding the number of topics
Reference
- http://mallet.cs.umass.edu/topics.php : the MALLET documentation discusses choosing the number of topics:
--num-topics [NUMBER] The number of topics to use. The best number depends on what you are looking for in the model. The default (10) will provide a broad overview of the contents of the corpus. The number of topics should depend to some degree on the size of the collection, but 200 to 400 will produce reasonably fine-grained results.
Sample code
import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
mallet_path = '/home/sanghee/bin/mallet-2.0.7/bin/mallet'
lda = gensim.models.LdaMallet(mallet_path,
                              corpus=mm,
                              id2word=id2word,
                              num_topics=100)
lda.print_topics()
lda.save('gutenberg_tfidf_lda_mallet.model')
Exploring information via IPython
After building the corpus and the model, we used IPython to explore topics and generate some final data sets, such as the number of documents per topic and a document-to-topic list.
Here are some code snippets.
with open('major_topics.txt', 'wb') as fp:
    for topic in topic_num:
        print >> fp, topic, [item[1] for item in lda.show_topic(topic)]

with open('topics_to_book_titles.txt', 'wb') as fp:
    for topic in topic_num:
        print >> fp, topic, ":", [bt[number] for number in d[topic][:10]]
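The snippets above rely on variables (topic_num, d, bt) that were defined earlier in the IPython session and are not shown in the gist. A minimal sketch of how they could be built from the doc_topic_list computed earlier and the saved filename list; the exact construction is an assumption:

from collections import defaultdict

# bt: "book titles" -- here approximated by the filenames recorded during corpus creation
bt = [line.strip() for line in open('gutenberg_filename.txt')]

# d: topic id -> list of document indices whose top topic is that topic
d = defaultdict(list)
for doc_index, (topic_id, prob) in enumerate(doc_topic_list):
    d[topic_id].append(doc_index)

# topic_num: topic ids ordered by how many documents they dominate,
# which also gives the number of documents per topic via len(d[topic])
topic_num = sorted(d, key=lambda t: len(d[t]), reverse=True)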
Downsize
61,673 documents were used when running revision 14.
file_names = get_filtered_zip_files(self.input, '.zip', 70000, 100, 100000000)
For revision 15 I ran the experiment with 50,000 documents instead of 61,673, because in revision 14 some files were so large that they took too long to process.
file_names = get_filtered_zip_files(self.input, '.zip', 50000, 100000, 1200000)