Gensim tutorials
# Working through @RadimRehurek's Gensim tutorials: http://radimrehurek.com/gensim/index.html
from gensim import corpora, models, similarities
import logging
# Uncomment to see gensim's progress logging:
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Tutorial 1: Acquire and normalise texts, build and serialise dictionary and corpus
# http://radimrehurek.com/gensim/tut1.html
# Using an in-memory list of short documents; for real applications, stream
# documents from disk or another resource (a streamed-corpus sketch follows
# at the end of Tutorial 1).
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
stoplist = set('for a of the and to in'.split())
# Build a tokenised, normalised list of words for each document.
# We could apply stemming here.
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
# Remove words that appear only once in the whole collection.
# (all_tokens.count(word) makes this pass quadratic; a single-pass
# alternative follows below.)
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once]
         for text in texts]
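# A single-pass alternative using collections.Counter - equivalent output to
# the list.count() filter above, shown here for reference:
# from collections import Counter
# frequency = Counter(token for text in texts for token in text)
# texts = [[token for token in text if frequency[token] > 1] for text in texts]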
# Now that we have normalised texts, we need to turn them into a vector
# representation - here we use bag-of-words. Other representations are
# possible, but it is vital to use the same vector space for all computations.
# Build a dictionary - a mapping between words and integer feature IDs
# (it also tracks word frequencies).
# The dictionary object can translate feature id<->word.
dictionary = corpora.Dictionary(texts)
dictionary.save('deerwester.dict')
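# Inspect the mapping between tokens and integer feature IDs:
print(dictionary.token2id)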
# Build a vector space corpus - use the dictionary to translate
# word vectors into sparse feature vectors
# We will use this corpus to train our models.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('deerwester.mm', corpus)
# We could reload it with:
#corpus = corpora.MmCorpus('deerwester.mm')
# TODO: work with a streamed corpus from the filesystem, rather than a full
# in-memory corpus. A sketch of the idea:
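# A minimal sketch of a streamed corpus, assuming a hypothetical file
# 'mycorpus.txt' with one document per line. Gensim only needs an object
# whose __iter__ yields one sparse vector at a time, so the corpus never
# has to fit in memory:
# class StreamedCorpus(object):
#     def __iter__(self):
#         for line in open('mycorpus.txt'):
#             yield dictionary.doc2bow(line.lower().split())
# corpus_streamed = StreamedCorpus()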
# Tutorial 2: Transformations and modelling
# http://radimrehurek.com/gensim/tut2.html
# TFIDF
# Train a model that will "reward" tokens that are distinctive to documents
tfidf = models.TfidfModel(corpus)
# This creates a lazily evaluated wrapper around corpus - it doesn't transform
# the whole corpus at once, because we don't want to load it all into memory.
corpus_tfidf = tfidf[corpus]
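# Peek at the TFIDF weighting of the first document - a sparse list of
# (feature_id, weight) pairs:
print(tfidf[corpus[0]])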
# Vector space model
# Train an LSI (latent semantic indexing) model on our data.
# Gensim has other models (an LDA sketch follows this list):
# > LDA (Latent Dirichlet Allocation)
# > Random Projections (approximates TFIDF distances between documents)
# > HDP (Hierarchical Dirichlet Process) - non-parametric Bayesian method, experimental.
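# A hedged sketch of training an LDA model with the same interface (these
# parameters are illustrative, not tuned):
# lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2)
# corpus_lda = lda[corpus]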
# Train the model (raw bag-of-words vectors also work, but TFIDF input gives
# better results).
# This turns our TFIDF-weighted bag-of-words vectors into a lower-dimensional
# topic vector space - in this case, two dimensions (topics).
lsi_tfidf = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
lsi_tfidf.save('deerwester.lsi_tfidf')
# Add a second lazily evaluated wrapper to our corpus
corpus_lsi_tfidf = lsi_tfidf[corpus_tfidf]
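# As in the tutorial, inspect the learned topics. print_topics reports via the
# logger, so enable the logging.basicConfig line near the top to see the output.
lsi_tfidf.print_topics(2)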
# Part 3 - Similarity
# http://radimrehurek.com/gensim/tut3.html
# Similarity index
# Build an index so we can quickly find documents similar to a query
# MatrixSimilarity is in-memory: use similarities.Similarity
# for scalable similarity (sketched below)
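# A hedged sketch of the scalable variant - it shards the index to disk; the
# '/tmp/deerwester_index' prefix is illustrative:
# big_index = similarities.Similarity('/tmp/deerwester_index', corpus_lsi_tfidf,
#                                     num_features=2)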
#####################################################################
# Non-TFIDF, as in the tutorial
# (we need to retrain the model - the LSI model trained above used the
# TFIDF vector space)
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus]
index = similarities.MatrixSimilarity(corpus_lsi)
index.save('deerwester.index')
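# The saved index can be reloaded later with:
# index = similarities.MatrixSimilarity.load('deerwester.index')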
# Similarity queries
# The end goal! Query all of our documents to find those most similar to an
# input document.
query = "Human computer interaction"
# Transform the query into bag-of-words vector space, then into the LSI
# vector space of our model
query_bow = dictionary.doc2bow(query.lower().split())
query_lsi = lsi[query_bow]
query_similarities = enumerate(index[query_lsi])
sort_key = lambda item: -item[1]  # sort by similarity, descending
sorted_similarities = sorted(query_similarities, key=sort_key)
print("Without TFIDF:")
print("\n".join("%.3f - %s" % (score, documents[doc_id]) for doc_id, score in sorted_similarities))
#####################################################################
# With TFIDF - departing from the tutorial
# (using the lsi_tfidf model we trained earlier)
index = similarities.MatrixSimilarity(corpus_lsi_tfidf)
index.save('deerwester_tfidf.index')
# The query must pass through the same pipeline as the training corpus:
# bag-of-words -> TFIDF -> LSI.
query_lsi_tfidf = lsi_tfidf[tfidf[query_bow]]
query_similarities = enumerate(index[query_lsi_tfidf])
sort_key = lambda item: -item[1]
sorted_similarities = sorted(query_similarities, key=sort_key)
print("-" * 80)
print("With TFIDF:")
print("\n".join("%.3f - %s" % (score, documents[doc_id]) for doc_id, score in sorted_similarities))