-
-
Save aronwc/8248457 to your computer and use it in GitHub Desktop.
""" Example using GenSim's LDA and sklearn. """ | |
import numpy as np | |
from gensim import matutils | |
from gensim.models.ldamodel import LdaModel | |
from sklearn import linear_model | |
from sklearn.datasets import fetch_20newsgroups | |
from sklearn.feature_extraction.text import CountVectorizer | |
def print_features(clf, vocab, n=10):
    """Print the strongest positive and negative features of a linear model.

    Args:
        clf: Fitted classifier exposing ``coef_``; only its first row
            (``coef_[0]``) is inspected.
        vocab: Sequence mapping a feature index to its name.
        n: Number of top-ranked features examined for each sign.

    Features whose weight is zero (or has the wrong sign) are filtered out
    after the top ``n`` are taken, so fewer than ``n`` entries may be printed.
    """
    coef = clf.coef_[0]
    ascending = np.argsort(coef)  # feature indices, smallest weight first

    def describe(indices, keep):
        # Render the kept features as space-separated "name/weight" pairs.
        return ' '.join('%s/%.2f' % (vocab[j], coef[j])
                        for j in indices if keep(coef[j]))

    # Single-argument print(...) behaves identically on Python 2 and 3,
    # unlike the Python 2-only print statement.
    print('positive features: %s'
          % describe(ascending[::-1][:n], lambda w: w > 0))
    print('negative features: %s'
          % describe(ascending[:n], lambda w: w < 0))
def fit_classifier(X, y, C=0.1):
    """Fit an L1-penalised logistic regression classifier.

    Args:
        X: Feature matrix passed straight to ``LogisticRegression.fit``.
        y: Target labels, one per row of ``X``.
        C: Inverse regularisation strength; smaller values select fewer
            features.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    # The solver is pinned because recent scikit-learn releases default to
    # 'lbfgs', which rejects the L1 penalty; 'liblinear' supports it.
    clf = linear_model.LogisticRegression(penalty='l1', C=C,
                                          solver='liblinear')
    clf.fit(X, y)
    return clf
def fit_lda(X, vocab, num_topics=5, passes=20):
    """Fit a gensim LDA model from a scipy sparse document-term matrix.

    Args:
        X: Sparse matrix with one row per document and one column per
            vocabulary term (the layout ``CountVectorizer`` produces).
        vocab: Sequence of terms; position ``i`` names column ``i`` of ``X``.
        num_topics: Number of latent topics to learn.
        passes: Number of training passes over the corpus.

    Returns:
        The trained ``LdaModel``.
    """
    print('fitting lda...')
    # Sparse2Corpus treats *columns* as documents by default. X stores
    # documents as rows, so say so explicitly; otherwise every vocabulary
    # term would be modelled as a document.
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    id2word = dict(enumerate(vocab))
    return LdaModel(corpus, num_topics=num_topics, passes=passes,
                    id2word=id2word)
def print_topics(lda, vocab, n=10):
    """Print the top words of every topic in ``lda``.

    Args:
        lda: Model exposing ``show_topics``; each returned topic is iterated
            as ``(weight, word)`` pairs.
        vocab: Unused; kept so existing callers keep working.
        n: Number of words requested per topic.
    """
    # NOTE(review): the keyword names (topics/topn) and the (weight, word)
    # pair order match the gensim release this script was written against;
    # confirm against the installed gensim version before relying on it.
    topics = lda.show_topics(topics=-1, topn=n, formatted=False)
    for ti, topic in enumerate(topics):
        words = ' '.join('%s/%.2f' % (t[1], t[0]) for t in topic)
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print('topic %d: %s' % (ti, words))
if __name__ == '__main__':
    # Load two newsgroup categories with a fixed seed so the shuffle is
    # reproducible.
    rand = np.random.RandomState(8675309)
    cats = ['rec.sport.baseball', 'sci.crypt']
    data = fetch_20newsgroups(subset='train',
                              categories=cats,
                              shuffle=True,
                              random_state=rand)

    # Bag-of-words counts: rows are documents, columns are vocabulary terms.
    vec = CountVectorizer(min_df=10, stop_words='english')
    X = vec.fit_transform(data.data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older installations.
    if hasattr(vec, 'get_feature_names_out'):
        vocab = list(vec.get_feature_names_out())
    else:
        vocab = vec.get_feature_names()

    # Fit classifier.
    clf = fit_classifier(X, data.target)
    print_features(clf, vocab)

    # Fit LDA.
    lda = fit_lda(X, vocab)
    print_topics(lda, vocab)
Very helpful! Perhaps you should set documents_columns=False
now.
def fit_lda(X, vocab, num_topics=5, passes=20):
""" Fit LDA from a scipy CSR matrix (X). """
print 'fitting lda...'
- return LdaModel(matutils.Sparse2Corpus(X), num_topics=num_topics,
+ return LdaModel(matutils.Sparse2Corpus(X, documents_columns=False), num_topics=num_topics,
passes=passes,
id2word=dict([(i, s) for i, s in enumerate(vocab)]))
in Sparse2Corpus
should set documents_columns=False:
Sparse2Corpus(X, documents_columns=False)
otherwise the columns of X will be treated as documents (when in fact they are features)
or just transpose: Sparse2Corpus(X.T)
Thanks for that @X-Wei, Transposing works
how to predict class for new document?
Updating for Python 3: https://github.com/EricSchles/sklearn_gensim_example/blob/master/example.py
@vinnitu - This is how you do it generally, I'm still trying to figure out how to do this for the feature engineering shown above: https://radimrehurek.com/gensim/models/ldamodel.html
@vinnitu - I figured it out! It turns out gensim assumes a feature engineering, usually bag of words. So as you can see in the above ldamodel - https://radimrehurek.com/gensim/models/ldamodel.html - the bag of words representation is made use of to map from LDA category to the topic model. So the model isn't really aware of the full corpus, but instead is only aware of the compressed version, aka the topics. So, you first have to do the feature engineering, converting the document to bag of words via the dictionary representation, which gensim explains how to do: https://radimrehurek.com/gensim/tut1.html
With this feature transformation in mind, it is possible to recover the categorization to categorize documents within the model. Then if you want to add new topics, you need to rerun the LDA on the new corpus, but then you can categorize new documents. Hopefully this is helpful.
This is so useful right now. Thanks.
Also, it's great to see someone else using the 8675309 seed!