LDA (Latent Dirichlet Allocation) fitting with Python scikit-learn
# derived from http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
# explanations can be found here: https://www.linkedin.com/pulse/dissociating-training-predicting-latent-dirichlet-lucien-tardres

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle

n_features = 50
n_topics = 2

# Training dataset
data_samples = ["I like to eat broccoli and bananas.",
                "I ate a banana and spinach smoothie for breakfast.",
                "Chinchillas and kittens are cute.",
                "My sister adopted a kitten yesterday.",
                "Look at this cute hamster munching on a piece of broccoli."
                ]

# Extract features and vectorize the dataset (bag-of-words term counts)
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

# Save the vocabulary; get_feature_names_out() replaces the older get_feature_names()
dic = tf_vectorizer.get_feature_names_out()

# n_components replaces the n_topics parameter of older scikit-learn releases
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

# Train LDA
lda.fit(tf)

# Save all data necessary for later prediction:
# vocabulary, topic-word distributions, and the document-topic prior
model = (dic, lda.components_, lda.exp_dirichlet_component_, lda.doc_topic_prior_)

with open('outfile', 'wb') as fp:
    pickle.dump(model, fp)
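The LinkedIn article linked above explains how training and prediction can be dissociated using the pickled tuple. Below is a minimal sketch, not part of the original gist, of how that tuple could be reloaded later to infer topics for an unseen document. The new document text and the loaded_* names are illustrative assumptions, and restoring the fitted attributes onto a fresh estimator by hand relies on scikit-learn's internal attribute layout.

# --- prediction sketch: reload the saved model and transform a new document ---
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Reload the tuple saved by the training script ('outfile')
with open('outfile', 'rb') as fp:
    dic, components, exp_dirichlet_component, doc_topic_prior = pickle.load(fp)

# Rebuild a vectorizer locked to the saved vocabulary so column order matches training
loaded_vectorizer = CountVectorizer(vocabulary=dic, stop_words='english')

# Rebuild the estimator and restore its fitted state by hand (relies on sklearn internals)
loaded_lda = LatentDirichletAllocation(n_components=components.shape[0])
loaded_lda.components_ = components
loaded_lda.exp_dirichlet_component_ = exp_dirichlet_component
loaded_lda.doc_topic_prior_ = doc_topic_prior

# Infer the topic distribution of an unseen document (illustrative text)
new_doc = ["My kitten loves munching on broccoli."]
tf_new = loaded_vectorizer.transform(new_doc)
print(loaded_lda.transform(tf_new))  # one row of topic proportions summing to 1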