Last active
April 20, 2018 21:14
-
-
Save naranjja/e73dff6e9d58dd77ac461bf118177654 to your computer and use it in GitHub Desktop.
Simple implementation of topic modelling using NMF and LDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scikit-learn>=0.18.2
nltk>=3.2.3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
from sklearn.decomposition import NMF, LatentDirichletAllocation | |
from nltk.corpus import stopwords | |
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    One line is written to stdout per row of ``model.components_``: the
    row index, a space, then the selected terms joined by single spaces.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()`` (e.g. NumPy arrays).
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of terms to show per topic. Values below
            zero are treated as zero.
    """
    # A negative count would turn the slice below into "everything except
    # the first few entries", so clamp it to zero.
    limit = max(no_top_words, 0)
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending; reverse it so the largest weights come first.
        top_indices = topic.argsort()[::-1][:limit]
        print('{}'.format(topic_idx),
              ' '.join(feature_names[i] for i in top_indices))
def _accepts(estimator_cls, param):
    """Return True when *estimator_cls*'s constructor has a parameter named *param*."""
    import inspect  # local import: only this compatibility probe needs it

    return param in inspect.signature(estimator_cls.__init__).parameters


def _feature_names(vectorizer):
    """Return the fitted vectorizer's terms, using whichever accessor it offers."""
    # Newer scikit-learn releases expose get_feature_names_out(); older ones
    # only have get_feature_names().
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def main(path='/path/to/some/txt/file', encoding='some-encoding',
         language='target-language', no_topics=8, no_top_words=10):
    """Fit NMF and LDA topic models on a text file and print their topics.

    Each line of the file is treated as one document. NMF is fitted on
    TF-IDF features and LDA on raw term counts; the top terms of every
    topic are then printed, NMF first and LDA second.

    Args:
        path: Text file to read, one document per line.
        encoding: Encoding passed to ``open``.
        language: Language name passed to ``nltk.corpus.stopwords.words``.
        no_topics: Number of topics extracted by each model.
        no_top_words: Number of terms printed per topic.
    """
    # Read the file line by line; surrounding whitespace is stripped.
    with open(path, encoding=encoding) as handle:
        transcripts = [line.strip() for line in handle]

    # Fetch the stop-word list once and share it between both vectorizers.
    stop_words = stopwords.words(language)

    # TF-IDF (term frequency / inverse document frequency) features for NMF.
    tfv = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000,
                          stop_words=stop_words)
    tfv_data = tfv.fit_transform(transcripts)
    tfv_features = _feature_names(tfv)

    # Raw term counts for LDA.
    cv = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                         stop_words=stop_words)
    cv_data = cv.fit_transform(transcripts)
    cv_features = _feature_names(cv)

    # Fit NMF. Newer scikit-learn replaced the single `alpha` keyword with
    # alpha_W / alpha_H; pick whichever the installed version accepts.
    # NOTE(review): the regularisation scaling may differ between the two
    # spellings, so results are not guaranteed identical across versions.
    nmf_params = dict(
        n_components=no_topics,
        l1_ratio=.5,
        init='nndsvd',
        random_state=1,
    )
    if _accepts(NMF, 'alpha_W'):
        nmf_params['alpha_W'] = .1  # alpha_H is left at its default
    else:
        nmf_params['alpha'] = .1
    nmf = NMF(**nmf_params).fit(tfv_data)

    # Fit LDA. The topic-count keyword was renamed from `n_topics` to
    # `n_components`; use the new name whenever it is available.
    if _accepts(LatentDirichletAllocation, 'n_components'):
        topic_count_kw = 'n_components'
    else:
        topic_count_kw = 'n_topics'
    lda = LatentDirichletAllocation(
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=1,
        **{topic_count_kw: no_topics}
    ).fit(cv_data)

    # Show results for both algorithms.
    display_topics(nmf, tfv_features, no_top_words)
    display_topics(lda, cv_features, no_top_words)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment