@naranjja
Last active April 20, 2018 21:14
Simple implementation of topic modelling using NMF and LDA
sklearn>=0.18.2
nltk>=3.2.3
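The script below pulls stop-word lists from NLTK's stopwords corpus, which has to be downloaded once before it will run. A minimal one-time setup sketch (the corpus name is the standard NLTK identifier, nothing specific to this gist):

# one-time setup: fetch the NLTK stopwords corpus used by the script below
import nltk
nltk.download('stopwords')
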
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.corpus import stopwords

def display_topics(model, feature_names, no_top_words):
    # for each topic, print its index followed by its top-weighted terms
    for topic_idx, topic in enumerate(model.components_):
        print('{}'.format(topic_idx), ' '.join(
            [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

def main():
    # read the file, one transcript per line, into a list
    transcripts = []
    with open('/path/to/some/txt/file', encoding='some-encoding') as t:
        transcripts = [_.strip() for _ in t.readlines()]
    print(len(transcripts))

    # create a TF-IDF vectorizer for the target language and fit it to the data
    tfv = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000,
                          stop_words=stopwords.words('target-language'))
    tfv_data = tfv.fit_transform(transcripts)
    tfv_features = tfv.get_feature_names()

    # create a count vectorizer for the target language and fit it to the data
    cv = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                         stop_words=stopwords.words('target-language'))
    cv_data = cv.fit_transform(transcripts)
    cv_features = cv.get_feature_names()

    # set number of topics
    no_topics = 8

    # fit NMF
    nmf = NMF(
        n_components=no_topics,
        alpha=.1,
        l1_ratio=.5,
        init='nndsvd',
        random_state=1).fit(tfv_data)

    # fit LDA
    lda = LatentDirichletAllocation(
        n_topics=no_topics,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=1).fit(cv_data)

    # set number of words per topic to display
    no_top_words = 10

    # show results for both algorithms
    display_topics(nmf, tfv_features, no_top_words)
    display_topics(lda, cv_features, no_top_words)


if __name__ == '__main__':
    main()
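
A natural follow-up, not part of the gist, is to look at per-document topic weights via the fitted models' transform methods. A sketch under the assumption that main() is modified to return the fitted models and vectorized matrices (a hypothetical return, not in the original code):

# assumption: main() was changed to end with `return nmf, tfv_data, lda, cv_data`
nmf, tfv_data, lda, cv_data = main()

# one row per document, one column per topic
nmf_doc_topics = nmf.transform(tfv_data)   # non-negative NMF weights
lda_doc_topics = lda.transform(cv_data)    # per-document topic proportions

# index of the dominant topic for each document
print(nmf_doc_topics.argmax(axis=1))
print(lda_doc_topics.argmax(axis=1))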