naranjja · April 20, 2018 21:14
diff --git a/requirements.txt b/requirements.txt
 sklearn>=0.18.2
 nltk>=3.2.3
diff --git a/topic-modelling.py b/topic-modelling.py
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.decomposition import NMF, LatentDirichletAllocation
 from nltk.corpus import stopwords


 def display_topics(model, feature_names, no_top_words):
     """Print one line per topic: its index, then its top-ranked terms.

     Args:
         model: Fitted estimator exposing ``components_``; every row is
             treated as one topic and must support ``argsort()``.
         feature_names: Indexable mapping from a column index to its term.
         no_top_words: Number of terms to print for each topic.
     """
     for topic_no, weights in enumerate(model.components_):
         # Walk the argsort result backwards from its end so the entries
         # that sort last come first in the printed line.
         ranked = weights.argsort()[:-no_top_words - 1:-1]
         top_terms = [feature_names[idx] for idx in ranked]
         print(str(topic_no), ' '.join(top_terms))
      
    
 def _feature_names(vectorizer):
     """Return the fitted vocabulary of *vectorizer* as a list of terms.

     Newer scikit-learn releases expose ``get_feature_names_out`` and have
     dropped ``get_feature_names``; older ones only have the latter, so use
     whichever accessor is present.
     """
     getter = getattr(vectorizer, 'get_feature_names_out', None)
     if getter is None:
         getter = vectorizer.get_feature_names
     return list(getter())


 def main():
     """Fit NMF and LDA topic models on a text file and print their topics.

     Every line of the input file is treated as one document. A TF-IDF
     matrix is fed to NMF and a raw term-count matrix to LDA; the top terms
     of each topic are then printed for both models.

     The file path, the encoding and the stop-word language below look like
     placeholders and have to be replaced before the script can run.
     """
     # read the file, one stripped line per document
     with open('/path/to/some/txt/file', encoding='some-encoding') as t:
         transcripts = [line.strip() for line in t]

     # look the stop-word list up once; both vectorizers share it
     stop_words = stopwords.words('target-language')

     # create a term frequency vectorizer for target language and fit to data
     tfv = TfidfVectorizer(
         max_df=0.95,
         min_df=2,
         max_features=1000,
         stop_words=stop_words)
     tfv_data = tfv.fit_transform(transcripts)
     tfv_features = _feature_names(tfv)

     # create a count vectorizer for target language and fit to data
     cv = CountVectorizer(
         max_df=0.95,
         min_df=2,
         max_features=1000,
         stop_words=stop_words)
     cv_data = cv.fit_transform(transcripts)
     cv_features = _feature_names(cv)

     # set number of topics
     no_topics = 8

     # fit NMF
     # NOTE(review): scikit-learn 1.0 deprecated `alpha` in favour of
     # `alpha_W`/`alpha_H` and 1.2 removed it. The replacements appear to be
     # scaled differently, so the value is not translated here -- confirm the
     # intended regularisation strength before running on 1.2 or newer.
     nmf = NMF(
         n_components=no_topics,
         alpha=.1,
         l1_ratio=.5,
         init='nndsvd',
         random_state=1).fit(tfv_data)

     # fit LDA
     # `n_topics` was renamed to `n_components` and later removed; ask the
     # estimator which spelling it accepts so old and new releases both work
     lda_params = LatentDirichletAllocation().get_params()
     topics_kwarg = 'n_components' if 'n_components' in lda_params else 'n_topics'
     lda = LatentDirichletAllocation(
         max_iter=5,
         learning_method='online',
         learning_offset=50.,
         random_state=1,
         **{topics_kwarg: no_topics}).fit(cv_data)

     # set number of words per topic to display
     no_top_words = 10

     # show results for both algorithms
     display_topics(nmf, tfv_features, no_top_words)
     display_topics(lda, cv_features, no_top_words)
    

 # Run the topic-modelling demo only when this file is executed as a script.
 if __name__ == "__main__":
     main()
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.decomposition import NMF, LatentDirichletAllocation
	from nltk.corpus import stopwords


	def display_topics(model, feature_names, no_top_words):
	    """Print one line per topic: its index, then its top-ranked terms.

	    Args:
	        model: Fitted estimator exposing ``components_``; every row is
	            treated as one topic and must support ``argsort()``.
	        feature_names: Indexable mapping from a column index to its term.
	        no_top_words: Number of terms to print for each topic.
	    """
	    for topic_idx, topic in enumerate(model.components_):
	        # Walk the argsort result backwards from its end so the entries
	        # that sort last come first in the printed line.
	        top_indices = topic.argsort()[:-no_top_words - 1:-1]
	        print('{}'.format(topic_idx),
	              ' '.join([feature_names[i] for i in top_indices]))


	def _feature_names(vectorizer):
	    """Return the fitted vocabulary of *vectorizer* as a list of terms.

	    Newer scikit-learn releases expose ``get_feature_names_out`` and have
	    dropped ``get_feature_names``; older ones only have the latter, so use
	    whichever accessor is present.
	    """
	    getter = getattr(vectorizer, 'get_feature_names_out', None)
	    if getter is None:
	        getter = vectorizer.get_feature_names
	    return list(getter())


	def main():
	    """Fit NMF and LDA topic models on a text file and print their topics.

	    Every line of the input file is treated as one document. A TF-IDF
	    matrix is fed to NMF and a raw term-count matrix to LDA; the top terms
	    of each topic are then printed for both models.

	    The file path, the encoding and the stop-word language below look like
	    placeholders and have to be replaced before the script can run.
	    """
	    # read the file, one stripped line per document
	    with open('/path/to/some/txt/file', encoding='some-encoding') as t:
	        transcripts = [line.strip() for line in t]

	    # look the stop-word list up once; both vectorizers share it
	    stop_words = stopwords.words('target-language')

	    # create a term frequency vectorizer for target language and fit to data
	    tfv = TfidfVectorizer(
	        max_df=0.95,
	        min_df=2,
	        max_features=1000,
	        stop_words=stop_words)
	    tfv_data = tfv.fit_transform(transcripts)
	    tfv_features = _feature_names(tfv)

	    # create a count vectorizer for target language and fit to data
	    cv = CountVectorizer(
	        max_df=0.95,
	        min_df=2,
	        max_features=1000,
	        stop_words=stop_words)
	    cv_data = cv.fit_transform(transcripts)
	    cv_features = _feature_names(cv)

	    # set number of topics
	    no_topics = 8

	    # fit NMF
	    # NOTE(review): scikit-learn 1.0 deprecated `alpha` in favour of
	    # `alpha_W`/`alpha_H` and 1.2 removed it. The replacements appear to be
	    # scaled differently, so the value is not translated here -- confirm the
	    # intended regularisation strength before running on 1.2 or newer.
	    nmf = NMF(
	        n_components=no_topics,
	        alpha=.1,
	        l1_ratio=.5,
	        init='nndsvd',
	        random_state=1).fit(tfv_data)

	    # fit LDA
	    # `n_topics` was renamed to `n_components` and later removed; ask the
	    # estimator which spelling it accepts so old and new releases both work
	    lda_params = LatentDirichletAllocation().get_params()
	    topics_kwarg = 'n_components' if 'n_components' in lda_params else 'n_topics'
	    lda = LatentDirichletAllocation(
	        max_iter=5,
	        learning_method='online',
	        learning_offset=50.,
	        random_state=1,
	        **{topics_kwarg: no_topics}).fit(cv_data)

	    # set number of words per topic to display
	    no_top_words = 10

	    # show results for both algorithms
	    display_topics(nmf, tfv_features, no_top_words)
	    display_topics(lda, cv_features, no_top_words)


	# Run the topic-modelling demo only when this file is executed as a script.
	if __name__ == '__main__':
	    main()