Skip to content

Instantly share code, notes, and snippets.

def get_topics(df, n_components, number_words):
# Clean the lyric texts held in `df` ahead of topic modelling.
# NOTE(review): only the text clean-up steps are visible in this excerpt. The
# code that uses `n_components` and `number_words`, and that produces the five
# values the call sites unpack, is not shown — confirm against the full source.
# Convert `df.lyric` (presumably a DataFrame column of strings) to a plain list
data = df.lyric.values.tolist()
# Remove every whitespace-delimited token that contains '@' (e.g. e-mail
# addresses), together with one trailing whitespace character if present.
# NOTE(review): '\S' and '\s' inside a non-raw string literal are invalid
# escape sequences and raise a warning on recent Python versions; a raw
# string (r'...') would avoid that. Left unchanged here.
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
# Collapse every run of whitespace (newlines included) into a single space
data = [re.sub('\s+', ' ', sent) for sent in data]
# Run get_topics once per lyrics DataFrame (n_components=7, number_words=10)
# and unpack the five returned objects into per-artist names.
(
    lda_model_angra,
    data_vectorized_angra,
    data_angra,
    lda_output_angra,
    vectorizer_angra,
) = get_topics(df_raw_lyrics_angra, n_components=7, number_words=10)

(
    lda_model_sepultura,
    data_vectorized_sepultura,
    data_sepultura,
    lda_output_sepultura,
    vectorizer_sepultura,
) = get_topics(df_raw_lyrics_sepultura, n_components=7, number_words=10)
def get_topic_distribution(df_document_topic):
    """Count how many documents fall under each dominant topic.

    Args:
        df_document_topic: DataFrame with a ``dominant_topic`` column holding
            one topic label per document.

    Returns:
        DataFrame with one row per distinct topic, ordered by descending
        document count, with the columns ``Topic Num``, ``Num Documents``
        and ``perc_per_topic`` (share of all documents, in percent, rounded
        to two decimals).
    """
    topic_counts = df_document_topic['dominant_topic'].value_counts()

    # Name both columns explicitly instead of overwriting whatever labels
    # reset_index() happens to produce; those differ between pandas versions.
    df_topic_distribution = (
        topic_counts
        .rename_axis('Topic Num')
        .reset_index(name='Num Documents')
    )

    total_docs = df_topic_distribution['Num Documents'].sum()
    df_topic_distribution['perc_per_topic'] = np.round(
        (df_topic_distribution['Num Documents'] / total_docs) * 100, 2
    )
    return df_topic_distribution
# Compute the topic distribution for the Angra and the Sepultura
# document/topic tables.
# NOTE(review): the return values are not stored; presumably these lines are
# run as separate notebook cells so the result is displayed — confirm.
get_topic_distribution(df_topic_per_document_angra)
get_topic_distribution(df_topic_per_document_sepultura)
def get_word_topics(vectorizer, lda_model):
# Collect the highest-weighted vocabulary terms of every topic in `lda_model`.
# NOTE(review): in this excerpt the function only defines the nested helper
# `show_topics`; the code that calls the helper and returns a result is not
# visible here — confirm against the full source.
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
# Vocabulary terms as an array, so they can be looked up by position below.
# NOTE(review): presumably `vectorizer` is a scikit-learn vectorizer; if so,
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()` — confirm against the installed version.
keywords = np.array(vectorizer.get_feature_names())
topic_keywords = []
# Each item of `components_` is treated as the term weights of one topic
for topic_weights in lda_model.components_:
# Negating the weights makes the ascending argsort list the largest weights
# first; keep the positions of the first `n_words`
top_keyword_locs = (-topic_weights).argsort()[:n_words]
topic_keywords.append(keywords.take(top_keyword_locs))
# One array of keywords per topic, in the order of `components_`
return topic_keywords
# Extract the topic keywords for the Angra and the Sepultura models.
# NOTE(review): the return values are not stored; presumably these lines are
# run as separate notebook cells so the result is displayed — confirm.
get_word_topics(vectorizer_angra, lda_model_angra)
get_word_topics(vectorizer_sepultura, lda_model_sepultura)
def get_lda_plot(lda_model, data_vectorized, vectorizer):
    """Prepare the pyLDAvis panel for a topic model.

    Args:
        lda_model: Model passed as the first argument to
            ``pyLDAvis.sklearn.prepare``.
        data_vectorized: Vectorized documents passed as the second argument.
        vectorizer: Vectorizer passed as the third argument.

    Returns:
        The object returned by ``pyLDAvis.sklearn.prepare`` (called with
        ``mds='tsne'``).
    """
    # Switch pyLDAvis to notebook output before building the panel.
    pyLDAvis.enable_notebook()

    # NOTE(review): newer pyLDAvis releases renamed the `pyLDAvis.sklearn`
    # module to `pyLDAvis.lda_model` — confirm against the installed version.
    panel = pyLDAvis.sklearn.prepare(
        lda_model,
        data_vectorized,
        vectorizer,
        mds='tsne',
    )
    return panel