Skip to content

Instantly share code, notes, and snippets.

# Run the word-cloud step once per band, in the same order as before.
for _artist in ('angra', 'sepultura'):
    get_wordcloud(df_raw_lyrics, _artist)
def get_lexical_diversity(df, artist):
    """Build the per-word frequency table for one artist's lyrics.

    Args:
        df: DataFrame with at least an ``artist`` and a ``lyric`` column.
        artist: Value compared for equality against the ``artist`` column.

    NOTE(review): the visible part of this function stops right after the
    word-count table is built (no diversity ratio is computed or returned
    here), so as written it returns ``None`` -- confirm against the full
    original before relying on it.
    """
    dataframe = df[df['artist'] == artist]

    # Word stats: whitespace-split all lyrics and count every token.
    # Missing lyrics are dropped so that str.join never sees NaN/None.
    lyrics = dataframe["lyric"].dropna().astype(str)
    word_counts = Counter(" ".join(lyrics).split())

    # One row per distinct word, built directly from the counter instead of
    # creating a one-row-wide frame and transposing it.
    full_text_count = pd.DataFrame(
        list(word_counts.items()), columns=['word', 'qty']
    )

    # Distinct words to include in numerator
# Run the lexical-diversity step once per band, in the same order as before.
for _artist in ('angra', 'sepultura'):
    get_lexical_diversity(df_raw_lyrics, _artist)
def get_word_ngrams_list(df, artist, word_ngram):
    """Define the n-gram frequency helper for one artist's lyrics.

    Args:
        df: Lyrics DataFrame (not read by the visible portion).
        artist: Artist name (not read by the visible portion).
        word_ngram: N-gram size; used as both the lower and upper bound of
            the vectorizer's ``ngram_range``.

    NOTE(review): in the visible code the nested helper is defined but never
    called, and ``df``/``artist`` are unused, so this returns ``None``. The
    remainder of the function appears to be missing from this excerpt --
    confirm against the full original.
    """

    def get_top_word_n_bigram(corpus, n=None):
        """Return the ``n`` most frequent n-grams in ``corpus``.

        The result is a list of ``(ngram, count)`` tuples sorted by count,
        highest first; ``n=None`` keeps the whole list.
        """
        # Presumably scikit-learn's CountVectorizer -- TODO confirm import.
        vec = CountVectorizer(ngram_range=(word_ngram, word_ngram)).fit(corpus)
        bag_of_words = vec.transform(corpus)
        # Sum down the rows: one total per vocabulary column.
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [
            (word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()
        ]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        return words_freq[:n]
# Bigrams then trigrams for each band, in the same call order as before.
for _artist in ('angra', 'sepultura'):
    for _ngram_size in (2, 3):
        get_word_ngrams_list(df_raw_lyrics, _artist, _ngram_size)
# LDA analysis inputs: one lyrics frame per band, selected by exact match
# on the 'artist' column.
df_raw_lyrics_angra = df_raw_lyrics.loc[df_raw_lyrics['artist'].eq('angra')]
df_raw_lyrics_sepultura = df_raw_lyrics.loc[df_raw_lyrics['artist'].eq('sepultura')]