Created
July 3, 2019 10:39
-
-
Save fclesio/8c5c98a120f5ebc50656cb0af6d40287 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data exploration for one specific class (artist) to see its most frequent words
def get_word_frequency(artist):
    """Plot and save the most frequent words in one artist's lyrics.

    Rows of the module-level ``df_raw_lyrics`` frame whose ``artist`` column
    equals ``artist`` are selected, their ``lyric`` text is lemmatized with
    ``nlp``, entries found in ``stoplist`` / ``punctuations`` are removed,
    and the remaining words are counted. The single most frequent word is
    discarded and the next (up to) 30 are drawn as a bar chart, which is
    shown and then written to ``word_frequency_<artist>.png``.

    Args:
        artist: Value compared against ``df_raw_lyrics['artist']``; also
            interpolated into the plot title and the output file name.

    Returns:
        None. If no words remain to plot, a notice is printed and nothing
        is drawn or saved.
    """

    # Word Frequency per Category
    def cleanup_text(docs, logging=False):
        """Return a ``pd.Series`` with one cleaned, space-joined string per doc."""
        texts = []
        for counter, doc in enumerate(docs, start=1):
            if counter % 1000 == 0 and logging:
                print("Processed %d out of %d documents." % (counter, len(docs)))
            doc = nlp(doc, disable=['parser', 'ner'])
            # Skip tokens whose lemma is the '-PRON-' placeholder.
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stoplist and tok not in punctuations]
            texts.append(' '.join(tokens))
        return pd.Series(texts)

    lyrics = list(df_raw_lyrics[df_raw_lyrics['artist'] == artist]['lyric'])
    words = ' '.join(cleanup_text(lyrics)).split()

    # Rank once, then drop the top entry (the original discarded it with
    # pop(0); presumably a noise token -- TODO confirm why).
    ranked = Counter(words).most_common(31)[1:]
    if not ranked:
        # The original raised an opaque IndexError from pop(0) when the
        # artist had no usable words; report it instead of crashing.
        print(f'No words to plot for {artist}.')
        return

    common_words = [word for word, _ in ranked]
    common_counts = [count for _, count in ranked]

    fig = plt.figure(figsize=(18, 6))
    try:
        sns.barplot(x=common_words, y=common_counts)
        plt.title(f'Most Common Words used by {artist}')
        plt.xticks(rotation=45)
        plt.show()
        # NOTE(review): an artist name containing a path separator would
        # make this file name invalid -- confirm whether that can occur.
        fig.savefig(f'word_frequency_{artist}.png', format='png', dpi=500)
    finally:
        # pyplot keeps figures alive until closed; release this one so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment