A "Best of the Best Practices" (BOBP) guide to developing in Python.
- "Build tools for others that you want to be built for you." - Kenneth Reitz
- "Simplicity is always better than functionality." - Pieter Hintjens
| import matplotlib.pyplot as plt | |
| import nltk | |
| import numpy as np | |
| import os | |
| import pandas as pd | |
| import pyLDAvis | |
| import pyLDAvis.sklearn | |
| import random | |
| import re | |
| import seaborn as sns |
def get_language(text):
    """Return the language TextBlob detects for ``text``.

    ``text`` is coerced to ``str`` before being wrapped in a ``TextBlob``,
    so any value taken from a DataFrame cell can be passed in.

    NOTE(review): ``TextBlob.detect_language`` is reported as deprecated in
    recent TextBlob releases -- confirm it is available in the pinned
    version before relying on this helper.
    """
    blob = TextBlob(str(text))
    return blob.detect_language()
# Tag every lyric with its detected language and store it in a new column.
detected_languages = df_raw_lyrics['lyric'].apply(get_language)
df_raw_lyrics['lang'] = detected_languages

# Number of songs per (artist, language) pair.
df_raw_lyrics.groupby(['artist', 'lang']).size().reset_index()
# Keep only the songs detected as English. The explicit copy makes the
# result an independent frame, so the column assignments performed on
# df_raw_lyrics further down do not trigger pandas' SettingWithCopyWarning.
df_raw_lyrics = df_raw_lyrics[df_raw_lyrics['lang'] == 'en'].copy()

# Number of lyrics per (artist, album) pair.
df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()
# Tracks per album: one row per (artist, album) pair, with the group size
# stored directly under the 'qty_tracks' column.
df_albuns = (
    df_raw_lyrics
    .groupby(['artist', 'album'])
    .size()
    .reset_index(name='qty_tracks')
)

# Per artist: number of albums ('size') and mean tracks per album ('mean').
# String aliases are used instead of NumPy callables because newer pandas
# releases emit a FutureWarning when np.mean / np.size are passed to agg().
df_albuns.groupby(['artist']).agg({'qty_tracks': ['size', 'mean']}).reset_index()
# Force every lyric to str so the string operations below cannot fail on
# non-string cells.
df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].astype(str)

# Remove all stop words. The stop list is turned into a set once, up front,
# so each per-token membership test is O(1) instead of a scan of the list.
stopword_lookup = set(stoplist)
df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].apply(
    lambda lyric: ' '.join(
        token for token in lyric.lower().split()
        if token not in stopword_lookup
    )
)

# Quick check
df_raw_lyrics.head(5)
| # Data exploration in some specific class to see the most frequent words | |
| def get_word_frequency(artist): | |
| # Word Frequency per Category | |
| def cleanup_text(docs, logging=False): | |
| texts = [] | |
| counter = 1 | |
| for doc in docs: | |
| if counter % 1000 == 0 and logging: | |
| print("Processed %d out of %d documents." % (counter, len(docs))) |
# Most common words in Angra's lyrics.
get_word_frequency('angra')

# Most common words in Sepultura's lyrics.
get_word_frequency('sepultura')
# Word cloud with most common words
def show_wordcloud(text, artist):
    """Draw a word cloud built from ``text`` on a new matplotlib figure.

    ``text`` is passed to ``WordCloud.generate``; ``artist`` is only used in
    the plot title. Words in the module-level ``stoplist`` are handed to
    WordCloud as its stop words.
    """
    # Configure the cloud first, then feed it the text.
    cloud = WordCloud(stopwords=stoplist, background_color="white")
    wordcloud = cloud.generate(text)

    # Wide canvas with the title on top and the axes hidden.
    fig = plt.figure(figsize=(25, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {artist}', fontsize=20)
    plt.axis("off")