Created
July 3, 2019 10:30
-
-
Save fclesio/eb5fedf5dd00b43fcab4ea955da7744c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import nltk | |
import numpy as np | |
import os | |
import pandas as pd | |
import pyLDAvis | |
import pyLDAvis.sklearn | |
import random | |
import re | |
import seaborn as sns | |
import spacy | |
import string | |
from collections import Counter | |
from PIL import Image | |
from sklearn.decomposition import LatentDirichletAllocation | |
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS | |
from sklearn.feature_extraction.text import CountVectorizer | |
from spacy.lang.en import English | |
from textblob import TextBlob | |
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator | |
# Generate graphs inline in Jupyter | |
%matplotlib inline | |
# Lock random seeds used by libraries | |
random.seed(42) | |
np.random.seed(42) | |
# Define default stopwords list | |
stoplist = ENGLISH_STOP_WORDS | |
# Define function to cleanup text by removing | |
# personal pronouns, stopwords, and puncuation | |
nlp = spacy.load("en_core_web_sm") | |
punctuations = string.punctuation | |
# Datasets | |
filedir = os.path.dirname(os.path.realpath('__file__')) | |
filename = os.path.join(filedir, 'data/rebirth-remains.csv') | |
# Load file | |
df_raw_lyrics = pd.read_csv(filename, index_col=False) | |
df_raw_lyrics.columns = ['index','artist','album','lyric'] | |
# One limitation of the wrapper that I used to get the data | |
# it's that contains a tons of bad records | |
df_raw_lyrics = df_raw_lyrics[pd.notnull(df_raw_lyrics['lyric'])] | |
df_raw_lyrics = df_raw_lyrics[~df_raw_lyrics["lyric"].str.contains("<span style=")] | |
df_raw_lyrics = df_raw_lyrics[~df_raw_lyrics["lyric"].str.contains("padding")] | |
df_raw_lyrics = df_raw_lyrics[~df_raw_lyrics["lyric"].str.contains("<img")] | |
# Basic counters | |
print(f'Qty rows: {df_raw_lyrics.shape[0]}, Qty columns: {df_raw_lyrics.shape[1]}') | |
# First look in the data | |
df_raw_lyrics.head(5) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment