fclesio · July 3, 2019 10:30
diff --git a/imports-data-preprocessing.py b/imports-data-preprocessing.py
 import matplotlib.pyplot as plt
 import nltk
 import numpy as np
 import os 
 import pandas as pd
 import pyLDAvis
 import pyLDAvis.sklearn
 import random
 import re
 import seaborn as sns
 import spacy
 import string
 from collections import Counter
 from PIL import Image
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
 from sklearn.feature_extraction.text import CountVectorizer
 from spacy.lang.en import English
 from textblob import TextBlob
 from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

 # Render matplotlib figures inline in the notebook output. This is an
 # IPython magic, so the cell only runs under IPython/Jupyter.
 %matplotlib inline
    
 # Seed Python's and NumPy's global random generators so runs are repeatable
 random.seed(42)
 np.random.seed(42)

 # Default stopword list: scikit-learn's built-in English stop words
 stoplist = ENGLISH_STOP_WORDS
    
 # Shared text-processing resources: the spaCy "en_core_web_sm" pipeline
 # and the ASCII punctuation characters.
 # NOTE(review): no cleanup function is defined in this cell; presumably a
 # later cell uses these to strip personal pronouns, stopwords, and
 # punctuation -- confirm.
 nlp = spacy.load("en_core_web_sm")
 punctuations = string.punctuation

 # Dataset location. Note the quoted '__file__': realpath() resolves that
 # literal name against the current working directory, so filedir ends up
 # being the (symlink-resolved) working directory.
 filedir = os.path.split(os.path.realpath('__file__'))[0]
 filename = os.path.join(filedir, 'data/rebirth-remains.csv')

 # Read the CSV and give its columns fixed names
 df_raw_lyrics = pd.read_csv(filename, index_col=False)
 df_raw_lyrics.columns = ['index','artist','album','lyric']

 # The wrapper used to collect the data leaves many bad records behind:
 # first drop rows with no lyric, then drop rows whose lyric contains any
 # of the markup fragments below (one filtering pass per fragment).
 df_raw_lyrics = df_raw_lyrics[df_raw_lyrics['lyric'].notna()]
 for _markup in ("<span style=", "padding", "<img"):
     df_raw_lyrics = df_raw_lyrics[~df_raw_lyrics["lyric"].str.contains(_markup)]

 # Report the size of the cleaned frame
 print(f'Qty rows: {df_raw_lyrics.shape[0]}, Qty columns: {df_raw_lyrics.shape[1]}')

 # Preview the first rows; kept as the last expression so the notebook
 # displays it
 df_raw_lyrics.head(5)
	import matplotlib.pyplot as plt
	import nltk
	import numpy as np
	import os
	import pandas as pd
	import pyLDAvis
	import pyLDAvis.sklearn
	import random
	import re
	import seaborn as sns
	import spacy
	import string
	from collections import Counter
	from PIL import Image
	from sklearn.decomposition import LatentDirichletAllocation
	from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
	from sklearn.feature_extraction.text import CountVectorizer
	from spacy.lang.en import English
	from textblob import TextBlob
	from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

	# Render matplotlib figures inline in the notebook output. This is an
	# IPython magic, so the cell only runs under IPython/Jupyter.
	%matplotlib inline

	# Seed Python's and NumPy's global random generators so runs are repeatable
	random.seed(42)
	np.random.seed(42)

	# Default stopword list: scikit-learn's built-in English stop words
	stoplist = ENGLISH_STOP_WORDS

	# Shared text-processing resources: the spaCy "en_core_web_sm" pipeline
	# and the ASCII punctuation characters.
	# NOTE(review): no cleanup function is defined in this cell; presumably a
	# later cell uses these to strip personal pronouns, stopwords, and
	# punctuation -- confirm.
	nlp = spacy.load("en_core_web_sm")
	punctuations = string.punctuation

	# Dataset location. Note the quoted '__file__': realpath() resolves that
	# literal name against the current working directory, so filedir ends up
	# being the (symlink-resolved) working directory.
	filedir = os.path.split(os.path.realpath('__file__'))[0]
	filename = os.path.join(filedir, 'data/rebirth-remains.csv')

	# Read the CSV and give its columns fixed names
	df_raw_lyrics = pd.read_csv(filename, index_col=False)
	df_raw_lyrics.columns = ['index','artist','album','lyric']

	# The wrapper used to collect the data leaves many bad records behind:
	# first drop rows with no lyric, then drop rows whose lyric contains any
	# of the markup fragments below (one filtering pass per fragment).
	df_raw_lyrics = df_raw_lyrics[df_raw_lyrics['lyric'].notna()]
	for _markup in ("<span style=", "padding", "<img"):
		df_raw_lyrics = df_raw_lyrics[~df_raw_lyrics["lyric"].str.contains(_markup)]

	# Report the size of the cleaned frame
	print(f'Qty rows: {df_raw_lyrics.shape[0]}, Qty columns: {df_raw_lyrics.shape[1]}')

	# Preview the first rows; kept as the last expression so the notebook
	# displays it
	df_raw_lyrics.head(5)