Flavio Clesio fclesio

The Best of the Best Practices (BOBP) Guide for Python

A "Best of the Best Practices" (BOBP) guide to developing in Python.

	import matplotlib.pyplot as plt
	import nltk
	import numpy as np
	import os
	import pandas as pd
	import pyLDAvis
	import pyLDAvis.sklearn
	import random
	import re
	import seaborn as sns

	def get_language(text):
	    """Return the language code that TextBlob detects for ``text``.

	    The value is passed through ``str()`` first, so a non-string input
	    (for example a missing value read from a DataFrame column) does not
	    break the ``TextBlob`` constructor.

	    NOTE(review): ``TextBlob.detect_language`` appears to be deprecated in
	    newer TextBlob releases -- confirm against the installed version.
	    """
	    blob = TextBlob(str(text))
	    return blob.detect_language()

	# Detect the language of every lyric and store it in a new 'lang' column
	df_raw_lyrics['lang'] = df_raw_lyrics['lyric'].apply(get_language)

	# Show stats about the language per artist
	df_raw_lyrics.groupby(['artist', 'lang']).size().reset_index()

	# Keep only the English songs. The explicit copy makes the result an
	# independent DataFrame, so assigning to its columns afterwards does not
	# trigger pandas' SettingWithCopyWarning.
	df_raw_lyrics = df_raw_lyrics[df_raw_lyrics['lang'] == 'en'].copy()

	# Lyrics per album
	df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()

	# Number of tracks per (artist, album); reset_index(name=...) labels the
	# count column directly instead of renaming all columns afterwards
	df_albuns = (
	    df_raw_lyrics
	    .groupby(['artist', 'album'])
	    .size()
	    .reset_index(name='qty_tracks')
	)

	# Number of albums and average tracks per album for every artist.
	# The string aliases replace np.size / np.mean, which recent pandas
	# versions flag with a FutureWarning when passed to agg().
	df_albuns.groupby(['artist']).agg({'qty_tracks': ['size', 'mean']}).reset_index()

	# Cast the lyrics to str so the string operations below do not fail on
	# non-string values
	df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].astype(str)

	# Remove all stopwords from the lower-cased lyrics. The stop list is turned
	# into a set once, so each membership test is a hash lookup rather than a
	# scan (relevant if stoplist is a list).
	_stopword_set = set(stoplist)
	df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].apply(
	    lambda lyric: ' '.join(
	        word for word in lyric.lower().split() if word not in _stopword_set
	    )
	)

	# Quick check
	df_raw_lyrics.head(5)

	# Data exploration in some specific class to see the most frequent words
	# NOTE(review): only the signature of get_word_frequency is visible in this
	# excerpt and the snippet's indentation has been flattened; cleanup_text
	# below is presumably a helper nested inside it -- confirm against the
	# full source.
	def get_word_frequency(artist):

	# Word Frequency per Category
	# cleanup_text iterates over `docs` (which must also support len()) and
	# takes an optional `logging` flag that enables a progress message.
	def cleanup_text(docs, logging=False):
	texts = []
	counter = 1
	for doc in docs:
	# Report progress whenever the counter is a multiple of 1000, but only
	# when logging is enabled
	if counter % 1000 == 0 and logging:
	print("Processed %d out of %d documents." % (counter, len(docs)))
	# NOTE(review): the excerpt stops here -- no visible line increments
	# `counter`, appends to `texts` or returns a value; presumably elided.

	# Most Common words: Sepultura
	get_word_frequency('sepultura')

	# Word cloud with most common words
	# show_wordcloud builds a word cloud from `text` and draws it on a
	# matplotlib figure titled with `artist`.
	# NOTE(review): the snippet's indentation has been flattened and the
	# excerpt may end before the function does -- confirm against the full
	# source.
	def show_wordcloud(text, artist):
	# Create and generate a word cloud image:
	# (`stoplist` is passed as the stopwords; the background is white)
	wordcloud = WordCloud(stopwords=stoplist, background_color="white").generate(text)

	# Display the generated image:
	# `fig` is not referenced again in the visible lines
	fig = plt.figure(figsize=(25,10))
	plt.imshow(wordcloud, interpolation='bilinear')
	plt.title(f'Word Cloud for {artist}', fontsize=20)
	# Hide the axes around the image
	plt.axis("off")