Created
April 7, 2022 17:57
-
-
Save gitryder/6c7fb7261de0e4b6d6d9814a29bb330a to your computer and use it in GitHub Desktop.
Rahul Spotify
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
import plotly.express as px | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
from sklearn.cluster import KMeans | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.pipeline import Pipeline | |
from sklearn.manifold import TSNE | |
from sklearn.decomposition import PCA | |
from sklearn.metrics import euclidean_distances | |
from scipy.spatial.distance import cdist | |
import warnings | |
warnings.filterwarnings("ignore") | |
# Load the three CSV files the rest of the notebook works on. The directory
# is named once so it only has to be changed in one place.
DATA_DIR = "../content/drive/MyDrive/data"
data = pd.read_csv(os.path.join(DATA_DIR, "data.csv"))
genre_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_genres.csv"))
year_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_year.csv"))

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only added a stray "None" line after each summary.
data.info()
genre_data.info()
year_data.info()
from yellowbrick.target import FeatureCorrelation

# Columns of `data` whose correlation with 'popularity' is visualised.
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
    'duration_ms', 'explicit', 'key', 'mode', 'year',
]
X = data[feature_names]
y = data['popularity']

# The visualizer is given the feature names as a NumPy array of labels.
features = np.array(feature_names)
visualizer = FeatureCorrelation(labels=features)

# Enlarge the default figure size before the visualizer is fitted and shown.
plt.rcParams['figure.figsize'] = (20, 20)
visualizer.fit(X, y)
visualizer.show()
def get_decade(year):
    """Return the decade label for ``year``, e.g. ``1994`` -> ``'1990s'``."""
    # int() truncates the quotient, which discards the last digit of the year.
    decade_start = int(year / 10) * 10
    return str(decade_start) + 's'
# Tag every row with its decade label, then plot the number of rows per decade.
data['decade'] = data['year'].apply(get_decade)
sns.set(rc={'figure.figsize': (11, 6)})
# Name the column through x=/data= rather than passing the Series
# positionally: seaborn deprecated positional variables (only `data` may be
# positional in newer releases), so the positional form warns or is
# interpreted as wide-form data depending on the installed version.
sns.countplot(x='decade', data=data)
# Draw one line per listed column of `year_data`, with 'year' on the x axis.
sound_features = [
    'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'valence',
]
fig = px.line(data_frame=year_data, x='year', y=sound_features)
fig.show()
# Take the ten rows of `genre_data` with the highest 'popularity' and show
# four of their feature columns as grouped bars, one group per genre.
top10_genres = genre_data.nlargest(n=10, columns='popularity')
fig = px.bar(
    data_frame=top10_genres,
    x='genres',
    y=['valence', 'energy', 'danceability', 'acousticness'],
    barmode='group',
)
fig.show()
from sklearn.cluster import KMeans | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.pipeline import Pipeline | |
# Standardise the numeric genre columns and group the genres into 10
# k-means clusters.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10)),
])
# The 'cluster' column written below is itself numeric, so on a second run of
# this cell select_dtypes would feed the previous labels back in as a
# feature. Dropping it (if present) keeps the cell safe to re-run.
X = genre_data.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)
# Visualise the genre clusters in two dimensions with t-SNE
# (t-distributed stochastic neighbor embedding).
from sklearn.manifold import TSNE

tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=1)),
])
genre_embedding = tsne_pipeline.fit_transform(X)

# Pair each 2-D point with its genre name and cluster id for the scatter plot.
projection = pd.DataFrame(genre_embedding, columns=['x', 'y'])
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(
    data_frame=projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'genres'],
)
fig.show()
# Standardise the numeric song columns and group the songs into 20 k-means
# clusters. The fitted scaler is reused later by recommend_songs.
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, verbose=False)),
    ],
    verbose=False,
)
# 'cluster_label' (written below) is numeric, so a second run of this cell
# would otherwise include it as a feature and change the number of columns the
# scaler is fitted on. Dropping it (if present) keeps the cell safe to re-run.
X = data.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels
# Visualise the song clusters in two dimensions with PCA
# (principal component analysis).
from sklearn.decomposition import PCA

pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# Pair each 2-D point with its song title and cluster id for the scatter plot.
projection = pd.DataFrame(song_embedding, columns=['x', 'y'])
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

fig = px.scatter(
    data_frame=projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
)
fig.show()
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# SECURITY: API credentials must not be embedded in the source. Any client
# id/secret that has ever been committed or published should be treated as
# leaked and rotated in the Spotify developer dashboard.
# Called without arguments, SpotifyClientCredentials reads the
# SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables, so set
# those before running this cell.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())
def find_song(name, year):
    """Search Spotify for a track and return its data as a one-row DataFrame.

    Args:
        name: Track title to search for.
        year: Release year to search for.

    Returns:
        A single-row ``pandas.DataFrame`` with ``name``, ``year``,
        ``explicit``, ``duration_ms`` and ``popularity`` plus every field of
        the track's audio-features record, or ``None`` when the search has no
        hit or no audio features are available for the hit.
    """
    # NOTE(review): the query keeps a space after each field name
    # ('track: ', 'year: '); confirm against the Spotify search syntax that
    # this is still applied as a field filter.
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track['id'])[0]
    # Guard against a missing audio-features entry; calling .items()/.update()
    # on None would otherwise raise instead of reporting "not found".
    if audio_features is None:
        return None

    # List-valued entries make pandas build exactly one row; the scalar
    # audio-feature values added afterwards are broadcast onto that row.
    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    song_data.update(audio_features)
    return pd.DataFrame(song_data)
from collections import defaultdict | |
from sklearn.metrics import euclidean_distances | |
from scipy.spatial.distance import cdist | |
import difflib | |
# Numeric columns that make up a song's feature vector. recommend_songs passes
# these columns through the scaler taken from song_cluster_pipeline, so the
# list has to line up with the columns that scaler was fitted on.
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]
def get_song_data(song, spotify_data):
    """Return the data for ``song``, preferring the local dataset.

    ``song`` is a mapping with ``'name'`` and ``'year'`` entries. The first
    row of ``spotify_data`` matching both is returned; when no row matches,
    the lookup is delegated to ``find_song``.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]
    if matches.empty:
        # Nothing in the local dataset: fall back to the Spotify lookup.
        return find_song(song['name'], song['year'])
    return matches.iloc[0]
def get_mean_vector(song_list, spotify_data):
    """Return the mean feature vector of the songs in ``song_list``.

    Args:
        song_list: Iterable of mappings with ``'name'`` and ``'year'`` entries.
        spotify_data: DataFrame searched for each song before falling back to
            the Spotify lookup (see ``get_song_data``).

    Returns:
        A 1-D float array with one entry per column in ``number_cols``,
        averaged over every song that could be resolved.

    Raises:
        ValueError: If none of the songs could be resolved, since a mean over
            zero vectors would silently be NaN.
    """
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data returns a Series for a dataset hit but find_song's
        # one-row DataFrame otherwise, so the raw values are 1-D in one case
        # and (1, n) in the other. Flattening gives every song the same shape
        # so they stack into a regular 2-D matrix.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs could be found in Spotify or in database')

    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)
def flatten_dict_list(dict_list):
    """Merge a list of dicts into one dict of lists.

    Args:
        dict_list: Sequence of dicts, e.g. ``[{'name': 'a'}, {'name': 'b'}]``.

    Returns:
        A ``defaultdict(list)`` mapping each key seen in any of the dicts to
        the list of its values, in input order, e.g. ``{'name': ['a', 'b']}``.
        An empty ``dict_list`` yields an empty mapping.
    """
    # A list factory lets a key that is missing from the first dict be
    # collected too, instead of raising KeyError, and removes the need to
    # index dict_list[0] (which failed on an empty list).
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
    return flattened_dict
def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend songs from ``spotify_data`` that are closest to ``song_list``.

    The seed songs are averaged into one feature vector, which is scaled with
    the scaler from ``song_cluster_pipeline`` and compared to every scaled row
    of ``spotify_data`` by cosine distance.

    Args:
        song_list: List of dicts with ``'name'`` and ``'year'`` entries.
        spotify_data: DataFrame containing the ``number_cols`` feature columns
            and the ``name``, ``year`` and ``artists`` columns.
        n_songs: Maximum number of recommendations to return.

    Returns:
        A list of ``{'name', 'year', 'artists'}`` dicts, nearest first,
        excluding any row whose name matches one of the seed songs.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)
    song_center = get_mean_vector(song_list, spotify_data)

    # Reuse the scaler fitted with the clustering pipeline so the seed vector
    # and the dataset rows are standardised identically.
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')

    # Rank all rows by distance and remove the seed songs BEFORE truncating.
    # Truncating first returned fewer than n_songs whenever a seed song was
    # among its own nearest neighbours, which is the usual case.
    ranked = np.argsort(distances[0])
    is_seed = spotify_data['name'].isin(song_dict['name']).values
    index = ranked[~is_seed[ranked]][:n_songs]

    rec_songs = spotify_data.iloc[index]
    return rec_songs[metadata_cols].to_dict(orient='records')
# Example run: request recommendations for five seed songs, using the full
# `data` frame as the candidate pool and the default number of results.
recommend_songs(
    song_list=[
        {'name': 'Come As You Are', 'year': 1991},
        {'name': 'Smells Like Teen Spirit', 'year': 1991},
        {'name': 'Lithium', 'year': 1992},
        {'name': 'All Apologies', 'year': 1993},
        {'name': 'Stay Away', 'year': 1993},
    ],
    spotify_data=data,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment