Created
July 1, 2023 14:02
-
-
Save dfreelon/a70c85796dd6f450d03f4d06ceb55739 to your computer and use it in GitHub Desktop.
The code I used to create the #BlackMusicMonthChallenge top tracks playlists for Spotify and YouTube. The code requires valid Spotify and YouTube API credentials.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# run the following two snscrape commands from a CLI first -- you'll need the dev version from https://github.com/JustAnotherArchivist/snscrape
# future replicators: I recommend using Twitter's "since" and "until" date operators. I used snscrape's built in "since" operator because I ran it on June 30 so didn't need "until"
# for replies: snscrape --since 2023-05-31 --jsonl twitter-search @naima >@naima_replies_all.json
# for QTs: snscrape --since 2023-05-31 --jsonl twitter-search quoted_user_id:78417631 >@naima_qts_all.json
import html | |
import json | |
import pandas as pd | |
from pyyoutube import Api | |
import re | |
import spotipy | |
from spotipy.oauth2 import SpotifyClientCredentials | |
spotify_client_id = "" #you'll need proper Spotify and YT creds, obv
spotify_client_secret = ""
yt_api_key = ''
# Raw strings so the backslash escapes reach the regex engine intact --
# the original non-raw '\?' / '\"' sequences raise invalid-escape
# DeprecationWarnings on modern Python.
# capture the video ID from youtube.com/watch?v=<id> URLs
yt_long_regex = r'(?:v=)(.+?)(?:&|$|\?)'
# capture the video ID from youtu.be/<id> short URLs ('.' now escaped so
# the pattern can't match look-alike hosts)
yt_short_regex = r'(?:youtu\.be/)(.+?)(?:&|$|/|\?)'
# everything from the first "unexpected" character onward (used to strip
# trailing junk such as "(Official Video)" from titles)
yt_extra_regex = r'[^A-Za-z0-9-\':\",.& ].+'
# zero-width boundary between a lowercase and an uppercase letter, for
# splitting CamelCase channel names (the original's empty (?:) group was a no-op)
vevo_user_regex = '(?<=[a-z])(?=[A-Z])'
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth',50)
def prep_field(field):
    """Normalize free text for fuzzy matching: uppercase, drop punctuation,
    join words with underscores, and remove THE_/AND_ connector fragments."""
    upper = field.upper()
    letters_only = re.sub('[^A-Z0-9_ ]', '', upper)
    underscored = '_'.join(letters_only.strip().split(' '))
    # single re.sub pass (not chained .replace) so overlapping fragments
    # are removed exactly as one left-to-right scan would
    return re.sub('THE_|AND_', '', underscored)
# load the scraped replies and QTs (one JSON tweet object per line);
# context managers so the file handles are actually closed
with open('@naima_replies_all.json') as f:
    naima = f.read().splitlines()
with open('@naima_qts_all.json') as f:
    naima.extend(f.read().splitlines())
#remove dupe tweets -- the two scrapes can overlap, so keep only the
#first occurrence of each tweet ID
n2 = []
seen_ids = set()  # set membership is O(1) vs O(n) for the original list
for line in naima:
    tweet = json.loads(line)
    if tweet['id'] not in seen_ids:
        n2.append(tweet)
        seen_ids.add(tweet['id'])
#parse out links into spotify and youtube
yt = []    # YouTube video IDs
spot = []  # full Spotify track URLs
for tweet in n2:
    if tweet['links'] is None:
        continue
    for link in tweet['links']:
        url = link['url']
        if 'open.spotify.com/track/' in url:
            spot.append(url)
        elif 'youtube.com/watch' in url:
            # guard against watch URLs with no v= parameter -- the
            # original findall(...)[0] raised IndexError on those
            match = re.search(yt_long_regex, url)
            if match:
                yt.append(match.group(1))
        elif 'youtu.be' in url:
            match = re.search(yt_short_regex, url)
            if match:
                yt.append(match.group(1))
#spotify processing: resolve each collected track URL to full metadata
sp = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=spotify_client_id,
        client_secret=spotify_client_secret))
#spotify API makes you split your URLs into groups of 50, so fetch in
#batches by stepping through the list 50 at a time
spot_data = []
for start in range(0, len(spot), 50):
    batch = spot[start:start + 50]
    spot_data.extend(sp.tracks(batch)['tracks'])
# flatten the Spotify responses into [artist, title, url, source] rows,
# skipping tracks the API returned no data for
spotify_list = []
for idx, track in enumerate(spot_data):
    if track is None:
        print('No data for Spotify track #',idx)
        continue
    artist = prep_field(track['artists'][0]['name'])
    # cut the title at the first " - " suffix or unexpected character
    # (drops remix/live/feat. decorations) before normalizing
    title = prep_field(re.sub('( - |[^A-Za-z0-9\',& ]).+','',track['name']))
    spotify_list.append([artist, title, spot[idx], 'spotify'])
#youtube processing: look up each video's title and channel name
yt_list = []
yt_ids = []
yt_users = []
api = Api(api_key=yt_api_key)
for vid in yt:
    video_json = json.loads(api.get_video_by_id(video_id=vid).to_json())
    try:
        snippet = video_json['items'][0]['snippet']
    except IndexError:
        # deleted/private videos come back with an empty items list
        print('skipped ID',vid)
        continue
    channel = snippet['channelTitle'].replace('VEVO','').replace(' - Topic','')
    if ' ' not in channel:
        # single-word channels are often CamelCase artist names
        # (e.g. "TaylorSwift"); re-insert the spaces
        channel = ' '.join(re.split(vevo_user_regex, channel))
    yt_list.append(snippet['title'])
    yt_users.append(channel)
    yt_ids.append(vid)
#youtube split into artist/title
yt_list2 = []
for idx, raw_title in enumerate(yt_list):
    parts = raw_title.split(' - ')
    if len(parts) > 1:
        # VEVO-style "Artist - Title" metadata: use the two halves directly
        artist = prep_field(parts[0])
        title = prep_field(re.sub(yt_extra_regex,'',parts[1]))
    else:
        # no separator: fall back to the channel title as the artist name
        # (extra-junk regex applied to the channel only, as before)
        artist = prep_field(re.sub(yt_extra_regex,'',yt_users[idx]))
        title = prep_field(parts[0])
    yt_list2.append([artist, title, yt_ids[idx]])
#merge yt and spotify data into one artist/title/id/type frame
sp_df = pd.DataFrame(spotify_list)
yt_df = pd.DataFrame(yt_list2)
yt_df[3] = 'youtube'  # spotify rows already carry their source tag
all_df = pd.concat([sp_df, yt_df]).reset_index(drop=True)
all_df.columns = ['artist', 'title', 'id', 'type']
# ARTIST_TITLE key used for de-duping and text matching; trim the leading
# underscore left behind when the artist field came out empty
all_df['joined'] = all_df.artist + '_' + all_df.title
all_df['joined'] = all_df.joined.apply(lambda s: s[1:] if s[0] == '_' else s)
uniq_df = all_df[['artist', 'title', 'joined']].drop_duplicates('joined')
uniq_list = uniq_df.values.tolist()
# link-based mention counts keyed by the joined field and by artist
song_dict = all_df.joined.value_counts().to_dict()
artist_dict = all_df.artist.value_counts().to_dict()
#add in tweet text mentions: a tweet that names both the artist and the
#title in plain text (no link) still counts toward that song and artist
tweet_text = [prep_field(html.unescape(tweet['renderedContent'])) for tweet in n2]
for text in tweet_text:
    for artist, title, key in uniq_list:
        if artist in text and title in text:
            song_dict[key] += 1
            artist_dict[artist] += 1
# rank songs by total mention count, highest first
final_df = pd.DataFrame(list(song_dict.items()))
final_df.columns = ['song', 'ct']
final_df = final_df.sort_values('ct', ascending=False).reset_index(drop=True)
# final_df[final_df.ct >= 12].to_csv('top_BMMC_tracks.csv',index=False)
# if you want to create a playlist or dataviz from this, you should remove duplicates first
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment