Created
July 1, 2023 14:02
-
-
Save dfreelon/a70c85796dd6f450d03f4d06ceb55739 to your computer and use it in GitHub Desktop.
The code I used to create the #BlackMusicMonthChallenge top tracks playlists for Spotify and YouTube. The code requires valid Spotify and YouTube API credentials.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# run the following two snscrape commands from a CLI first -- you'll need the dev version from https://github.com/JustAnotherArchivist/snscrape
# future replicators: I recommend using Twitter's "since" and "until" date operators. I used snscrape's built in "since" operator because I ran it on June 30 so didn't need "until"
# for replies: snscrape --since 2023-05-31 --jsonl twitter-search @naima >@naima_replies_all.json
# for QTs: snscrape --since 2023-05-31 --jsonl twitter-search quoted_user_id:78417631 >@naima_qts_all.json
import html | |
import json | |
import pandas as pd | |
from pyyoutube import Api | |
import re | |
import spotipy | |
from spotipy.oauth2 import SpotifyClientCredentials | |
spotify_client_id = "" #you'll need proper Spotify and YT creds, obv
spotify_client_secret = ""
yt_api_key = ''
# Raw strings so the backslash escapes reach the regex engine intact --
# the original non-raw '\?' / '\"' sequences raise invalid-escape
# DeprecationWarnings on modern Python.
# capture the video ID from youtube.com/watch?v=<id> URLs
yt_long_regex = r'(?:v=)(.+?)(?:&|$|\?)'
# capture the video ID from youtu.be/<id> short URLs ('.' now escaped so
# the pattern can't match look-alike hosts)
yt_short_regex = r'(?:youtu\.be/)(.+?)(?:&|$|/|\?)'
# everything from the first "unexpected" character onward (used to strip
# trailing junk such as "(Official Video)" from titles)
yt_extra_regex = r'[^A-Za-z0-9-\':\",.& ].+'
# zero-width boundary between a lowercase and an uppercase letter, for
# splitting CamelCase channel names (the original's empty (?:) group was a no-op)
vevo_user_regex = '(?<=[a-z])(?=[A-Z])'
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth',50)
def prep_field(field):
    """Normalize free text for fuzzy matching: uppercase, drop punctuation,
    join words with underscores, and remove THE_/AND_ connector fragments."""
    upper = field.upper()
    letters_only = re.sub('[^A-Z0-9_ ]', '', upper)
    underscored = '_'.join(letters_only.strip().split(' '))
    # single re.sub pass (not chained .replace) so overlapping fragments
    # are removed exactly as one left-to-right scan would
    return re.sub('THE_|AND_', '', underscored)
# load the scraped replies and QTs (one JSON tweet object per line);
# context managers so the file handles are actually closed
with open('@naima_replies_all.json') as f:
    naima = f.read().splitlines()
with open('@naima_qts_all.json') as f:
    naima.extend(f.read().splitlines())
#remove dupe tweets -- the two scrapes can overlap, so keep only the
#first occurrence of each tweet ID
n2 = []
seen_ids = set()  # set membership is O(1) vs O(n) for the original list
for line in naima:
    tweet = json.loads(line)
    if tweet['id'] not in seen_ids:
        n2.append(tweet)
        seen_ids.add(tweet['id'])
#parse out links into spotify and youtube
yt = []    # YouTube video IDs
spot = []  # full Spotify track URLs
for tweet in n2:
    if tweet['links'] is None:
        continue
    for link in tweet['links']:
        url = link['url']
        if 'open.spotify.com/track/' in url:
            spot.append(url)
        elif 'youtube.com/watch' in url:
            # guard against watch URLs with no v= parameter -- the
            # original findall(...)[0] raised IndexError on those
            match = re.search(yt_long_regex, url)
            if match:
                yt.append(match.group(1))
        elif 'youtu.be' in url:
            match = re.search(yt_short_regex, url)
            if match:
                yt.append(match.group(1))
#spotify processing: resolve each collected track URL to full metadata
sp = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=spotify_client_id,
        client_secret=spotify_client_secret))
#spotify API makes you split your URLs into groups of 50, so fetch in
#batches by stepping through the list 50 at a time
spot_data = []
for start in range(0, len(spot), 50):
    batch = spot[start:start + 50]
    spot_data.extend(sp.tracks(batch)['tracks'])
# flatten the Spotify responses into [artist, title, url, source] rows,
# skipping tracks the API returned no data for
spotify_list = []
for idx, track in enumerate(spot_data):
    if track is None:
        print('No data for Spotify track #',idx)
        continue
    artist = prep_field(track['artists'][0]['name'])
    # cut the title at the first " - " suffix or unexpected character
    # (drops remix/live/feat. decorations) before normalizing
    title = prep_field(re.sub('( - |[^A-Za-z0-9\',& ]).+','',track['name']))
    spotify_list.append([artist, title, spot[idx], 'spotify'])
#youtube processing: look up each video's title and channel name
yt_list = []
yt_ids = []
yt_users = []
api = Api(api_key=yt_api_key)
for vid in yt:
    video_json = json.loads(api.get_video_by_id(video_id=vid).to_json())
    try:
        snippet = video_json['items'][0]['snippet']
    except IndexError:
        # deleted/private videos come back with an empty items list
        print('skipped ID',vid)
        continue
    channel = snippet['channelTitle'].replace('VEVO','').replace(' - Topic','')
    if ' ' not in channel:
        # single-word channels are often CamelCase artist names
        # (e.g. "TaylorSwift"); re-insert the spaces
        channel = ' '.join(re.split(vevo_user_regex, channel))
    yt_list.append(snippet['title'])
    yt_users.append(channel)
    yt_ids.append(vid)
#youtube split into artist/title
yt_list2 = []
for idx, raw_title in enumerate(yt_list):
    parts = raw_title.split(' - ')
    if len(parts) > 1:
        # VEVO-style "Artist - Title" metadata: use the two halves directly
        artist = prep_field(parts[0])
        title = prep_field(re.sub(yt_extra_regex,'',parts[1]))
    else:
        # no separator: fall back to the channel title as the artist name
        # (extra-junk regex applied to the channel only, as before)
        artist = prep_field(re.sub(yt_extra_regex,'',yt_users[idx]))
        title = prep_field(parts[0])
    yt_list2.append([artist, title, yt_ids[idx]])
#merge yt and spotify data into one artist/title/id/type frame
sp_df = pd.DataFrame(spotify_list)
yt_df = pd.DataFrame(yt_list2)
yt_df[3] = 'youtube'  # spotify rows already carry their source tag
all_df = pd.concat([sp_df, yt_df]).reset_index(drop=True)
all_df.columns = ['artist', 'title', 'id', 'type']
# ARTIST_TITLE key used for de-duping and text matching; trim the leading
# underscore left behind when the artist field came out empty
all_df['joined'] = all_df.artist + '_' + all_df.title
all_df['joined'] = all_df.joined.apply(lambda s: s[1:] if s[0] == '_' else s)
uniq_df = all_df[['artist', 'title', 'joined']].drop_duplicates('joined')
uniq_list = uniq_df.values.tolist()
# link-based mention counts keyed by the joined field and by artist
song_dict = all_df.joined.value_counts().to_dict()
artist_dict = all_df.artist.value_counts().to_dict()
#add in tweet text mentions: a tweet that names both the artist and the
#title in plain text (no link) still counts toward that song and artist
tweet_text = [prep_field(html.unescape(tweet['renderedContent'])) for tweet in n2]
for text in tweet_text:
    for artist, title, key in uniq_list:
        if artist in text and title in text:
            song_dict[key] += 1
            artist_dict[artist] += 1
# rank songs by total mention count, highest first
final_df = pd.DataFrame(list(song_dict.items()))
final_df.columns = ['song', 'ct']
final_df = final_df.sort_values('ct', ascending=False).reset_index(drop=True)
# final_df[final_df.ct >= 12].to_csv('top_BMMC_tracks.csv',index=False)
# if you want to create a playlist or dataviz from this, you should remove duplicates first
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment