Last active
February 23, 2024 00:19
-
-
Save k5cents/f52a38286dfea494996db8b7f5338a4c to your computer and use it in GitHub Desktop.
Create some plots from Spotify streaming history data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(jsonlite) | |
library(scales) | |
library(fs) | |
library(k5) | |
# functions ---------------------------------------------------------------
#' Most frequent values of a vector
#'
#' Tabulates `x` with `table()` (which ignores `NA` by default) and returns
#' the values with the highest counts, most frequent first.
#'
#' @param x A vector to tabulate.
#' @param n Maximum number of values to return.
#' @return A character vector of at most `n` values. Shorter than `n` when
#'   `x` has fewer distinct values, and empty when `n` is zero.
most_common <- function(x, n = 6) {
  ranked <- names(sort(table(x), decreasing = TRUE))
  # seq_len() rather than 1:n: with n = 0, 1:n is c(1, 0) and would still
  # return the top value; bounding by length() avoids NA padding as well.
  keep <- seq_len(max(0, min(n, length(ranked))))
  as.character(ranked[keep])
}
#' Test whether numbers are even
#'
#' @param x A numeric vector.
#' @return A logical vector the same length as `x`, `TRUE` where the
#'   remainder of dividing by two is zero.
is_even <- function(x) {
  remainder <- x %% 2
  remainder == 0
}
# read and format ---------------------------------------------------------

# Read every file whose path matches "endsong_<digit>" and row-bind the
# parsed JSON into a single tibble.
x <- dir_ls(
  path = "~/Downloads/my_spotify_data/MyData/",
  regexp = "endsong_\\d"
) %>%
  map_df(.f = read_json, simplifyDataFrame = TRUE) %>%
  as_tibble()

x <- x %>%
  # parse the timestamp column as date-times in the "EST" zone
  mutate(ts = as_datetime(ts, tz = "EST")) %>%
  # keep rows without an episode name (i.e. not podcasts) that do have a
  # track name, then drop the episode columns altogether
  filter(
    is.na(episode_name),
    !is.na(master_metadata_track_name)
  ) %>%
  select(-contains("episode")) %>%
  mutate(
    # milliseconds played -> minutes played
    min = ms_played / 6e4,
    # reduce the free-form platform string to one short lower-case word
    platform = platform %>%
      str_to_lower() %>%
      str_remove("partner windows_tv microsoft;") %>%
      str_remove("_") %>%
      # cut everything from the first punctuation character onwards
      str_remove("[:punct:].*$") %>%
      str_replace("os x", "osx") %>%
      word(1) %>%
      str_replace("xboxone", "xbox") %>%
      str_replace("xboxseries", "xbox") %>%
      str_replace("partner", "cast")
  ) %>%
  # keep and rename only the columns used by the plots below
  select(
    ts,
    min,
    platform,
    track = master_metadata_track_name,
    album = master_metadata_album_album_name,
    artist = master_metadata_album_artist_name,
    reason_start,
    reason_end
  )
# artist bar --------------------------------------------------------------

# Horizontal bar chart of the 20 artists with the most hours played.
x %>%
  group_by(artist) %>%
  summarise(hours = sum(min) / 60) %>%
  arrange(desc(hours)) %>%
  slice_head(n = 20) %>%
  ggplot(mapping = aes(x = reorder(artist, hours), y = hours)) +
  geom_col(mapping = aes(fill = hours), color = "black") +
  scale_fill_viridis_c(option = "B", end = 0.8, guide = "none") +
  coord_flip() +
  labs(x = "Artist", y = "Hours") +
  theme_classic()

# Export the 100 artists with the most plays, with their total minutes.
x %>%
  group_by(artist) %>%
  summarise(min = sum(min), n = n()) %>%
  arrange(desc(n)) %>%
  slice_head(n = 100) %>%
  write_csv(file = "~/Documents/spotify_artists.csv", na = "")
# artist month ------------------------------------------------------------

# Stacked monthly bars of the three most-played artists of each month, for
# the year before the most recent year present in the data.
x %>%
  # Choose the year on the ungrouped data. Inside group_by(month) the
  # max(year(ts)) would be evaluated per month, so months could resolve to
  # different years and the top-N ranking would span every year.
  filter(year(ts) == max(year(ts)) - 1) %>%
  group_by(month = month(ts)) %>%
  # select top N per month (ranked by number of plays)
  filter(artist %in% most_common(artist, 3)) %>%
  # add Y for labels and bars
  group_by(month, artist) %>%
  summarise(
    total_min = sum(min),
    .groups = "drop_last"
  ) %>%
  # still grouped by month here, so cumsum() runs within each month
  mutate(
    lbl_y = cumsum(total_min),
    # leave bars under 100 minutes unlabelled
    lbl_abb = if_else(
      total_min < 100, "", abbreviate(artist)
    )
  ) %>%
  ggplot(
    mapping = aes(x = month, y = total_min)
  ) +
  geom_col(
    mapping = aes(fill = str_trunc(artist, width = 25)),
    color = "black",
    position = position_stack(reverse = TRUE)
  ) +
  geom_text(
    mapping = aes(y = lbl_y, label = lbl_abb),
    vjust = 1.5,
    color = "black"
  ) +
  scale_x_continuous(
    breaks = 1:12,
    labels = month.abb
  ) +
  scale_y_continuous(
    labels = comma
  ) +
  labs(
    title = "Most listened to artists each month",
    fill = "Artist",
    x = "Month",
    y = "Minutes"
  ) +
  theme_classic()
# album bar ---------------------------------------------------------------

# Horizontal bar chart of the 20 albums with the most hours played.
x %>%
  filter(album %out% c("")) %>%
  group_by(album, artist) %>%
  summarise(hours = sum(min) / 60, .groups = "drop") %>%
  arrange(desc(hours)) %>%
  head(20) %>%
  mutate(
    # bar label: "album | artist", each part truncated to 30 characters
    lbl = paste(
      str_trunc(album, 30, side = "right"),
      str_trunc(artist, 30, side = "right"),
      sep = " | "
    )
  ) %>%
  ggplot(aes(x = reorder(lbl, hours), y = hours)) +
  geom_col(color = "black", aes(fill = hours)) +
  scale_fill_viridis_c(option = "B", guide = "none", end = 0.8) +
  coord_flip() +
  # the bars are albums, so the axis is labelled to match `lbl`
  labs(x = "Album | Artist", y = "Hours") +
  theme_classic()

# Export the 100 albums with the most plays, with their total minutes.
x %>%
  mutate(
    # strip a trailing " (...)" suffix and title-case so variants collapse
    album = str_to_title(str_remove(album, "\\s\\(.*\\)$"))
  ) %>%
  group_by(album, artist) %>%
  summarise(min = sum(min), n = n(), .groups = "drop") %>%
  arrange(desc(n)) %>%
  head(100) %>%
  write_csv("~/Documents/spotify_albums.csv", na = "")
# song bar ----------------------------------------------------------------

# Horizontal bar chart of the 20 tracks with the most hours played.
x %>%
  filter(track %out% c("")) %>%
  group_by(track, artist) %>%
  summarise(hours = sum(min) / 60, .groups = "drop") %>%
  arrange(desc(hours)) %>%
  head(20) %>%
  mutate(
    # bar label: "track | artist", each part truncated to 30 characters
    lbl = paste(
      str_trunc(track, 30, side = "right"),
      str_trunc(artist, 30, side = "right"),
      sep = " | "
    )
  ) %>%
  ggplot(aes(x = reorder(lbl, hours), y = hours)) +
  geom_col(color = "black", aes(fill = hours)) +
  scale_fill_viridis_c(option = "B", guide = "none", end = 0.8) +
  coord_flip() +
  # the bars are tracks, so the axis is labelled to match `lbl`
  labs(x = "Track | Artist", y = "Hours") +
  theme_classic()

# Export the 100 tracks with the most plays, with their total minutes.
x %>%
  mutate(
    # strip a trailing " (...)" suffix from album names, then title-case
    album = str_to_title(str_remove(album, "\\s\\(.*\\)$")),
    track = str_to_title(track)
  ) %>%
  group_by(track, album, artist) %>%
  summarise(min = sum(min), n = n(), .groups = "drop") %>%
  arrange(desc(n)) %>%
  head(100) %>%
  write_csv("~/Documents/spotify_songs.csv", na = "")
# platform ----------------------------------------------------------------

# Hours played per year, stacked by platform.
x %>%
  group_by(platform, yr = year(ts)) %>%
  summarise(hours = sum(min) / 60, .groups = "drop") %>%
  ggplot(aes(x = yr, y = hours)) +
  geom_col(color = "black", aes(fill = platform)) +
  labs(x = "Year", y = "Hours") +
  theme_classic()

# Each platform's share of the hours played in a year.
x %>%
  group_by(yr = year(ts), platform) %>%
  # keep the yr grouping so sum(hours) below is the per-year total
  summarise(hours = sum(min) / 60, .groups = "drop_last") %>%
  mutate(p = hours / sum(hours)) %>%
  ggplot(aes(x = yr, y = p)) +
  geom_col(color = "black", aes(fill = platform)) +
  # y is a proportion here, not hours: label and format it as such
  scale_y_continuous(labels = label_percent()) +
  labs(x = "Year", y = "Share of hours") +
  theme_classic()
# time polar --------------------------------------------------------------

# Minutes played per hour of the day, one facet row per year after 2014.
x %>%
  mutate(
    hour = hour(ts),
    year = year(ts)
  ) %>%
  filter(year > 2014) %>%
  group_by(year, hour) %>%
  summarise(total_min = sum(min)) %>%
  # complete(hour = 0:23) %>%
  ggplot(mapping = aes(x = hour, y = total_min)) +
  geom_col(
    color = "black",
    mapping = aes(fill = total_min)
  ) +
  scale_fill_viridis_c(
    option = "B",
    end = 0.90,
    guide = "none"
  ) +
  scale_x_continuous(
    breaks = 0:23,
    minor_breaks = NULL
    # optional 12-hour clock labels:
    # labels = function(x) {
    #   format(as.POSIXct(as.character(x), format = "%H"), format = "%I %p")
    # }
  ) +
  scale_y_continuous(
    n.breaks = 10,
    labels = label_comma()
  ) +
  # coord_polar(start = 0) +
  labs(
    title = "Listening time by hour of the day",
    x = "Hour",
    y = "Minutes"
  ) +
  theme_classic() +
  # hide the y tick labels
  theme(axis.text.y = element_blank()) +
  facet_wrap(~year, ncol = 1)
# cumulative time ---------------------------------------------------------

# Weekly and running-total minutes for the five most-played artists overall,
# restricted to the year before the most recent year in the data.
y <- x %>%
  filter(
    year(ts) == max(year(ts)) - 1,
    artist %in% most_common(artist, 5)
  ) %>%
  group_by(artist, wk = week(ts)) %>%
  summarise(wk_min = sum(min)) %>%
  # summarise() leaves the data grouped by artist, so the running total
  # restarts for each artist
  mutate(
    cum_min = cumsum(wk_min),
    lbl_abb = abbreviate(artist)
  )

# Step lines of the running totals, with each artist's name at the right.
ggplot(
  data = y,
  mapping = aes(x = wk, y = cum_min)
) +
  geom_step(
    mapping = aes(color = artist)
  ) +
  geom_text(
    # y is still grouped by artist: one row per artist, at its maximum
    data = filter(y, cum_min == max(cum_min)),
    mapping = aes(
      x = wk,
      label = artist,
      color = artist
    ),
    # constant x, set outside aes(), puts every label at the last week
    x = max(y$wk),
    nudge_y = max(y$cum_min) * 0.02
  ) +
  scale_y_continuous(
    labels = comma
  ) +
  scale_color_discrete(
    guide = "none"
  ) +
  labs(
    title = "Cumulative listening time over year",
    color = "Artist",
    x = "Week",
    y = "Minutes"
  ) +
  theme_classic()
# calendar ----------------------------------------------------------------

# Calendar heat map: one tile per day (week on x, weekday on y), one facet
# row per year, filled by total minutes played that day.
x %>%
  group_by(date = as_date(ts)) %>%
  summarise(min_total = sum(min)) %>%
  mutate(
    year = year(date),
    month = month(date, label = TRUE, abbr = TRUE),
    # reorder the weekday factor so the week starts on Monday
    wday = fct_relevel(
      .f = wday(date, label = TRUE, abbr = TRUE),
      c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    ),
    day = day(date),
    # "%W": week of the year as a two-digit string, weeks starting Monday
    wk = format(date, "%W"),
    ### cap outlier
    min_total = pmin(min_total, 500)
  ) %>%
  filter(year > 2014, year < 2023) %>%
  ggplot(
    mapping = aes(x = wk, y = wday)
  ) +
  geom_tile(
    mapping = aes(fill = min_total),
    color = "black",
    linewidth = 0.5
  ) +
  coord_equal() +
  scale_fill_viridis_c(
    end = 0.95,
    option = "B",
    na.value = "black"
  ) +
  labs(
    title = "Listened time each day of the year",
    x = "Week",
    y = "Weekday",
    fill = "Minutes"
  ) +
  scale_x_discrete(
    # Label odd weeks only. The labels are derived from the week levels
    # actually present, so they stay aligned when a week number (e.g. "00"
    # or "53") does not occur in the data; a fixed 54-element vector would
    # be matched by position.
    labels = function(wk) {
      wk_num <- as.integer(wk)
      if_else(is_even(wk_num), "", as.character(wk_num))
    }
  ) +
  theme_classic() +
  theme(panel.background = element_rect(fill = "black")) +
  facet_wrap(~year, ncol = 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment