Skip to content

Instantly share code, notes, and snippets.

@k5cents
Last active February 23, 2024 00:19
Show Gist options
  • Save k5cents/f52a38286dfea494996db8b7f5338a4c to your computer and use it in GitHub Desktop.
Save k5cents/f52a38286dfea494996db8b7f5338a4c to your computer and use it in GitHub Desktop.
Create some plots from Spotify streaming history data
library(tidyverse)
library(jsonlite)
library(scales)
library(fs)
library(k5)
# functions ---------------------------------------------------------------
#' Most frequent values of a vector
#'
#' @param x A vector; values are tabulated with `table()`, which ignores `NA`
#'   by default.
#' @param n Maximum number of values to return (default 6).
#' @return A character vector of at most `n` distinct values of `x`, ordered
#'   from most to least frequent. Shorter than `n` when `x` has fewer distinct
#'   values, and empty when `n` is 0 or `x` has no non-missing values.
most_common <- function(x, n = 6) {
  counts <- sort(table(x), decreasing = TRUE)
  # seq_len() instead of 1:n, so n = 0 selects nothing (1:0 is c(1, 0) and
  # would return the top value); clamping to the number of distinct values
  # avoids indexing past the end and padding the result with NA.
  keep <- seq_len(min(n, length(counts)))
  as.character(names(counts)[keep])
}
#' Test whether numbers are even
#'
#' @param x A numeric vector.
#' @return A logical vector the same length as `x`: `TRUE` where the remainder
#'   of `x` divided by 2 is zero.
is_even <- function(x) {
  remainder <- x %% 2
  remainder == 0
}
# read and format ---------------------------------------------------------
# List every file in the export folder whose path matches "endsong_<digit>",
# parse each one with read_json(), and row-bind the results into one tibble.
x <- "~/Downloads/my_spotify_data/MyData/" %>%
  dir_ls(regexp = "endsong_\\d") %>%
  map_df(read_json, simplifyDataFrame = TRUE) %>%
  as_tibble()
# Parse the timestamp column into date-times.
# NOTE(review): confirm "EST" (rather than a region zone such as
# "America/New_York") is the intended time zone.
x$ts <- as_datetime(x$ts, tz = "EST")
# Keep music plays only: drop rows that carry a podcast episode name and rows
# that have no track name. Then express play time in minutes and keep just the
# columns used below, under shorter names. The podcast ("episode") columns are
# not in that list, so they are dropped by the final select().
x <- x %>%
  filter(
    is.na(episode_name),
    !is.na(master_metadata_track_name)
  ) %>%
  mutate(min = ms_played / 6e4) %>%
  select(
    ts, min, platform,
    track = master_metadata_track_name,
    album = master_metadata_album_album_name,
    artist = master_metadata_album_artist_name,
    reason_start, reason_end
  )
# Collapse the free-text platform strings into a short device label.
# Each step overwrites `platform` with the result of the previous one, so the
# order below matters.
x <- x %>%
  mutate(
    platform = str_to_lower(platform),
    platform = str_remove(platform, "partner windows_tv microsoft;"),
    # drop the first underscore, then everything from the first punctuation
    # character to the end of the string
    platform = str_remove(platform, "_"),
    platform = str_remove(platform, "[:punct:].*$"),
    # join "os x" so that it survives taking the first word
    platform = str_replace(platform, "os x", "osx"),
    platform = word(platform, 1),
    # merge the Xbox variants and rename "partner" to "cast"
    platform = str_replace(platform, "xboxone", "xbox"),
    platform = str_replace(platform, "xboxseries", "xbox"),
    platform = str_replace(platform, "partner", "cast")
  )
# artist bar --------------------------------------------------------------
# Horizontal bar chart of the 20 artists with the most listening hours.
x %>%
  group_by(artist) %>%
  summarise(hours = sum(min) / 60) %>%
  arrange(desc(hours)) %>%
  head(20) %>%
  ggplot(
    mapping = aes(
      x = reorder(artist, hours), # order bars by hours
      y = hours,
      fill = hours
    )
  ) +
  geom_col(color = "black") +
  scale_fill_viridis_c(option = "B", guide = "none", end = 0.8) +
  coord_flip() +
  labs(x = "Artist", y = "Hours") +
  theme_classic()
# Write the 100 most-played artists (by number of plays) to a CSV file, with
# total minutes and play count per artist.
x %>%
  group_by(artist) %>%
  summarise(
    min = sum(min),
    n = n()
  ) %>%
  arrange(desc(n)) %>%
  head(100) %>%
  write_csv(file = "~/Documents/spotify_artists.csv", na = "")
# artist month ------------------------------------------------------------
# Stacked bars of the three most-played artists in each month of the year
# before the most recent year in the data.
x %>%
  # Restrict to the target year BEFORE grouping. Inside group_by(month),
  # max(year(ts)) is evaluated per month, so months could resolve to different
  # years, and the top artists would be ranked over every year of history
  # instead of the year being plotted.
  filter(year(ts) == max(year(ts)) - 1) %>%
  group_by(month = month(ts)) %>%
  # select top N per month (ranked by number of plays within that month)
  filter(artist %in% most_common(artist, 3)) %>%
  # add Y for labels and bars
  group_by(month, artist) %>%
  summarise(
    total_min = sum(min),
    .groups = "drop_last" # stay grouped by month for the cumsum() below
  ) %>%
  mutate(
    # top edge of each stacked segment within the month
    lbl_y = cumsum(total_min),
    # leave segments too small to hold a label unlabelled
    lbl_abb = if_else(
      total_min < 100, "", abbreviate(artist)
    )
  ) %>%
  ggplot(
    mapping = aes(x = month, y = total_min)
  ) +
  geom_col(
    mapping = aes(fill = str_trunc(artist, width = 25)),
    color = "black",
    position = position_stack(reverse = TRUE)
  ) +
  geom_text(
    mapping = aes(y = lbl_y, label = lbl_abb),
    vjust = 1.5,
    color = "black"
  ) +
  scale_x_continuous(
    breaks = 1:12,
    labels = month.abb
  ) +
  scale_y_continuous(
    labels = comma
  ) +
  labs(
    title = "Most listened to artists each month",
    fill = "Artist",
    x = "Month",
    y = "Minutes"
  ) +
  theme_classic()
# album bar ---------------------------------------------------------------
# Horizontal bar chart of the 20 albums with the most listening hours.
x %>%
  # ignore plays with an empty album name
  filter(album %out% c("")) %>%
  group_by(album, artist) %>%
  # drop the grouping so the following steps work on the whole table
  summarise(hours = sum(min) / 60, .groups = "drop") %>%
  arrange(desc(hours)) %>%
  head(20) %>%
  mutate(
    # bar label: "album | artist", each truncated to 30 characters
    lbl = paste(
      str_trunc(album, 30, side = "right"),
      str_trunc(artist, 30, side = "right"),
      sep = " | "
    )
  ) %>%
  ggplot(aes(x = reorder(lbl, hours), y = hours)) +
  geom_col(color = "black", aes(fill = hours)) +
  scale_fill_viridis_c(option = "B", guide = "none", end = 0.8) +
  coord_flip() +
  # the axis shows "album | artist" labels, not artists alone
  labs(x = "Album | Artist", y = "Hours") +
  theme_classic()
# Write the 100 most-played albums (by number of plays) to a CSV file.
x %>%
  mutate(
    # strip a trailing parenthesised suffix, then title-case the name so that
    # variants of one album are counted together
    album = str_remove(album, "\\s\\(.*\\)$"),
    album = str_to_title(album)
  ) %>%
  group_by(album, artist) %>%
  summarise(
    min = sum(min),
    n = n()
  ) %>%
  arrange(desc(n)) %>%
  head(100) %>%
  write_csv(file = "~/Documents/spotify_albums.csv", na = "")
# song bar ----------------------------------------------------------------
# Horizontal bar chart of the 20 tracks with the most listening hours.
x %>%
  # ignore plays with an empty track name
  filter(track %out% c("")) %>%
  group_by(track, artist) %>%
  # drop the grouping so the following steps work on the whole table
  summarise(hours = sum(min) / 60, .groups = "drop") %>%
  arrange(desc(hours)) %>%
  head(20) %>%
  mutate(
    # bar label: "track | artist", each truncated to 30 characters
    lbl = paste(
      str_trunc(track, 30, side = "right"),
      str_trunc(artist, 30, side = "right"),
      sep = " | "
    )
  ) %>%
  ggplot(aes(x = reorder(lbl, hours), y = hours)) +
  geom_col(color = "black", aes(fill = hours)) +
  scale_fill_viridis_c(option = "B", guide = "none", end = 0.8) +
  coord_flip() +
  # the axis shows "track | artist" labels, not artists alone
  labs(x = "Track | Artist", y = "Hours") +
  theme_classic()
# Write the 100 most-played tracks (by number of plays) to a CSV file.
x %>%
  mutate(
    # strip a trailing parenthesised suffix from the album name, then
    # title-case both album and track names
    album = str_remove(album, "\\s\\(.*\\)$"),
    album = str_to_title(album),
    track = str_to_title(track)
  ) %>%
  group_by(track, album, artist) %>%
  summarise(
    min = sum(min),
    n = n()
  ) %>%
  arrange(desc(n)) %>%
  head(100) %>%
  write_csv(file = "~/Documents/spotify_songs.csv", na = "")
# platform ----------------------------------------------------------------
# Stacked bars of listening hours per year, split by platform.
x %>%
  group_by(platform, yr = year(ts)) %>%
  summarise(hours = sum(min) / 60) %>%
  ggplot(
    mapping = aes(x = yr, y = hours, fill = platform)
  ) +
  geom_col(color = "black") +
  labs(
    x = "Year",
    y = "Hours"
  ) +
  theme_classic()
# Stacked bars of each platform's share of listening time per year.
x %>%
  group_by(yr = year(ts), platform) %>%
  # keep the result grouped by year so the shares below are per year
  summarise(hours = sum(min) / 60, .groups = "drop_last") %>%
  mutate(p = hours / sum(hours)) %>%
  ggplot(aes(x = yr, y = p)) +
  geom_col(color = "black", aes(fill = platform)) +
  # `p` is a proportion of the year's hours, so show it as a percentage
  scale_y_continuous(labels = label_percent()) +
  labs(x = "Year", y = "Share of hours") +
  theme_classic()
# time polar --------------------------------------------------------------
# Bar chart of minutes listened in each hour of the day, one panel per year
# (years after 2014 only).
x %>%
  mutate(hour = hour(ts), year = year(ts)) %>%
  group_by(year, hour) %>%
  summarise(total_min = sum(min)) %>%
  filter(year > 2014) %>%
  # disabled: complete(hour = 0:23) %>%
  ggplot(
    mapping = aes(x = hour, y = total_min, fill = total_min)
  ) +
  geom_col(color = "black") +
  scale_fill_viridis_c(
    option = "B",
    end = 0.90,
    guide = "none"
  ) +
  # disabled 12-hour clock labels:
  # labels = function(x) {
  #   format(as.POSIXct(as.character(x), format = "%H"), format = "%I %p")
  # }
  scale_x_continuous(
    breaks = 0:23,
    minor_breaks = NULL
  ) +
  scale_y_continuous(
    n.breaks = 10,
    labels = label_comma()
  ) +
  # disabled: coord_polar(start = 0) +
  labs(
    title = "Listening time by hour of the day",
    x = "Hour",
    y = "Minutes"
  ) +
  theme_classic() +
  # hide the y-axis tick labels
  theme(axis.text.y = element_blank()) +
  facet_wrap(~year, ncol = 1)
# cumulative time ---------------------------------------------------------
# Weekly and running listening minutes for five artists during the year
# before the most recent year in the data.
# NOTE(review): both filter conditions are evaluated on the full table, so the
# five artists are ranked by plays across every year, not only the plotted
# year; confirm this is intended.
y <- x %>%
  filter(
    artist %in% most_common(artist, 5),
    year(ts) == max(year(ts)) - 1
  ) %>%
  group_by(artist, wk = week(ts)) %>%
  summarise(wk_min = sum(min)) %>%
  # the result is still grouped by artist, so cumsum() runs per artist
  mutate(
    cum_min = cumsum(wk_min),
    lbl_abb = abbreviate(artist)
  )
# Step lines of cumulative minutes, with each artist's name placed at the
# right-hand edge at the height of that artist's final total.
ggplot(y, mapping = aes(x = wk, y = cum_min)) +
  geom_step(mapping = aes(color = artist)) +
  geom_text(
    # `y` is grouped by artist, so this keeps one row per artist
    data = filter(y, cum_min == max(cum_min)),
    mapping = aes(
      label = artist,
      color = artist,
      x = wk
    ),
    x = max(y$wk),
    nudge_y = max(y$cum_min) * 0.02
  ) +
  scale_y_continuous(labels = comma) +
  # the lines are labelled directly, so no legend is needed
  scale_color_discrete(guide = "none") +
  labs(
    title = "Cumulative listening time over year",
    color = "Artist",
    x = "Week",
    y = "Minutes"
  ) +
  theme_classic()
# calendar ----------------------------------------------------------------
# Calendar heat map: one tile per day (week number by weekday), one panel per
# year, filled by minutes listened that day.
x %>%
  group_by(date = as_date(ts)) %>%
  summarise(min_total = sum(min)) %>%
  mutate(
    year = year(date),
    month = month(date, label = TRUE, abbr = TRUE),
    # Monday-first weekday factor, matching the Monday-based "%W" week number.
    # week_start = 1 sets the level order directly, so it does not depend on
    # the weekday labels being the English abbreviations.
    wday = wday(date, label = TRUE, abbr = TRUE, week_start = 1),
    day = day(date),
    wk = format(date, "%W"),
    # cap outliers so a few extreme days do not flatten the colour scale
    min_total = pmin(min_total, 500)
  ) %>%
  filter(year > 2014, year < 2023) %>%
  ggplot(
    mapping = aes(x = wk, y = wday)
  ) +
  geom_tile(
    mapping = aes(fill = min_total),
    color = "black",
    linewidth = 0.5
  ) +
  coord_equal() +
  scale_fill_viridis_c(
    end = 0.95,
    option = "B",
    na.value = "black"
  ) +
  labs(
    title = "Listened time each day of the year",
    x = "Week",
    y = "Weekday",
    fill = "Minutes"
  ) +
  scale_x_discrete(
    # Label only odd week numbers. A function is used instead of a fixed
    # 54-element vector so the labels always line up with the weeks actually
    # present in the data, however many there are.
    labels = function(wk) {
      wk_num <- as.integer(wk)
      if_else(is_even(wk_num), "", as.character(wk_num))
    }
  ) +
  theme_classic() +
  # days with no listening have no tile; show them as black
  theme(panel.background = element_rect(fill = "black")) +
  facet_wrap(~year, ncol = 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment