Skip to content

Instantly share code, notes, and snippets.

@benjaminrobinson
Created April 2, 2023 12:22
Show Gist options
  • Save benjaminrobinson/dab9cff80c233b9b5d287435dbd6b988 to your computer and use it in GitHub Desktop.
Save benjaminrobinson/dab9cff80c233b9b5d287435dbd6b988 to your computer and use it in GitHub Desktop.
Springsteen Setlist Similarity
library(dplyr)
library(rvest)
library(stringr)
library(purrr)
library(tidyr)
library(janitor)
library(RecordLinkage)
library(ggplot2)
library(scales)
library(ggthemes)
library(magick)
## ENTER LOCAL DIRECTORY WITHOUT QUOTE MARKS ##
# Output directory for the downloaded artwork and the rendered chart. Every
# later path is built as paste0(dir, <file name>), so a non-empty entry must
# end in a path separator; append one when the user left it off. An empty
# entry is kept as-is and resolves to the current working directory.
dir <- readline()
if (nzchar(dir) && !grepl("[/\\\\]$", dir)) {
  dir <- paste0(dir, "/")
}
# Tour artwork that is later overlaid on the chart. It is stored as "tour.PNG"
# so the name matches, case included, the logo path used when the overlay is
# applied further down (case matters on case-sensitive file systems).
download.file(
  url = 'https://blog.ticketmaster.com/wp-content/uploads/BruceSpringsteen_Blog.jpg',
  destfile = paste0(dir, "tour.PNG"),
  mode = 'wb'
)
# Overlay a logo image in one corner of a saved plot image.
#
# Arguments:
#   plot_path      Path of the plot image to read; overwritten when `save`
#                  is TRUE.
#   logo_path      Path of the logo image to place on the plot.
#   logo_position  One of "top left", "top right", "bottom left",
#                  "bottom right".
#   logo_scale     The logo is resized to (plot width / logo_scale) pixels
#                  wide; the default of 5 gives one fifth of the plot width.
#   save           TRUE re-renders the composite to `plot_path` with ggsave()
#                  (8 x 8 in, retina dpi); otherwise the composite is drawn
#                  on the active graphics device.
#
# Stops with an error when `logo_position` is not one of the four corners.
add_logo <- function(plot_path,
                     logo_path,
                     logo_position = 'top left',
                     logo_scale = 5,
                     save = FALSE) {
  valid_positions <- c("top right", "top left", "bottom right", "bottom left")
  if (length(logo_position) != 1 || !logo_position %in% valid_positions) {
    stop(
      "Error Message: Uh oh! Logo Position not recognized\n Try: logo_position = 'top left', 'top right', 'bottom left', or 'bottom right'"
    )
  }

  # Read in the raw images
  base_img <- image_read(plot_path)
  logo_raw <- image_read(logo_path)

  # Plot dimensions drive both the logo size and its placement
  base_info <- image_info(base_img)
  plot_height <- base_info$height
  plot_width <- base_info$width

  # Scale the logo relative to the plot width (adjust with logo_scale)
  logo <- image_scale(logo_raw, as.character(plot_width / logo_scale))
  logo_info <- image_info(logo)
  logo_width <- logo_info$width
  logo_height <- logo_info$height

  # Offsets are measured from (0, 0) at the top left of the plot.
  # 1% of each dimension is kept as aesthetic padding from the edges.
  pad_x <- 0.01 * plot_width
  pad_y <- 0.01 * plot_height
  x_pos <- if (grepl("right", logo_position, fixed = TRUE)) {
    plot_width - logo_width - pad_x
  } else {
    pad_x
  }
  y_pos <- if (grepl("bottom", logo_position, fixed = TRUE)) {
    plot_height - logo_height - pad_y
  } else {
    pad_y
  }

  # Compose the actual overlay
  composite <- image_composite(
    base_img,
    logo,
    offset = paste0("+", x_pos, "+", y_pos)
  )

  if (isTRUE(as.logical(save))) {
    # Hand the composite to ggsave() explicitly instead of relying on
    # whichever plot ggplot2 happens to remember as the last one.
    ggsave(
      plot_path,
      plot = image_ggplot(composite),
      height = 8,
      width = 8,
      units = 'in',
      dpi = 'retina'
    )
  } else {
    plot(composite)
  }
}
# Collect the URL of every 2023 setlist linked from the first three pages of
# the artist's setlist.fm listing.
boss_set <- map(1:3, function(page) {
  paste0('https://www.setlist.fm/setlists/bruce-springsteen-2bd6dcce.html?page=',
         page) %>%
    read_html %>%
    html_nodes('a') %>%
    html_attr('href') %>%
    str_squish %>%
    .[grepl('setlist/bruce-springsteen/2023/', .)] %>%
    # Drop only a literal leading "../". The dots are escaped and anchored, so
    # an href without that prefix is left intact and the joined URL does not
    # end up with a doubled slash.
    sub("^\\.\\./", "", .) %>%
    paste0('https://www.setlist.fm/', .)
}) %>%
  unlist %>%
  # A setlist linked more than once would otherwise be scraped (and its songs
  # counted) repeatedly.
  unique
# Scrape each setlist page into a long table with one row per song, tagged
# with the show's date, venue headline and reported running time, then give
# every show a numeric id.
boss_songs <- map_dfr(boss_set, function(setlist_url) {
  # Pause before every request, then log which page is being fetched
  Sys.sleep(5)
  print(setlist_url)
  page <- read_html(setlist_url)

  # Squished text of every node matching a CSS selector on this page
  text_at <- function(css) {
    str_squish(html_text(html_nodes(page, css)))
  }

  headline <- text_at('.setlistHeadline')
  timing_notes <- unique(text_at('.hiddenCollapsed'))
  # Keep only the note(s) mentioning "End"; the duration is whatever follows
  # the AM/PM marker, minus the trailing " long".
  # NOTE(review): presumably of the form "<H>h <M>m" - confirm against the site.
  end_note <- timing_notes[grepl("End", timing_notes)]

  per_song <- tibble(
    date = as.Date(text_at('.dateBlock'), format = "%b %d %Y"),
    location = sub(
      "Bruce Springsteen Setlist at ",
      "",
      sub(" Edit.*", "", headline)
    ),
    songs = text_at('.songPart'),
    set_length = sub(" long", "", sub(".*[A-Z][M]", "", end_note))
  )

  mutate(
    per_song,
    song_number = row_number(),
    set_hrs = as.numeric(sub("[h].*", "", set_length)),
    set_mins = as.numeric(sub("m", "", sub(".*[h] ", "", set_length)))
  )
}) %>%
  group_by(date, location, set_length, set_hrs, set_mins) %>%
  mutate(show_id = cur_group_id()) %>%
  ungroup %>%
  # Same column order as before, spelled out by name
  select(
    show_id,
    date,
    location,
    set_length,
    set_hrs,
    set_mins,
    song_number,
    songs
  )
# One row per show: a "Venue (City, State)" label plus that show's songs,
# sorted alphabetically and joined into a single "| "-separated string.
boss_sum <- boss_songs %>%
  mutate(
    # Drop the country suffix, then turn the first ", " into " (" and close
    # the bracket at the end of the label.
    show = paste0(sub(", ", " (", sub(", USA", "", location)), ")")
  ) %>%
  group_by(show, show_id) %>%
  summarize(songs = paste(sort(songs), collapse = '| ')) %>%
  ungroup %>%
  select(-show_id)
# Register the Tahoma font family for the Windows graphics devices.
# windowsFonts()/windowsFont() only exist in Windows builds of R; calling them
# elsewhere is an error (not a warning), so skip the registration there.
if (.Platform$OS.type == "windows") {
  suppressWarnings(windowsFonts("Tahoma" = windowsFont("Tahoma")))
}
# Chart theme: theme_foundation() at the requested base size and font family,
# with the house overrides below added on top.
#
# Arguments:
#   base_size    Base font size handed to theme_foundation().
#   base_family  Base font family handed to theme_foundation().
#
# Returns the combined theme object.
theme_gtm <- function(base_size = 14,
                      base_family = "Tahoma") {
  foundation <- theme_foundation(
    base_size = base_size,
    base_family = base_family
  )

  # Elements reused several times in the overrides
  no_outline <- element_rect(colour = NA)
  black_line <- element_line(colour = "black")
  centred <- element_text(hjust = 0.5)
  bold <- element_text(face = "bold")
  light_grey <- "#f0f0f0"

  overrides <- theme(
    plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
    text = element_text(),
    panel.background = no_outline,
    plot.background = no_outline,
    panel.border = no_outline,
    axis.title = element_text(face = "bold", size = rel(1)),
    axis.title.y = element_text(angle = 90, vjust = 2),
    axis.title.x = element_text(vjust = -0.2),
    axis.text = element_text(),
    axis.line.x = black_line,
    axis.line.y.left = black_line,
    axis.line.y.right = element_blank(),
    axis.ticks = element_line(),
    panel.grid.major = element_line(colour = light_grey),
    panel.grid.minor = element_blank(),
    legend.key = no_outline,
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.key.size = unit(0.2, "cm"),
    legend.spacing = unit(0, "cm"),
    legend.title = bold,
    plot.margin = unit(c(15, 5, 5, 5), "mm"),
    plot.subtitle = centred,
    plot.caption = centred,
    strip.background = element_rect(colour = light_grey, fill = light_grey),
    strip.text = bold
  )

  foundation + overrides
}
# Pairwise setlist similarity, built directly in long form: one row per
# (show, comp) pair holding the Jaro-Winkler score of the two shows'
# concatenated song strings. Indexing the pairs avoids an intermediate wide
# table whose column names would have to be unique show labels.
show_pairs <- expand.grid(
  row = seq_len(nrow(boss_sum)),
  col = seq_len(nrow(boss_sum))
)
similarity_long <- tibble(
  show = boss_sum$show[show_pairs$row],
  comp = boss_sum$show[show_pairs$col],
  similarity = jarowinkler(
    boss_sum$songs[show_pairs$row],
    boss_sum$songs[show_pairs$col]
  )
)

# Heatmap of the scores; the diagonal (a show compared with itself) is bold.
# The outer parentheses keep the chart visible when this statement is run at
# top level, exactly as the unassigned expression was.
(similarity_plot <- ggplot(
  similarity_long,
  aes(
    x = show,
    y = comp,
    fill = similarity,
    label = percent(similarity, accuracy = .1),
    fontface = ifelse(show == comp, "bold", "plain")
  )
) +
  geom_tile(color = 'black') +
  geom_text(size = 3) +
  scale_fill_gradient(low = '#FFFFFF',
                      high = '#f44c3c',
                      labels = percent) +
  scale_x_discrete(position = "top") +
  theme_foundation(base_size = 14, base_family = 'Tahoma') +
  theme(
    axis.text.x = element_text(angle = 45, hjust = -.05),
    legend.position = 'right',
    legend.direction = 'vertical',
    plot.title = element_text(
      face = "bold",
      size = rel(1.2),
      hjust = 0.5
    ),
    strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
    strip.text = element_text(face = "bold"),
    panel.background = element_rect(colour = NA),
    plot.background = element_rect(colour = NA),
    panel.border = element_rect(colour = NA),
    axis.title = element_text(face = "bold", size = rel(1)),
    axis.title.y = element_text(angle = 90, vjust = 2),
    axis.title.x = element_text(vjust = -0.2),
    axis.text = element_text(),
    axis.line.x = element_line(colour = "black"),
    axis.line.y.left = element_line(colour = "black"),
    axis.line.y.right = element_blank(),
    axis.ticks = element_line(),
    panel.grid.major = element_line(colour = "#f0f0f0"),
    panel.grid.minor = element_blank(),
    legend.key = element_rect(colour = NA)
  ) +
  labs(
    x = NULL,
    y = NULL,
    fill = 'Jaro-Winkler Score',
    title = "Bruce Springsteen 2023 Tour Setlist Similarity",
    subtitle = NULL,
    caption = "Chart: Benjamin Robinson (@benj_robinson) | Data: SetList.FM, 2023."
  ))

# Save this chart explicitly rather than whatever plot was created last
ggsave(
  paste0(dir, "bruce.PNG"),
  plot = similarity_plot,
  height = 7.5,
  width = 15,
  units = 'in',
  dpi = 'retina'
)
# Stamp the tour artwork onto the saved chart (add_logo() overwrites the chart
# file when save = TRUE) ...
add_logo(
  plot_path = paste0(dir, "bruce.PNG"),
  logo_path = paste0(dir, "tour.PNG"),
  save = TRUE
)
# ... then trim the border of the re-saved image and write it back in place.
image_write(
  image_trim(image_read(path = paste0(dir, "bruce.PNG"))),
  path = paste0(dir, "bruce.PNG")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment