Skip to content

Instantly share code, notes, and snippets.

@erictleung
Last active December 10, 2024 15:19
Show Gist options
  • Save erictleung/4431fa8976d9a1847147307f8fa28e93 to your computer and use it in GitHub Desktop.
Save erictleung/4431fa8976d9a1847147307f8fa28e93 to your computer and use it in GitHub Desktop.
Plot critics and audience ratings from Rotten Tomatoes
library(tidyverse)
library(rvest)
library(ggtext)
library(extrafont)
rt <- read_html("https://editorial.rottentomatoes.com/guide/best-christmas-movies/")
film_regex <- regex(
"^([A-Za-z0-9.' ,-:&]* \\([0-9]{4,4}\\))\\n\\s+([0-9]{0,4})\\%\\n\\s+([0-9]{0,4})\\%"
)
# Wrangle data
rt_tbl <-
rt %>%
html_elements("h2") %>%
html_text() %>%
as_tibble() %>%
mutate(value = trimws(value)) %>%
mutate(
film = str_extract(value, film_regex, group = 1),
critics_review = str_extract(value, film_regex, group = 2),
audience_review = str_extract(value, film_regex, group = 3)
) %>%
drop_na() %>%
select(-value)
# Make into format that makes it easier to plot
rt_tbl_long <-
rt_tbl %>%
mutate(critics_review = as.numeric(critics_review),
audience_review = as.numeric(audience_review)) %>%
mutate(film = fct_reorder(film, critics_review)) %>% # Order by critics
pivot_longer(cols = c(critics_review, audience_review),
names_to = "review")
# Split out data
critics <- rt_tbl_long %>%
filter(review == "critics_review")
audience <- rt_tbl_long %>%
filter(review == "audience_review")
# Create plots
p <-
rt_tbl_long %>%
ggplot() +
geom_segment(
data = critics,
aes(
x = value,
y = film,
yend = audience$film,
xend = audience$value
),
color = "#aeb6bf",
size = 1.5,
#Note that I sized the segment to fit the points
alpha = .5
) +
geom_point(aes(x = value, y = film, color = review),
size = 4,
show.legend = TRUE) +
ggtitle("Discrepancies between Tomatometer and Popcornmeter reviews on Rotten Tomatoes")
p
# Order data based on biggest difference
rt_tbl_long_diff <-
rt_tbl %>%
mutate(critics_review = as.numeric(critics_review),
audience_review = as.numeric(audience_review)) %>%
mutate(review_abs_diff = abs(critics_review - audience_review),
review_raw_diff = critics_review - audience_review) %>%
mutate(film = fct_reorder(film, review_abs_diff)) %>% # Order by biggest diff
pivot_longer(cols = c(critics_review, audience_review),
names_to = "review")
# Split out data
critics_diff <- rt_tbl_long_diff %>%
filter(review == "critics_review")
audience_diff <- rt_tbl_long_diff %>%
filter(review == "audience_review")
# Create plots
p <-
rt_tbl_long_diff %>%
ggplot() +
geom_segment(
data = critics_diff,
aes(
x = value,
y = film,
yend = audience_diff$film,
xend = audience_diff$value
),
color = "#aeb6bf",
size = 1.5,
#Note that I sized the segment to fit the points
alpha = .5
) +
geom_point(aes(x = value, y = film, color = review),
size = 4,
show.legend = TRUE) +
ggtitle("Discrepancies between Tomatometer and Popcornmeter reviews on Rotten Tomatoes",
"Ordered by absolute value difference of critics review minus audience review")
p
# Order data based on biggest raw difference
rt_tbl_long_diff <-
rt_tbl %>%
mutate(critics_review = as.numeric(critics_review),
audience_review = as.numeric(audience_review)) %>%
mutate(review_abs_diff = abs(critics_review - audience_review),
review_raw_diff = critics_review - audience_review) %>%
mutate(film = fct_reorder(film, review_raw_diff)) %>% # Order by biggest diff
pivot_longer(cols = c(critics_review, audience_review),
names_to = "review") %>%
mutate(review = if_else(review == "critics_review", "Tomatometer", "Popcornmeter"))
# Split out data
critics_diff <- rt_tbl_long_diff %>%
filter(review == "Tomatometer")
audience_diff <- rt_tbl_long_diff %>%
filter(review == "Popcornmeter")
# Create plots
p <-
rt_tbl_long_diff %>%
ggplot() +
geom_segment(
data = critics_diff,
aes(
x = value,
y = film,
yend = audience_diff$film,
xend = audience_diff$value
),
color = "#aeb6bf",
size = 1.5,
#Note that I sized the segment to fit the points
alpha = .5
) +
geom_point(aes(x = value, y = film, color = review),
size = 4,
show.legend = TRUE) +
scale_color_manual(values = c("#FFD600", "#F93001")) +
ggtitle(
"Discrepancies between Tomatometer and Popcornmeter reviews on Rotten Tomatoes",
"Ordered by raw difference of critics review minus audience review"
) +
labs(x = "Rating", y = "Film", color = "Review type") +
scale_x_continuous(limits = c(0, 100),
label = scales::label_percent(scale = 1)) +
theme_minimal() +
theme(legend.position = "top", )
p +
annotate(
"rect",
xmin = 10,
xmax = 15,
ymin = 0.5,
ymax = 23.5,
alpha = .2
) +
annotate(
"rect",
xmin = 50,
xmax = 55,
ymin = 23.5,
ymax = 42.5,
alpha = .2
) +
annotate(
"rect",
xmin = 10,
xmax = 15,
ymin = 42.5,
ymax = 100.5,
alpha = .2
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment