Last active
December 10, 2024 15:19
-
-
Save erictleung/4431fa8976d9a1847147307f8fa28e93 to your computer and use it in GitHub Desktop.
Plot critics and audience ratings from Rotten Tomatoes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(rvest) | |
library(ggtext) | |
library(extrafont) | |
rt <- read_html("https://editorial.rottentomatoes.com/guide/best-christmas-movies/") | |
film_regex <- regex( | |
"^([A-Za-z0-9.' ,-:&]* \\([0-9]{4,4}\\))\\n\\s+([0-9]{0,4})\\%\\n\\s+([0-9]{0,4})\\%" | |
) | |
# Wrangle data | |
rt_tbl <- | |
rt %>% | |
html_elements("h2") %>% | |
html_text() %>% | |
as_tibble() %>% | |
mutate(value = trimws(value)) %>% | |
mutate( | |
film = str_extract(value, film_regex, group = 1), | |
critics_review = str_extract(value, film_regex, group = 2), | |
audience_review = str_extract(value, film_regex, group = 3) | |
) %>% | |
drop_na() %>% | |
select(-value) | |
# Make into format that makes it easier to plot | |
rt_tbl_long <- | |
rt_tbl %>% | |
mutate(critics_review = as.numeric(critics_review), | |
audience_review = as.numeric(audience_review)) %>% | |
mutate(film = fct_reorder(film, critics_review)) %>% # Order by critics | |
pivot_longer(cols = c(critics_review, audience_review), | |
names_to = "review") | |
# Split out data | |
critics <- rt_tbl_long %>% | |
filter(review == "critics_review") | |
audience <- rt_tbl_long %>% | |
filter(review == "audience_review") | |
# Create plots | |
p <- | |
rt_tbl_long %>% | |
ggplot() + | |
geom_segment( | |
data = critics, | |
aes( | |
x = value, | |
y = film, | |
yend = audience$film, | |
xend = audience$value | |
), | |
color = "#aeb6bf", | |
size = 1.5, | |
#Note that I sized the segment to fit the points | |
alpha = .5 | |
) + | |
geom_point(aes(x = value, y = film, color = review), | |
size = 4, | |
show.legend = TRUE) + | |
ggtitle("Discrepancies between Tomatometer and Popcornmeter reviews on Rotten Tomatoes") | |
p | |
# Order data based on biggest difference | |
rt_tbl_long_diff <- | |
rt_tbl %>% | |
mutate(critics_review = as.numeric(critics_review), | |
audience_review = as.numeric(audience_review)) %>% | |
mutate(review_abs_diff = abs(critics_review - audience_review), | |
review_raw_diff = critics_review - audience_review) %>% | |
mutate(film = fct_reorder(film, review_abs_diff)) %>% # Order by biggest diff | |
pivot_longer(cols = c(critics_review, audience_review), | |
names_to = "review") | |
# Split out data | |
critics_diff <- rt_tbl_long_diff %>% | |
filter(review == "critics_review") | |
audience_diff <- rt_tbl_long_diff %>% | |
filter(review == "audience_review") | |
# Create plots | |
p <- | |
rt_tbl_long_diff %>% | |
ggplot() + | |
geom_segment( | |
data = critics_diff, | |
aes( | |
x = value, | |
y = film, | |
yend = audience_diff$film, | |
xend = audience_diff$value | |
), | |
color = "#aeb6bf", | |
size = 1.5, | |
#Note that I sized the segment to fit the points | |
alpha = .5 | |
) + | |
geom_point(aes(x = value, y = film, color = review), | |
size = 4, | |
show.legend = TRUE) + | |
ggtitle("Discrepancies between Tomatometer and Popcornmeter reviews on Rotten Tomatoes", | |
"Ordered by absolute value difference of critics review minus audience review") | |
p | |
# Order data based on biggest raw difference | |
rt_tbl_long_diff <- | |
rt_tbl %>% | |
mutate(critics_review = as.numeric(critics_review), | |
audience_review = as.numeric(audience_review)) %>% | |
mutate(review_abs_diff = abs(critics_review - audience_review), | |
review_raw_diff = critics_review - audience_review) %>% | |
mutate(film = fct_reorder(film, review_raw_diff)) %>% # Order by biggest diff | |
pivot_longer(cols = c(critics_review, audience_review), | |
names_to = "review") %>% | |
mutate(review = if_else(review == "critics_review", "Tomatometer", "Popcornmeter")) | |
# Split out data | |
critics_diff <- rt_tbl_long_diff %>% | |
filter(review == "Tomatometer") | |
audience_diff <- rt_tbl_long_diff %>% | |
filter(review == "Popcornmeter") | |
# Create plots | |
p <- | |
rt_tbl_long_diff %>% | |
ggplot() + | |
geom_segment( | |
data = critics_diff, | |
aes( | |
x = value, | |
y = film, | |
yend = audience_diff$film, | |
xend = audience_diff$value | |
), | |
color = "#aeb6bf", | |
size = 1.5, | |
#Note that I sized the segment to fit the points | |
alpha = .5 | |
) + | |
geom_point(aes(x = value, y = film, color = review), | |
size = 4, | |
show.legend = TRUE) + | |
scale_color_manual(values = c("#FFD600", "#F93001")) + | |
ggtitle( | |
"Discrepancies between Tomatometer and Popcornmeter reviews on Rotten Tomatoes", | |
"Ordered by raw difference of critics review minus audience review" | |
) + | |
labs(x = "Rating", y = "Film", color = "Review type") + | |
scale_x_continuous(limits = c(0, 100), | |
label = scales::label_percent(scale = 1)) + | |
theme_minimal() + | |
theme(legend.position = "top", ) | |
p + | |
annotate( | |
"rect", | |
xmin = 10, | |
xmax = 15, | |
ymin = 0.5, | |
ymax = 23.5, | |
alpha = .2 | |
) + | |
annotate( | |
"rect", | |
xmin = 50, | |
xmax = 55, | |
ymin = 23.5, | |
ymax = 42.5, | |
alpha = .2 | |
) + | |
annotate( | |
"rect", | |
xmin = 10, | |
xmax = 15, | |
ymin = 42.5, | |
ymax = 100.5, | |
alpha = .2 | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment