A replication of the results in Matt Parker’s video: https://youtu.be/aokNwKx7gM8
library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.0.2
# Download the raw data from Matt's Dropbox link
file_url <- "https://dl.dropboxusercontent.com/s/7gusbewresl6zsg/Kent%20County%20Precinct%20Data%20from%20Stand-up%20Maths%20video.xlsx"
file <- tempfile(fileext = ".xlsx")
r <- httr::GET(file_url, httr::write_disk(file))
httr::stop_for_status(r)
raw_data <- readxl::read_excel(file)
#> New names:
#> * `` -> ...1
#> * `` -> ...4
#> * `` -> ...5
#> * `` -> ...6
#> * `` -> ...7
#> * ...
# Clean up the spreadsheet data
votes <- raw_data %>%
fill(...1) %>%
slice(6:n()) %>%
set_names(.[1, ]) %>%
rename(Contest = `Straight Party`) %>%
mutate(Contest = case_when(
Contest == "Straight Party" ~ "Party",
Contest == "President / Vice Pres." ~ "President"
)) %>%
mutate_all(str_replace, "[\r\n]+", " ") %>%
filter(!is.na(Precinct)) %>%
# Seems that contest changes but not recorded in 1st column
filter(cumsum(trimws(Precinct) == "Total US Senator") == 0) %>%
filter(str_starts(Precinct, "\\d")) %>%
mutate(across(Precinct, str_remove, "^\\d+\\s+")) %>%
pivot_longer(
cols = -c(Contest, Precinct),
names_to = "Candidate",
values_to = "Votes"
) %>%
mutate(across(Candidate, str_remove, "\\s+Party")) %>%
mutate(across(Votes, as.integer))
# Calculate direct presidential vote and vote shares
shares <- votes %>%
pivot_wider(
names_from = Contest,
values_from = Votes
) %>%
mutate(Direct = President - Party) %>%
pivot_longer(
cols = -c(Precinct, Candidate),
names_to = "Contest",
values_to = "Votes"
) %>%
group_by(Precinct, Contest) %>%
mutate(VoteShare = Votes / sum(Votes))
shares
#> # A tibble: 5,292 x 5
#> # Groups: Precinct, Contest [756]
#> Precinct Candidate Contest Votes VoteShare
#> <chr> <chr> <chr> <int> <dbl>
#> 1 Ada Township Precinct 1 Democratic Party 254 0.450
#> 2 Ada Township Precinct 1 Democratic President 589 0.548
#> 3 Ada Township Precinct 1 Democratic Direct 335 0.656
#> 4 Ada Township Precinct 1 Republican Party 307 0.544
#> 5 Ada Township Precinct 1 Republican President 472 0.439
#> 6 Ada Township Precinct 1 Republican Direct 165 0.323
#> 7 Ada Township Precinct 1 Libertarian Party 0 0
#> 8 Ada Township Precinct 1 Libertarian President 13 0.0121
#> 9 Ada Township Precinct 1 Libertarian Direct 13 0.0254
#> 10 Ada Township Precinct 1 US Taxpayers Party 0 0
#> # ... with 5,282 more rows
Scatterplot of direct candidate vote share vs. party vote share:
two_party_contest_shares <- shares %>%
pivot_wider(
values_from = c(Votes, VoteShare),
names_from = Contest,
names_sep = "_"
) %>%
filter(Candidate %in% c("Democratic", "Republican"))
two_party_contest_shares %>%
ggplot(aes(VoteShare_Party, VoteShare_Direct)) +
facet_wrap(~ Candidate) +
coord_fixed() +
geom_abline(slope = c(0, 1), lty = 2) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
#> `geom_smooth()` using formula 'y ~ x'
And the differenced scatterplot:
last_plot() + aes(y = VoteShare_Direct - VoteShare_Party)
#> `geom_smooth()` using formula 'y ~ x'
Does the slope differ between parties?
model <- lm(
VoteShare_Direct ~ VoteShare_Party + Candidate,
data = two_party_contest_shares
)
summary(update(model, . ~ . + VoteShare_Party:Candidate))
#>
#> Call:
#> lm(formula = VoteShare_Direct ~ VoteShare_Party + Candidate +
#> VoteShare_Party:Candidate, data = two_party_contest_shares)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -0.149350 -0.038463 -0.000564 0.037402 0.149718
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.27053 0.01030 26.275 <2e-16 ***
#> VoteShare_Party 0.63746 0.01928 33.058 <2e-16 ***
#> CandidateRepublican -0.17518 0.01424 -12.305 <2e-16 ***
#> VoteShare_Party:CandidateRepublican -0.03844 0.02701 -1.423 0.155
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.0569 on 500 degrees of freedom
#> Multiple R-squared: 0.8815, Adjusted R-squared: 0.8808
#> F-statistic: 1240 on 3 and 500 DF, p-value: < 2.2e-16
broom::augment(model) %>%
ggplot(aes(VoteShare_Party, .resid)) +
facet_wrap(~ Candidate) +
geom_point() +
geom_smooth(method = "lm")
#> `geom_smooth()` using formula 'y ~ x'
Not really.