Last active
November 17, 2019 00:50
-
-
Save elliottmorris/cecf0eea4cf6019367bd0e9d9c6d1777 to your computer and use it in GitHub Desktop.
Analysis of Buttigieg's rise
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This file runs an analysis of 2020 Democratic primary polls to | |
# assess relationships between candidate's polling averages using | |
# variance-covariance matrices. | |
# | |
# Read more about it here: | |
# https://thecrosstab.substack.com/p/buttigiegs-rise-has-cost-biden-and | |
rm(list=ls()) | |
library(tidyverse) | |
library(lubridate) | |
library(zoo) | |
options(scipen = 1000) | |
# read in data from 538 | |
polls <- read_csv('https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv') | |
# filter to 2020 polls, select correct columns, and correct candidates, in certain geographies | |
candidates <- c("Biden", "Warren", "Sanders", "Harris", "Buttigieg") | |
polls <- polls %>% | |
filter(cycle==2020, | |
state %in% c("Iowa","New Hampshire",NA), | |
answer %in% candidates, | |
is.na(notes)) %>% | |
select(date=end_date, | |
geo = state, | |
pollster, | |
sponsors, | |
sample_size, | |
population, | |
tracking, | |
candidate=answer, | |
pct) %>% | |
mutate(geo = ifelse(is.na(geo),'National',geo), | |
date=mdy(date)) | |
# all polls over the last 4 months | |
polls <- polls %>% | |
filter(date >= (ymd(Sys.Date())-(30*5))) %>% | |
arrange(date) | |
# filter out all but most recent tracking polls each week | |
polls <- polls %>% | |
group_by(week = floor(difftime(Sys.Date(),date,units = 'weeks')), | |
geo, | |
pollster,sponsors, | |
candidate) %>% | |
mutate(poll_that_week = row_number()) %>% | |
filter(!tracking | poll_that_week == 1) %>% | |
ungroup() | |
# cap sample size and generate weight | |
polls <- polls %>% | |
mutate(sample_size = ifelse(sample_size>5000,5000,sample_size), | |
wt = sqrt(sample_size/mean(sample_size))) | |
# select only relevant columns | |
polls <- polls %>% | |
select(date, geo, pollster, candidate, sample_size, wt, pct) | |
# function to compute rolling weighted average | |
my_wtd_avg <- function(z){ | |
print(z) | |
avg <- rollapplyr(data = z, | |
width = 60, | |
by.column = F, | |
fill = NA, | |
FUN = function(z){ | |
pcts <- as.numeric(z[,'pct']) | |
wts <- as.numeric(z[,'wt']) | |
# exponential decay on weights | |
wts = wts * exp(-0.04 * rev(index(wts))) | |
# return weighted average | |
return(weighted.mean(pcts,wts,na.rm=T)) | |
}) | |
return(z %>% as_tibble() %>% cbind(avg)) | |
#return(wtdmeans) | |
} | |
# get all the data into the right format | |
avgs <- tibble(date = seq.Date((ymd(Sys.Date())-30*5), | |
ymd(Sys.Date()),by = 'days')) %>% # get 30 extra rows of NAs for the rollapplyr function, which for some reason doesn't know how to handle na.rm = T | |
left_join(polls %>% | |
select(date, geo, candidate)) %>% | |
expand(date,geo,candidate) %>% | |
na.omit() %>% | |
left_join(polls) %>% | |
group_by(candidate, geo) | |
# actually run the averages! | |
avgs <- avgs %>% | |
group_map(~ my_wtd_avg(.), keep = T) %>% | |
do.call('bind_rows',.) | |
# plot | |
ggplot(avgs %>% na.omit(), | |
aes(x=date,y=pct/100,col=candidate,weight=wt)) + | |
geom_point(aes(size=sample_size),alpha=0.2) + | |
geom_line(aes(y=avg/100)) + | |
scale_size(name = 'n',range = c(0.5,3)) + | |
facet_wrap(~geo) + | |
theme_minimal() + | |
scale_color_brewer("Candidate",palette="Dark2") + | |
theme(legend.position = 'top',legend.justification = 'left') + | |
labs(title='2020 Democratic primary polling averages', | |
subtitle = 'Weighted by recency and sample size', | |
y='Percent of Democratic voters', | |
x="Date") | |
# get data for covariance matrix | |
avgs.cov <- avgs %>% | |
filter(!is.na(avg)) %>% | |
select(date, geo, pollster, candidate, avg) %>% | |
group_by(date, geo, pollster, candidate) %>% | |
mutate(col = row_number()) %>% | |
spread(candidate, avg) | |
# function to get covariance | |
get_cov <- function(x){ | |
y <- x[,5:ncol(x)] | |
z <- cov(y,use='complete.obs') | |
print(y) | |
dat <- z %>% | |
as_tibble() %>% | |
mutate(geo = unique(x$geo), | |
candidate = rownames(z)) | |
return(dat) | |
} | |
# get covariances | |
covs <- avgs.cov %>% | |
filter(date >= (ymd(Sys.Date())-30)) %>% | |
mutate("Other_NA" = 100 - (Biden + Buttigieg + Harris + Sanders + Warren)) %>% | |
group_by(geo) %>% | |
# standardize avg | |
#mutate_at(c("Biden","Buttigieg","Harris","Sanders","Warren","Other_NA"),function(x){x/first(x)}) %>% | |
group_map(~ get_cov(.), keep = T) %>% | |
do.call('bind_rows',.) | |
covs %>% | |
select(geo,candidate,cov_with_buttigieg = Buttigieg) %>% | |
filter(candidate != "Buttigieg") %>% View() | |
# melt dataframe for plotting | |
avgs_for_cov_plots <- avgs.cov %>% | |
filter(date >= (ymd(Sys.Date())-30)) %>% | |
mutate("Other_NA" = 100 - (Biden + Buttigieg + Harris + Sanders + Warren)) %>% | |
group_by(geo) %>% | |
# standardize avg | |
#mutate_at(c("Biden","Buttigieg","Harris","Sanders","Warren","Other_NA"),function(x){x/first(x)}) %>% | |
gather(candidate,pct,c("Biden","Harris","Sanders","Warren","Other_NA")) | |
# plot buttigieg v other candidates over time | |
ggplot(avgs_for_cov_plots, aes(x=Buttigieg,y=pct,col=geo)) + | |
geom_point() + | |
stat_smooth(geom='line',method='lm') + | |
facet_wrap(~candidate) + | |
theme_minimal() + | |
scale_color_brewer("Geography",palette="Dark2") + | |
theme(legend.position = 'top',legend.justification = 'left') + | |
labs(title='Buttigieg has risen while Biden, Harris have fallen', | |
subtitle = 'Each point is a candidate\'s average on a different day', | |
y='Other candidate\'s polling average', | |
x="Buttigieg\'s polling average") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment