Skip to content

Instantly share code, notes, and snippets.

@elliottmorris
Last active November 17, 2019 00:50
Show Gist options
  • Save elliottmorris/cecf0eea4cf6019367bd0e9d9c6d1777 to your computer and use it in GitHub Desktop.
Save elliottmorris/cecf0eea4cf6019367bd0e9d9c6d1777 to your computer and use it in GitHub Desktop.
Analysis of Buttigieg's rise
# This file runs an analysis of 2020 Democratic primary polls to
# assess relationships between candidate's polling averages using
# variance-covariance matrices.
#
# Read more about it here:
# https://thecrosstab.substack.com/p/buttigiegs-rise-has-cost-biden-and
rm(list=ls())
library(tidyverse)
library(lubridate)
library(zoo)
options(scipen = 1000)
# read in data from 538
polls <- read_csv('https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv')
# filter to 2020 polls, select correct columns, and correct candidates, in certain geographies
candidates <- c("Biden", "Warren", "Sanders", "Harris", "Buttigieg")
polls <- polls %>%
filter(cycle==2020,
state %in% c("Iowa","New Hampshire",NA),
answer %in% candidates,
is.na(notes)) %>%
select(date=end_date,
geo = state,
pollster,
sponsors,
sample_size,
population,
tracking,
candidate=answer,
pct) %>%
mutate(geo = ifelse(is.na(geo),'National',geo),
date=mdy(date))
# all polls over the last 4 months
polls <- polls %>%
filter(date >= (ymd(Sys.Date())-(30*5))) %>%
arrange(date)
# filter out all but most recent tracking polls each week
polls <- polls %>%
group_by(week = floor(difftime(Sys.Date(),date,units = 'weeks')),
geo,
pollster,sponsors,
candidate) %>%
mutate(poll_that_week = row_number()) %>%
filter(!tracking | poll_that_week == 1) %>%
ungroup()
# cap sample size and generate weight
polls <- polls %>%
mutate(sample_size = ifelse(sample_size>5000,5000,sample_size),
wt = sqrt(sample_size/mean(sample_size)))
# select only relevant columns
polls <- polls %>%
select(date, geo, pollster, candidate, sample_size, wt, pct)
# function to compute rolling weighted average
my_wtd_avg <- function(z){
print(z)
avg <- rollapplyr(data = z,
width = 60,
by.column = F,
fill = NA,
FUN = function(z){
pcts <- as.numeric(z[,'pct'])
wts <- as.numeric(z[,'wt'])
# exponential decay on weights
wts = wts * exp(-0.04 * rev(index(wts)))
# return weighted average
return(weighted.mean(pcts,wts,na.rm=T))
})
return(z %>% as_tibble() %>% cbind(avg))
#return(wtdmeans)
}
# get all the data into the right format
avgs <- tibble(date = seq.Date((ymd(Sys.Date())-30*5),
ymd(Sys.Date()),by = 'days')) %>% # get 30 extra rows of NAs for the rollapplyr function, which for some reason doesn't know how to handle na.rm = T
left_join(polls %>%
select(date, geo, candidate)) %>%
expand(date,geo,candidate) %>%
na.omit() %>%
left_join(polls) %>%
group_by(candidate, geo)
# actually run the averages!
avgs <- avgs %>%
group_map(~ my_wtd_avg(.), keep = T) %>%
do.call('bind_rows',.)
# plot
ggplot(avgs %>% na.omit(),
aes(x=date,y=pct/100,col=candidate,weight=wt)) +
geom_point(aes(size=sample_size),alpha=0.2) +
geom_line(aes(y=avg/100)) +
scale_size(name = 'n',range = c(0.5,3)) +
facet_wrap(~geo) +
theme_minimal() +
scale_color_brewer("Candidate",palette="Dark2") +
theme(legend.position = 'top',legend.justification = 'left') +
labs(title='2020 Democratic primary polling averages',
subtitle = 'Weighted by recency and sample size',
y='Percent of Democratic voters',
x="Date")
# get data for covariance matrix
avgs.cov <- avgs %>%
filter(!is.na(avg)) %>%
select(date, geo, pollster, candidate, avg) %>%
group_by(date, geo, pollster, candidate) %>%
mutate(col = row_number()) %>%
spread(candidate, avg)
# function to get covariance
get_cov <- function(x){
y <- x[,5:ncol(x)]
z <- cov(y,use='complete.obs')
print(y)
dat <- z %>%
as_tibble() %>%
mutate(geo = unique(x$geo),
candidate = rownames(z))
return(dat)
}
# get covariances
covs <- avgs.cov %>%
filter(date >= (ymd(Sys.Date())-30)) %>%
mutate("Other_NA" = 100 - (Biden + Buttigieg + Harris + Sanders + Warren)) %>%
group_by(geo) %>%
# standardize avg
#mutate_at(c("Biden","Buttigieg","Harris","Sanders","Warren","Other_NA"),function(x){x/first(x)}) %>%
group_map(~ get_cov(.), keep = T) %>%
do.call('bind_rows',.)
covs %>%
select(geo,candidate,cov_with_buttigieg = Buttigieg) %>%
filter(candidate != "Buttigieg") %>% View()
# melt dataframe for plotting
avgs_for_cov_plots <- avgs.cov %>%
filter(date >= (ymd(Sys.Date())-30)) %>%
mutate("Other_NA" = 100 - (Biden + Buttigieg + Harris + Sanders + Warren)) %>%
group_by(geo) %>%
# standardize avg
#mutate_at(c("Biden","Buttigieg","Harris","Sanders","Warren","Other_NA"),function(x){x/first(x)}) %>%
gather(candidate,pct,c("Biden","Harris","Sanders","Warren","Other_NA"))
# plot buttigieg v other candidates over time
ggplot(avgs_for_cov_plots, aes(x=Buttigieg,y=pct,col=geo)) +
geom_point() +
stat_smooth(geom='line',method='lm') +
facet_wrap(~candidate) +
theme_minimal() +
scale_color_brewer("Geography",palette="Dark2") +
theme(legend.position = 'top',legend.justification = 'left') +
labs(title='Buttigieg has risen while Biden, Harris have fallen',
subtitle = 'Each point is a candidate\'s average on a different day',
y='Other candidate\'s polling average',
x="Buttigieg\'s polling average")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment