Last active
August 16, 2020 15:16
-
-
Save elliottmorris/ae1747e1bbb2fc795e49164df4c85987 to your computer and use it in GitHub Desktop.
Code to make a chart that compares Biden's polling numbers to Hillary Clinton's 2016 performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(janitor) | |
library(lubridate) | |
library(zoo) | |
library(politicaldata) | |
RUN_DATE <- Sys.Date() | |
start_date <- ymd("2020-02-01") | |
election_day <- ymd('2020-11-03') | |
# all_polls <- read.csv("data/all_polls.csv", stringsAsFactors = FALSE, header = TRUE) | |
url<- 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQ56fySJKLL18Lipu1_i3ID9JE06voJEz2EXm6JW4Vh11zmndyTwejMavuNntzIWLY0RyhA1UsVEen0/pub?gid=0&single=true&output=csv' | |
all_polls <- read_csv(url) | |
# remove any polls if biden or trump blank | |
all_polls <- all_polls %>% filter(!is.na(biden),!is.na(trump))#, include == "TRUE") | |
# save output for later | |
write_csv(all_polls %>% filter(state=='--') %>% clean_names %>% | |
mutate_at(c('biden','trump','other','undecided','biden_margin'), | |
function(x){x/100}) %>% | |
mutate(end_date = mdy(end_date), | |
biden_two_party = biden / (biden + trump), | |
trump_two_party = trump / (biden + trump)) %>% | |
filter(end_date >= ymd("2020-03-01")), | |
'output/site_data/national_polls.csv') | |
write_csv(all_polls %>% filter(state!='--') %>% clean_names %>% | |
mutate_at(c('biden','trump','other','undecided','biden_margin'), | |
function(x){x/100}) %>% | |
mutate(end_date = mdy(end_date), | |
biden_two_party = biden / (biden + trump), | |
trump_two_party = trump / (biden + trump)) %>% | |
filter(end_date >= ymd("2020-03-01")), | |
'output/site_data/state_polls.csv') | |
# remove polls from before today | |
all_polls <- all_polls %>% | |
mutate(entry.date = as_date(entry.date.time..et., format = '%m/%d/%Y %H:%M',tz='US/Eastern')) %>% | |
filter(entry.date <= (ymd(RUN_DATE))) | |
# take out state-level online polls | |
#all_polls <- all_polls %>% filter(state == '--' | grepl('live',tolower(mode))) | |
# limit the number of observations | |
all_polls$number.of.observations = ifelse(all_polls$number.of.observations > 3000, 3000, all_polls$number.of.observations) | |
# select relevant columns google sheet, make mutations | |
all_polls <- all_polls %>% | |
dplyr::select(state, pollster, number.of.observations, population, mode, | |
start.date, end.date, entry.date, | |
biden, trump, undecided, other) %>% | |
#filter(mdy(end.date) <= RUN_DATE) %>% | |
mutate(start.date = as.character(mdy(start.date)), | |
end.date = as.character(mdy(end.date))) %>% | |
mutate(population = case_when(population == 'lv' ~ 'Likely Voters', | |
population == 'rv' ~ 'Registered Voters', | |
population == 'a' ~ 'Adults')) | |
# basic mutations | |
df <- all_polls %>% | |
tbl_df %>% | |
rename(n = number.of.observations) %>% | |
mutate(begin = ymd(start.date), | |
end = ymd(end.date), | |
t = end - (1 + as.numeric(end-begin)) %/% 2) %>% | |
filter(t >= start_date & !is.na(t) | |
& (population == "Likely Voters" | | |
population == "Registered Voters" | | |
population == "Adults") # get rid of disaggregated polls | |
& n > 1) | |
# pollster mutations | |
df <- df %>% | |
mutate(pollster = str_extract(pollster, pattern = "[A-z0-9 ]+") %>% sub("\\s+$", "", .), | |
pollster = replace(pollster, pollster == "Fox News", "FOX"), # Fixing inconsistencies in pollster names | |
pollster = replace(pollster, pollster == "WashPost", "Washington Post"), | |
pollster = replace(pollster, pollster == "ABC News", "ABC"), | |
pollster = replace(pollster, pollster == "DHM Research", "DHM"), | |
pollster = replace(pollster, pollster == "Public Opinion Strategies", "POS"), | |
undecided = ifelse(is.na(undecided), 0, undecided), | |
other = ifelse(is.na(other), 0, other))# + | |
# mode mutations | |
table(df$mode) | |
df <- df %>% | |
mutate(mode = case_when(mode == 'Internet' | mode == 'Online' ~ 'Online poll', | |
grepl("live phone",tolower(mode)) ~ 'Live phone component', | |
TRUE ~ 'Other')) | |
table(df$mode) | |
table(df$population) | |
# vote shares etc | |
df <- df %>% | |
mutate(two_party_sum = biden + trump, | |
polltype = population,#as.integer(as.character(recode(population, | |
# "Likely Voters" = "0", | |
# "Registered Voters" = "1", | |
# "Adults" = "2"))), | |
n_respondents = round(n), | |
# biden | |
n_biden = round(n * biden/100), | |
pct_biden = biden/two_party_sum, | |
n_trump = round(n * trump/100), | |
pct_trump = trump/two_party_sum) | |
## --- numerical indices | |
state_abb_list <- read.csv("data/potus_results_76_16.csv") %>% | |
pull(state) %>% unique() | |
df <- df %>% | |
mutate(poll_day = t - min(t) + 1, | |
# Factors are alphabetically sorted: 1 = --, 2 = AL, 3 = AK, 4 = AZ... | |
index_s = as.numeric(factor(as.character(state), | |
levels = c('--',state_abb_list))), | |
index_s = ifelse(index_s == 1, 52, index_s - 1), | |
index_t = 1 + as.numeric(t) - min(as.numeric(t)), | |
index_p = as.numeric(as.factor(as.character(pollster))), | |
index_m = as.numeric(as.factor(as.character(mode))), | |
index_pop = as.numeric(as.factor(as.character(polltype)))) %>% | |
# selections | |
arrange(state, t, polltype, two_party_sum) %>% | |
distinct(state, t, pollster, .keep_all = TRUE) %>% | |
select( | |
# poll information | |
state, t, begin, end, pollster, polltype, method = mode, n_respondents, | |
# vote shares | |
pct_biden, n_biden, | |
pct_trump, n_trump, | |
poll_day, index_s, index_p, index_m, index_pop, index_t) | |
# useful vectors | |
all_polled_states <- df$state %>% unique %>% sort | |
# day indices | |
first_day <- min(df$begin) | |
ndays <- max(df$t) - min(df$t) | |
all_t <- min(df$t) + days(0:(ndays)) | |
all_t_until_election <- min(all_t) + days(0:(election_day - min(all_t))) | |
# pollster indices | |
all_pollsters <- levels(as.factor(as.character(df$pollster))) | |
# graph of polls relative to Clinton 2016 | |
delta_df <- df %>% | |
select(state,end,pct_biden,method,n_respondents) %>% | |
left_join(politicaldata::pres_results %>% | |
filter(year==2016) %>% | |
mutate(pct_clinton = dem/(dem+rep)) %>% | |
select(state,pct_clinton) %>% | |
bind_rows(tibble(state='--',pct_clinton = 0.511))) %>% | |
filter(end <= Sys.Date()) %>% | |
arrange(end) %>% | |
mutate(level = ifelse(state=='--','National','State'), | |
delta = (pct_biden - pct_clinton)*2*100) | |
delta_ts <- lapply(unique(delta_df$level), | |
function(i){ | |
lapply(unique(delta_df$method), | |
function(j){ | |
tmp <- delta_df %>% filter(level == i, method == j) | |
lapply(seq.Date(ymd("2020-02-01"),Sys.Date(),'day'), | |
function(k){ | |
tmp$day_from_k <- as.numeric(abs(difftime(k,tmp$end,units = 'days'))) + 1 | |
tibble(end = k, | |
level = i, | |
method = j, | |
average_delta = weighted.mean(tmp[tmp$end >= (k-60) & tmp$end <= k,]$delta, | |
tmp[tmp$end >= (k-60) & tmp$end <= k,]$n_respondents * | |
exp(-0.04 * tmp[tmp$end >= (k-60) & tmp$end <= k,]$day_from_k) | |
)) | |
}) %>% bind_rows | |
})%>% bind_rows | |
}) %>% bind_rows %>% | |
group_by(level,method) %>% | |
mutate(average_delta = imputeTS::na_kalman(average_delta)) | |
delta_ts_all <- lapply(unique(delta_df$level), | |
function(i){ | |
tmp <- delta_df %>% filter(level == i) | |
lapply(seq.Date(ymd("2020-02-01"),Sys.Date(),'day'), | |
function(k){ | |
tmp$day_from_k <- as.numeric(abs(difftime(k,tmp$end,units = 'days'))) + 1 | |
tibble(end = k, | |
level = i, | |
method = 'All polls', | |
average_delta = weighted.mean(tmp[tmp$end >= (k-60) & tmp$end <= k,]$delta, | |
tmp[tmp$end >= (k-60) & tmp$end <= k,]$n_respondents * | |
exp(-0.04 * tmp[tmp$end >= (k-60) & tmp$end <= k,]$day_from_k) | |
)) | |
}) %>% bind_rows | |
}) %>% bind_rows %>% | |
group_by(level,method) %>% | |
mutate(average_delta = imputeTS::na_kalman(average_delta)) | |
gg <- ggplot(delta_df,aes(x=end,group=method,col=method)) + | |
geom_hline(yintercept = 0,col='gray30') + | |
# polls and sooth | |
geom_point(aes(y=delta),alpha=0.2) + | |
#geom_smooth(aes(y=delta),span=0.8,show.legend = F,se=F) + | |
# average lines | |
#geom_line(data=delta_ts,aes(y=average_delta)) + | |
#geom_line(data=delta_ts_all,inherit.aes = F,aes(x=end,y=average_delta),col='black',linetype=2,show.legend=F) + | |
# average smooths | |
geom_smooth(data=delta_ts,aes(y=average_delta),span=0.1,se=F,size=0.5) + | |
geom_smooth(data=delta_ts_all,inherit.aes = F,aes(x=end,y=average_delta),col='black',linetype=2,span=0.1,show.legend=F,se=F,size=0.7) + | |
labs(x='Date',y='',subtitle='Biden margin in polls minus Clinton margin in 2016, by geography and poll mode') + | |
# the rest | |
theme_minimal() + | |
theme(legend.position = 'top',panel.grid.minor = element_blank()) + | |
scale_y_continuous(breaks=seq(-100,100,2),limits=c(-2,14)) + | |
scale_x_date(date_breaks = 'month',date_labels = '%b', | |
limits = c(ymd('2020-03-01'),Sys.Date())) + | |
scale_color_brewer(palette='Set1',name='Poll mode') + | |
facet_wrap(~level) | |
print(gg) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment