Skip to content

Instantly share code, notes, and snippets.

@elliottmorris
Last active April 9, 2018 11:55
Show Gist options
  • Save elliottmorris/905bd514110ea7d0b599cfca8a8e7332 to your computer and use it in GitHub Desktop.
Save elliottmorris/905bd514110ea7d0b599cfca8a8e7332 to your computer and use it in GitHub Desktop.
Trump approval house effects
##################################
# Code for house-correcting Trump approval polls
# Written by Alexander Agadjanian and G. Elliott Morris
# for use on Elliott's blog, TheCrosstab.com.
# @a_agadjanian @GElliottMorris
#
# Github Gist lives at https://gist.github.com/elliottmorris/905bd514110ea7d0b599cfca8a8e7332
####################################
# libraries ####
rm(list = ls())
#setwd("/Users/Victor/Desktop/Blog-data/excel files/post election/")
library(matrixStats)
library(ggrepel)
library(maps)
library(ggthemes)
library(stringr)
library(gridExtra)
library(survey)
library(haven)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(broom)
library(plotly)
options(stringsAsFactors = FALSE)
library(stargazer)
library(utils)
library(ggthemes)
library(ggrepel)
library(png)
library(grid)
library(gridExtra)
library(knitr)
library(reshape)
source('~/Desktop/theme_elliott.R')
source('~/Desktop/theme_elliott.R')
exponent_weight <- function(i) {
exp(-0.2*i)
}
weight_data <- data.frame("Days" = c(1:14),"Weights" = exponent_weight(c(1:14)))
weight_data
# data ####
download.file("http://elections.huffingtonpost.com/pollster/api/v2/questions/00c%20-Pres-45-Trump%20-%20Job%20Approval%20-%20National/poll-responses-clean.tsv",
"approvalrating.tsv")
df <- read.table("approvalrating.tsv", sep = "\t", header = TRUE)
df1 <- df %>%
filter(sample_subpopulation == "Likely Voters" |
sample_subpopulation == "Adults" |
sample_subpopulation == "Registered Voters")
write.csv(df1,"cleaned_polls.csv",row.names = FALSE)
polls <- df1 %>%
mutate(net.approval = Approve - Disapprove,
population = ifelse(sample_subpopulation == "Adults", "1. Adults",
ifelse(sample_subpopulation == "Registered Voters", "2. Registered Voters",
ifelse(sample_subpopulation == "Likely Voters", "3. Likely Voters", NA))),
mode = ifelse(mode == "Live Phone", "1. Live Phone",
ifelse(mode == "IVR/Online", "2. IVR/Online",
ifelse(mode == "Internet", "3. Internet", NA))),
field.time = difftime(end_date, start_date),
sample = observations,
since.inaug = difftime(end_date, "2017-01-20"),
undecided.dk = 100 - (Approve + Disapprove))
for (i in unique(polls$survey_house)){
polls[[i]] <- ifelse(polls$survey_house == i, paste("2. ",i),"1. Other Polls")
}
#################### Approval & Disapproval ################################
# + field.time + undecided.dk
approval.lm <- lm(Approve ~ population + mode + since.inaug + survey_house, data = polls)
summary(approval.lm)
df.approval.lm <- tidy(approval.lm)
# loop through to pull coefficients **********************
# creating vector that contains index term
temp <- polls[21:length(polls)]
pollsters <- colnames(temp)
# only want pollsters with 3+ polls
for (i in pollsters){
if(nrow(polls[polls$survey_house == i,]) < 2){
pollsters <- pollsters[pollsters != i]
}
}
# creating empty data frame to which house effects will be attached
house <- data.frame(term = character(),
estimate = numeric())
# looping through regressions for each individual pollster to get house effect for each
for (i in pollsters) {
approval.lm <- lm(polls$Approve ~ polls$population + polls$mode + polls$since.inaug + polls$field.time + polls$undecided.dk + polls[[i]])
summary(approval.lm)
df.approval.lm <- tidy(approval.lm) %>%
select(term, estimate)
df.approval.lm <- df.approval.lm[nrow(df.approval.lm),]
if (df.approval.lm[1,]$term == "polls$undecided.dk"){next}
print(paste(i,df.approval.lm))
house <- rbind(house, df.approval.lm)
}
# fixing data frame that contains house effects
house$term <- str_replace_all(house$term, "polls\\[\\[i\\]\\]2.", "")
house <- house %>%
select("Pollster" = term, "Effect" = estimate)
# order
house <- house[order(house$Effect),]
house$Pollster <- factor(house$Pollster, levels = house$Pollster[order(house$Effect)])
house$Pollster # notice the changed order of factor levels
house$Bias <- ifelse(house$Effect <=0,"Democratic Bias","Republican Bias")
# + field.time + undecided.dk
disapproval.lm <- lm(Disapprove ~ population + mode + since.inaug + survey_house, data = polls)
summary(disapproval.lm)
disdf.approval.lm <- tidy(approval.lm)
# loop through to pull coefficients **********************
# creating vector that contains index term
temp <- polls[21:length(polls)]
pollsters <- colnames(temp)
# creating empty data frame to which house effects will be attached
house_2 <- data.frame(term = character(),
estimate = numeric())
# looping through regressions for each individual pollster to get house effect for each
for (i in pollsters) {
disapproval.lm <- lm(polls$Disapprove ~ polls$population + polls$mode + polls$since.inaug + polls$field.time + polls$undecided.dk + polls[[i]])
summary(disapproval.lm)
df.disapproval.lm <- tidy(disapproval.lm) %>%
select(term, estimate)
df.disapproval.lm <- df.disapproval.lm[nrow(df.disapproval.lm),]
if (df.disapproval.lm[1,]$term == "polls$undecided.dk"){next}
print(paste(i,df.approval.lm))
house_2 <- rbind(house_2, df.disapproval.lm)
}
# fixing data frame that contains house effects
house_2$term <- str_replace_all(house_2$term, "polls\\[\\[i\\]\\]2.", "")
house_2 <- house_2 %>%
select("Pollster" = term, "Effect" = estimate)
# order
house_2 <- house_2[order(house_2$Effect),]
house_2$Pollster <- factor(house_2$Pollster, levels = house_2$Pollster[order(house_2$Effect)])
house_2$Pollster # notice the changed order of factor levels
house_2$Bias <- ifelse(house_2$Effect <=0,"Democratic Bias","Republican Bias")
names(house_2) <- c("Pollster","D_Effect","Bias")
house <- left_join(house,
house_2 %>% select(-Bias),
by="Pollster")
# creating graph for pollster effects ####
pollster_effect<- ggplot() +
geom_bar(data = house, aes(x = reorder(Pollster,Effect), y = Effect, fill = Effect),col = "#616161", stat = "identity") +
theme_elliott() +
labs(title = "House Effects for Trump Job Approval Polls",
subtitle = "After taking mode, population, and date of poll into account.\nOnly computed for polling firms that have released more than 2 polls of Trump's Job Approval",
x = "Pollster",
y = "House Effect") +
scale_fill_gradient2(low = "blue",mid="white",high="red") +
scale_y_continuous(breaks=c(-10,-5,0,5,10),labels=c("-10%","-5%","0","5%","10%")) +
coord_flip()
grid.arrange(pollster_effect,my_g,heights=c(9, .5))
png("Trump_approval.png",unit="in",height=6,width=8,res=300)
grid.arrange(pollster_effect,my_g,heights=c(9, .5))
dev.off()
kable(house,align="l",format="html",digits=2,row.names = FALSE)
## stargazing
approval.lm1 <- lm(net.approval ~ population + mode + since.inaug + field.time + undecided.dk, data = polls)
approval.lm2 <- lm(net.approval ~ population + mode + since.inaug + field.time + undecided.dk + survey_house, data = polls)
approval.lm3 <- lm(net.approval ~ population + mode + since.inaug + survey_house, data = polls)
# approve
approval.lm4 <- lm(Approve ~ population + mode + since.inaug + field.time + undecided.dk, data = polls)
approval.lm5 <- lm(Approve ~ population + mode + since.inaug + field.time + undecided.dk + survey_house, data = polls)
approval.lm6 <- lm(Approve ~ population + mode + since.inaug + survey_house, data = polls)
text<- stargazer(approval.lm1,approval.lm2,approval.lm3,approval.lm4,approval.lm5,approval.lm6, type = "html")
#box plots
polling <- df %>%
filter(sample_subpopulation == "Likely Voters" |
sample_subpopulation == "Adults" |
sample_subpopulation == "Registered Voters")
mode_effect <- ggplot(polling[as.Date(polling$end_date) > as.Date(max(as.Date(polling$end_date))) - 21,], aes(x = reorder(mode,Approve), y= Approve, col = mode)) +
geom_boxplot(width=.4) +
theme_elliott() +
coord_flip()+
coord_flip() +
labs(title = "Mode Effects for Trump Job Approval Polls",
subtitle = "A simple comparison of averages over the past 3 weeks",
y = "Approval Rating (%)",
x = "Pollster")
grid.arrange(mode_effect,my_g,heights=c(9, .5))
png("Trump_mode.png",unit="in",height=6,width=8,res=300)
grid.arrange(mode_effect,my_g,heights=c(9, .5))
dev.off()
pop_effect <- ggplot(polling[as.Date(polling$end_date) > as.Date(max(as.Date(polling$end_date))) - 21,], aes(x = reorder(sample_subpopulation,Approve), y= Approve, color = sample_subpopulation)) +
geom_boxplot(width=.4) +
theme_elliott() +
coord_flip() +
labs(title = "Population Effects for Trump Job Approval Polls",
subtitle = "A simple comparison of averages over the past 3 weeks",
y = "Approval Rating (%)",
x = "Pollster")
grid.arrange(pop_effect,my_g,heights=c(9, .5))
png("Trump_population.png",unit="in",height=6,width=8,res=300)
grid.arrange(pop_effect,my_g,heights=c(9, .5))
dev.off()
avg_compare <- ggplot(polling[as.Date(polling$end_date) > as.Date(max(as.Date(polling$end_date))) - 21,],
aes(x = reorder(survey_house,Approve), y= Approve,color=survey_house)) +
geom_boxplot() +
theme_elliott() +
coord_flip() +
labs(title = "House Effects for Trump Job Approval Polls",
subtitle = "A simple comparison of averages over the past 3 weeks",
y = "Approval Rating (%)",
x = "Pollster")
grid.arrange(avg_compare,my_g,heights=c(9, .5))
png("Trump_house_avg.png",unit="in",height=6,width=8,res=300)
grid.arrange(avg_compare,my_g,heights=c(9, .5))
dev.off()
########## why not try a correction? ###################
df2 <- df1 %>% mutate(Adjusted = NA,Adjusted_Dis = NA)
house$Pollster_split <- substr(as.character(house$Pollster),3,10000)
for (i in unique(df2$survey_house)){
if(nrow(house[house$Pollster_split == i,])>0){
effect <- as.numeric(house[house$Pollster_split == i,]$Effect)}else{effect <- 0}
df2[df2$survey_house == i,]$Adjusted = df2[df2$survey_house == i,]$Approve - effect
if(nrow(house[house$Pollster_split == i,])>0){
dis_effect <- as.numeric(house[house$Pollster_split == i,]$D_Effect)}else{dis_effect <- 0}
df2[df2$survey_house == i,]$Adjusted_Dis = df2[df2$survey_house == i,]$Disapprove - dis_effect
}
df2$Net <- df2$Adjusted - df2$Adjusted_Dis
df2<-df2[order(df2$end_date),]
# function for WMA
ma <- function(x,n=7){stats::filter(x,rep(1/n,n), sides=1)}
df3 <- as.data.frame(df2 %>%
group_by(end_date) %>%
summarise(Adjusted = mean(Adjusted),
Adjusted_Dis = mean(Adjusted_Dis),
Net = mean(Net)))
# for i in date
# all polls in last 2 weeks
# get days since date
# get weights
# make weighted average
# save average to that row
df3$M_Adjusted <- NA
df3$M_Adjusted_Dis <- NA
for (i in unique(df3$end_date)){
temp <- df3 %>%
filter((as.Date(end_date) > as.Date(i) - 15) & as.Date(end_date)<= as.Date(i)) %>%
mutate(Days = as.numeric(difftime(as.Date(Sys.Date()),ymd(end_date))),
weight = exponent_weight(Days))
df3[df3$end_date == i,]$M_Adjusted = weighted.mean(temp$Adjusted,temp$weight)
df3[df3$end_date == i,]$M_Adjusted_Dis = weighted.mean(temp$Adjusted_Dis,temp$weight)
}
df3$M_Net = df3$M_Adjusted - df3$M_Adjusted_Dis
df3 <- df3[complete.cases(df3),]
names(df3) <- as.character(names(df3))
df3.m <- melt(df3,id.vars = c("end_date","Adjusted","Adjusted_Dis","Net","M_Net"))
df3.m$variable <- gsub("M_Adjusted","Approve",df3.m$variable)
df3.m$variable <- gsub("Approve_Dis","Disapprove",df3.m$variable)
# need the points for polls
df2.m <- melt(df2, id.vars = c(names(df2)[!(names(df2) %in% c("Adjusted","Adjusted_Dis"))]))
df2.m$variable <- gsub("Adjusted","Approve",df2.m$variable)
df2.m$variable <- gsub("Approve_Dis","Disapprove",df2.m$variable)
#avg today
disapprove <- df3[as.Date(df3$end_date) == max(as.Date(df3$end_date)),]$M_Adjusted_Dis
approve <- df3[as.Date(df3$end_date) == max(as.Date(df3$end_date)),]$M_Adjusted
trump_polls_ggplot <- ggplot(df3.m,
aes(x=as.Date(end_date),y=value,col = variable))+
geom_point(data=df2.m,
aes(x=as.Date(end_date),y=value,col=variable,fill=survey_house,
label=value,label2=survey_house),
shape=1,alpha=.8)+
geom_line(size=1) +
theme_elliott() +
labs(x="Date",
y="",
title = "Trump Job Approval Polls",
subtitle="Polls corrected for house effects and averaged with\nhigher weight given to most recent polls") +
geom_text(aes(x = as.Date(max(as.Date(df3$end_date)))+7,
y=disapprove,
label=paste0(round(disapprove,1),"% Disapprove"),col="Disapprove")) +
geom_text(aes(x = as.Date(max(as.Date(df3$end_date)))+7,
y=approve,
label=paste0(round(approve,1),"% Approve"),col="Approve")) +
scale_color_manual("",values = c("Approve" = "blue","Disapprove"="red")) +
coord_cartesian(xlim = c(as.Date("2017-01-20"),max(as.Date(df3$end_date))+15),
ylim = c(35,65))
#geom_smooth(data=df2,aes(x=as.Date(end_date),y=Net),col="black",se=FALSE,span=.3)
grid.arrange(trump_polls_ggplot,my_g,heights=c(9, .5))
png("MasterPlot.png",unit="in",height=15,width=15,res=400)
grid.arrange(trump_polls_ggplot,
grid.arrange(
pollster_effect,
avg_compare,
mode_effect,
pop_effect,
ncol = 2),
my_g,
heights=c(5,9, .5)
)
dev.off()
house$Pollster = substr(as.character(house$Pollster),3,100)
left_join(df2 %>%
select("Pollster" = survey_house,
"Date" = end_date,
Approve,
Disapprove,
"Approve (Adj.)" = Adjusted,
"Disapprove (Adj.)"= Adjusted_Dis) %>%
arrange(desc(Date)),
house %>% select(Pollster,"House Effect" = Effect),by="Pollster")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment