Skip to content

Instantly share code, notes, and snippets.

@pitakakariki
Last active August 4, 2016 09:28
Show Gist options
  • Save pitakakariki/2791866 to your computer and use it in GitHub Desktop.
Save pitakakariki/2791866 to your computer and use it in GitHub Desktop.
Code to generate an interactive SVG graph of NZ general election polling.
###
##
## --- New Zealand General Election Polling ---
##
## Plots polling data scraped from Wikipedia.
##
## - GAM -
##
## The smoothed curves are calculated using a generalised additive model (GAM). The
## smoothing parameter is estimated using cross-validation. This means that the curves
## are estimated based on subsets of the data, and the parameter that best predicts
## the hold-out data is chosen. Note that the error bounds are based on the assumption
## that the smoothing parameter is correct, i.e. uncertainty in the smoothing parameter
## estimate are not accounted for.
##
## A separate smoothed curve is estimated for each combination of polling firm and party
## (the election is treated as a poll and given more weight than the opinion polls). The
## shape of the curves for each party is constrained to be the same for every polling firm,
## although they can be vertically offset from each other. The displayed curve is aligned
## so that it passes through the election result, but in the interactive version you can
## see the offset curves for each polling outfit.
##
## - SVG -
##
## The svg graphics format allows interactivity via Javascript. Note that the
## SVGAnnotation library comes from http://www.bioconductor.org/ rather that
## the CRAN repositories. This requires Cairo to be available (use
## capabilities("cairo") to check this).
##
###
library(stringr)
library(plyr)
library(XML)
library(xts)
library(ggplot2)
library(lubridate)
library(mgcv)
library(reshape)
library(Hmisc)
library(rje)
###
##
## Settings
##
###
# Options
opt_begin <- as.Date("2008-01-01") #as.Date("1996-01-01") #as.Date("2002-06-01")
opt_end <- today()
opt_filename <- "test"
opt_svg <- TRUE # set to FALSE if you can't get SVGAnnotation to work
opt_png <- !opt_svg
opt_filename <- str_c(opt_filename, if(opt_svg) ".svg" else ".png")
opt_bc <- TRUE # bias corrections
opt_bcplot <- FALSE # plot bias-corrected points
opt_bcreport <- TRUE # print bias estimates
opt_bccurves <- opt_svg # curves for each polling firm
opt_events <- FALSE # event markers
opt_thresh <- TRUE # 5% threshold marker
opt_ewt <- 1000 # weighting for election data points
opt_adapt <- FALSE # Adaptive smoother
opt_svgwidth <- 12 # width and height if the output file is svg
opt_svgheight <- 7 # these will be multiplied by 72 for png
opt_xwidth <- 3/2 * opt_svgwidth # 1 / opt_xwidth of the plot is reserved for final results
opt_xheight <- 3/2 * opt_svgheight # 1 / opt_xheight for event markers
opt_dashes <- "[\u002D\u2013\u2014\u2212]"
opt_spaces <- "[\u0020\u00A0]"
opt_plotcurves <- TRUE
opt_curvepts <- 500 # number of line segements to make "curves"
opt_curvett <- TRUE # tooltips on curves
opt_moe <- 0
opt_legend <- TRUE
# Wikipedia urls
urls <- str_c("http://en.wikipedia.org/wiki/Opinion_polling_for_the_New_Zealand_general_election,_", c(2005, 2008, 2011))
urls <- c(urls, "http://en.wikipedia.org/wiki/Opinion_polling_for_the_next_New_Zealand_general_election")
# Party details
party_ctl <- read.table(
header = TRUE,
row.names = "Party",
stringsAsFactors = FALSE,
comment = "",
text = "
Party Colour Contrast Include
ACT #FFE401 #FFFF80 -
Conservative #00AEEF #33CCFF -
Destiny #FF0000 #000000 -
Green #098137 #B3FFB3 Y
Labour #FF0000 #FFBAA8 Y
Mana #770808 #FF6E6E -
Maori #EF4A42 #FFCC80 -
National #00529F #CCDDFF Y
'NZ First' #000000 #CCCCCC Y
Progressive #9E9E9E #DDCCDD -
Internet #662C92 #EE77FF -
'Internet/Mana' #662C92 #FF6E6E -
'United Future' #501557 #DD99DD -
'Labour/Green' #FF0000 #B3FFB3 -
")
party_ctl[, "Include"] <- (party_ctl[, "Include"] == "Y")
rownames(party_ctl)[match("Maori", rownames(party_ctl))] <- "M\u0101ori"
if(opt_svg) library(SVGAnnotation, quietly=TRUE)
###
##
## Helper function definitions
##
###
get_data <- function(data_url, quiet=FALSE, i=1) {
if(!quiet) message(str_c("Reading data from ", data_url))
h <- htmlParse(data_url, encoding="UTF-8")
x <- readHTMLTable(h, encoding="UTF-8", stringsAsFactors=FALSE)
x $ toc <- NULL
x <- x[[i]] # the poll data usually the first table, not counting "table" of contents
names(x) <- trim(names(x))
x[,1] <- trim(x[,1])
# If the second column is NA, then we have some sort of non-poll event
r_events <- is.na(x[2])
events <- unlist(str_split(x[r_events, 1], "\n"))
events <- str_split(events, opt_dashes, n=2)
Date <- sapply(events, `[`, 1)
Date <- as.Date(Date, "%d %b %Y")
Event <- sapply(events, `[`, 2)
Event <- trim(Event)
events <- data.frame(Date=Date, Event=Event, stringsAsFactors=FALSE)
# Polls are everything else, minus "heading" rows
r_polls <- !r_events & !(x$Date == "Date")
c_polls <- (names(x) != "")
polls <- x[r_polls, c_polls]
# Conversions
polls $ Poll [str_detect(polls $ Poll, "election")] <- "Election"
polls $ Poll <- as.factor(polls $ Poll)
polls $ Poll <- reorder(polls $ Poll, polls $ Poll, function(x) if(x[1] == "Election") 1 else 2)
polls $ Date <- as.character(polls $ Date)
for(col in names(polls)[-c(1,2)])
polls[col] <- suppressWarnings(as.numeric(polls[[col]])) # suppress conversion to NA warnings
# Return both data frames
attr(polls, "events") <- events
null0 <- function(x) if(is.null(x)) 0 else ifelse(is.na(x), 0, x)
# Combinations
polls[['Labour/Green']] <- rep(0, nrow(polls)) +
null0(polls$Labour) +
null0(polls$Green)
IM <- rep(0, nrow(polls)) +
null0(polls$Internet) +
null0(polls$Mana) +
null0(polls$`Internet Mana`)
IM <- ifelse(IM==0, NA, IM)
polls[['Internet/Mana']] <- IM
return(polls)
}
trim <- function(x) {
x <- str_replace_all(x, "\\[(.*?)\\]", "") # remove any Wikipedia footnotes, e.g. [1]
x <- str_trim(x) # trim leading/trailing spaces
return(x)
}
alpha <- function(x, alpha=1) rgb(t(col2rgb(x)), alpha=255*alpha, max=255)
date <- function(x) {
temp <- lapply(str_split(x, opt_dashes), rev) # Four different kinds of dashes used!
# Date that poll coverage ends
a <- sapply(temp, `[`, 1)
a <- str_replace(a, "c\\.", "")
a <- str_replace(a, "Released", "")
a <- str_trim(a)
date_end <- as.Date(a, "%d %b %Y")
# Date that poll coverage starts
b <- sapply(temp, `[`, 2)
b <- str_trim(b)
b <- str_split(b, opt_spaces) # Second space is a non-breaking space.
year_start <- as.numeric(sapply(b, `[`, 3))
month_start <- match(sapply(b, `[`, 2), month.name)
day_start <- as.numeric(sapply(b, `[`, 1))
date_start <- date_end
s <- !is.na(year_start); if(any(s)) date_start[s] <- update(date_start[s], years=year_start[s])
s <- !is.na(month_start); if(any(s)) date_start[s] <- update(date_start[s], months=month_start[s])
s <- !is.na(day_start); if(any(s)) date_start[s] <- update(date_start[s], days=day_start[s])
# Polls with only month and year
s <- is.na(date_start)
if(any(s)) {
date_start[s] <- as.Date(as.yearmon(a[s]))
date_end[s] <- date_start[s] + new_period(month=1) - new_period(day=1)
}
# Combine and return
date_mean <- as.Date(apply(cbind(date_start, date_end), 1, mean))
data.frame(orig=x, date_start, date_end, date_mean)
}
loop <- function(x, y=x) c(x, rev(y))
format_num <- function(x) formatC(x, digits=1, format="f")
format_date <- function(x) format(as.Date(x), "%d %B %Y")
format_date_range <- function(x, y) {
x_day <- day(x)
x_month <- month.name[month(x)]
x_year <- year(x)
y_day <- day(y)
y_month <- month.name[month(y)]
y_year <- year(y)
en <- "\u2013"
ifelse(
x_year != y_year,
str_c(x_day, x_month, x_year, en, y_day, y_month, y_year, sep=" "),
ifelse(
x_month != y_month,
str_c(x_day, x_month, en, y_day, y_month, y_year, sep=" "),
ifelse(
x_day != y_day,
str_c(x_day, en, y_day, y_month, y_year, sep=" "),
str_c(y_day, y_month, y_year, sep=" ")
)
)
)
}
embiggen <- function(node) {
for(i in seq_along(node)) {
xmlAttrs(node[[i]]) <- c(
`onmouseover` = "embiggen(this)",
`onmouseout` = "unembiggen(this)",
`onclick` = "click(this)"
)
}
}
link <- function(pt_node, sm_node, party, poll) {
id <- str_c(party, poll)
for(i in seq_along(pt_node)) {
xmlAttrs(pt_node[[i]]) <- c(
`id` = str_c(id, i),
`onmouseover` = sprintf("link('%s', %i)", id, length(pt_node)),
`onmouseout` = sprintf("unlink('%s', %i)", id, length(pt_node)),
`onclick` = sprintf("clink('%s', %i)", id, length(pt_node))
)
}
if(!is.null(sm_node)) {
xmlAttrs(sm_node) <- c(`id` = id)
modifyStyle(sm_node, `stroke-opacity`="0")
}
}
get_party_data <- function(p) {
s <- !is.na(poll_matrix[, p])
n <<- sum(s)
x <<- as.numeric(x_date[s])
x0 <<- poll_data $ date_start[s]
x1 <<- poll_data $ date_end[s]
y <<- poll_matrix[s, p]
z <<- poll_data $ Poll[s]
this_colour <<- party_ctl[p, "Colour"]
this_contrast <<- party_ctl[p, "Contrast"]
xs <<- seq(min(x), max(x), length=opt_curvepts)
new_X <<- data.frame(x=xs, z="Election")
s_elect <<- (z == "Election")
invisible()
}
###
##
## Collect the data.
##
###
data_list <- lapply(urls, get_data)
#load('datalist.rda')
#data_list[[5]] <- data.frame(Poll="3 News Reid Research", Date="25 May 2014", Labour=29.5, National=50.3, `NZ First`=5.6, Green=10.2, check.names=FALSE, stringsAsFactors=FALSE)
#attr(data_list[[5]], "events") <- data.frame(Date=as.Date(Sys.Date()), Event="Today")
poll_data <- Reduce(function(x, y) merge(x, y, all=TRUE), data_list)
event_data_list <- lapply(data_list, attr, "events")
event_data <- Reduce(function(x, y) merge(x, y, all=TRUE), event_data_list)
event_data <- event_data[!is.na(event_data $ Date), ]
poll_data <- cbind(poll_data, date(poll_data $ Date)) # Parse dates
poll_data <- poll_data[!is.na(poll_data $ date_mean), ] # Remove any polls where we can't parse a date
poll_data <- sort_df(poll_data, 'date_mean') # Sort from earliest to latest poll
# Restrict to selected dates
s_date <- (poll_data $ date_start >= opt_begin & poll_data $ date_end <= opt_end)
poll_data <- poll_data[s_date, ]
poll_data <- droplevels(poll_data) # Remove unused polling firms
incl_parties <- rownames(party_ctl)[party_ctl $ Include]
poll_matrix <- as.matrix(poll_data[, incl_parties]) # Restrict to parties of interest
x_date <- poll_data $ date_mean
###
##
## Analysis and plotting.
##
###
if(opt_svg) {
svg(opt_filename, width=opt_svgwidth, height=opt_svgheight)
}
if(opt_png && !opt_svg) {
png(opt_filename, width=opt_svgwidth*72, height=opt_svgheight*72)
#pdf(opt_filename, width=opt_svgwidth, height=opt_svgheight, bg='white')
}
svg_counter <- 1
svg_index <- list()
xw <- as.numeric(opt_end - opt_begin) / (opt_xwidth - 1)
xh <- if(opt_events) max(poll_matrix, na.rm=TRUE) / (opt_xheight - 1) else 0
# This sets up the plot region
matplot(
x_date, poll_matrix,
pch = NA,
main = "New Zealand General Election Polling",
xlim = c(min(x_date), max(x_date) + xw), xlab = "", xaxt = "n",
ylim = c(-xh, max(6, 1.1 * max(poll_matrix, na.rm=TRUE))), ylab = "Party Percentage (%)"
)
axis.Date(1, x_date)
abline(h=0); inc(svg_counter) <- 1
is_election <- (poll_data $ Poll == "Election")
abline(v=x_date[is_election]); inc(svg_counter) <- sum(is_election)
if(opt_thresh) {
abline(h=5, lty=2)
inc(svg_counter) <- 1
}
# Fit the Generalised Additive Model
a_list <- list()
b_list <- list()
cf_list <- list()
final <- character()
for(p in incl_parties) {
get_party_data(p)
##logit - add logit links
#y <- y / 100
if(opt_bc) {
a <- if(opt_adapt) {
gam(y ~ s(x, bs="ad") + z, weights=ifelse(z=="Election", opt_ewt, 1))
#gam(y ~ s(x, bs="ad") + z, weights=ifelse(z=="Election", opt_ewt, 1), family=gaussian(link="logit"))
} else {
gam(y ~ s(x) + z, weights=ifelse(z=="Election", opt_ewt, 1))
###gamm(y ~ s(x) + z)
#gam(y ~ s(x) + z, weights=ifelse(z=="Election", opt_ewt, 1), family=gaussian(link="logit"))
}
a_list[[p]] <- a
###a_list[[p]] <- a$gam
cf <- coef(a)
###cf <- coef(a $ gam)
names(cf)[1] <- "zElection"; cf[1] <- 0
cf_list[[p]] <- cf[str_c("z", levels(poll_data $ Poll)[-1])] # Keep track of bias estimates
names(cf_list[[p]]) <- levels(poll_data $ Poll)[-1]
}
else {
a <- if(opt_adapt) {
gam(y ~ s(x, bs="ad"), weights=ifelse(z=="Election", opt_ewt, 1))
#gam(y ~ s(x, bs="ad"), weights=ifelse(z=="Election", opt_ewt, 1), family=gaussian(link="logit"))
} else {
gam(y ~ s(x), weights=ifelse(z=="Election", opt_ewt, 1))
#gam(y ~ s(x), weights=ifelse(z=="Election", opt_ewt, 1), family=gaussian(link="logit"))
}
a_list[[p]] <- a
}
}
bias <- do.call(cbind, cf_list)
colnames(bias)[grep("Maori", colnames(bias))] <- "M\u0101ori"
# Plot spline curves
if(opt_plotcurves) for(p in incl_parties) {
get_party_data(p)
a <- a_list[[p]]
b <- predict(a, newdata=new_X, se=TRUE)
#b$fit <- 100 * expit(b$fit) ##logit
b_list[[p]] <- b
polygon(
loop(xs),
loop(b$fit) + 1.96 * loop(b$se.fit, -b$se.fit),
col = alpha(this_contrast, 1/2),
border = NA
)
lines(xs, b$fit, lwd=3, col=this_colour)
svg_index[[str_c("sm_err:", p)]] <- svg_counter; inc(svg_counter) <- 1
svg_index[[str_c("sm:", p)]] <- svg_counter; inc(svg_counter) <- 1
if(opt_curvett) {
points(xs, b$fit, col=alpha(this_colour, 0), bg=alpha(this_contrast, 0.1), pch=21)
svg_index[[str_c("smpts:", p)]] <- seq(svg_counter, length=opt_curvepts)
inc(svg_counter) <- opt_curvepts
}
}
# Plot data points
for(p in incl_parties) {
get_party_data(p)
if(opt_bc && opt_bcplot) {
cf <- cf_list[[p]]
y[!s_elect] <- y[!s_elect] - cf[as.character(z[!s_elect])] # Bias correction
}
n <- sum(!is.na(y))
if(opt_moe > 0) {
x_me <- ggplot2:::interleave(x, x, NA)
y_me <- ggplot2:::interleave(y-opt_moe, y+opt_moe, NA)
lines(x_me, y_me, col=this_colour)
inc(svg_counter) <- n
}
points(x, y, pch=21, cex=1, col=this_colour, bg=this_contrast)
svg_index[[str_c("pts:", p)]] <- seq(svg_counter, length=n); inc(svg_counter) <- n
}
# Uncorrected smooths
for(p in incl_parties) {
get_party_data(p)
b <- b_list[[p]]
if(opt_bc && opt_bccurves && !opt_bcplot) {
for(i in levels(droplevels(z))[-1]) {
lines(xs, b$fit + bias[i, p], lwd=2, col=alpha(this_colour, 1/2))
svg_index[[sprintf("psm:%s:%s", p, i)]] <- svg_counter; inc(svg_counter) <- 1
}
}
}
# Election results
for(p in incl_parties) {
get_party_data(p)
points(
x[s_elect], y[s_elect],
pch = 21, cex = 2,
col = party_ctl[p, "Colour"],
bg = party_ctl[p, "Contrast"]
)
n_elect <- sum(s_elect)
svg_index[[str_c("election:", p)]] <- seq(svg_counter, length=n_elect); inc(svg_counter) <- n_elect
}
# Plot final estimates
for(p in incl_parties) {
get_party_data(p)
b <- b_list[[p]]
final_x <- tail(x_date, 1)
final_fit <- tail(b $ fit, 1)
final_err <- 1.96 * tail(b $ se.fit, 1)
final[p] <- sprintf("%s \u00B1 %s%%", format_num(final_fit), format_num(final_err))
final_date <- tail(x_date, 1)
right_x <- par("usr")[2]
avail_w <- right_x - as.numeric(final_date)
test_str <- sprintf("%s \u00B1 %s%%_", format_num(800/9), format_num(80/9)) # "88.8 ± 8.8%_"
test_w <- strwidth(test_str, font=4)
text(
final_x, final_fit, final[p],
pos = 4, font = 4, xpd = TRUE,
col = this_colour,
cex = avail_w / test_w,
)
}
# Events
if(opt_events) {
s_date <- (event_data $ Date >= opt_begin & event_data $ Date <= opt_end)
event_data <- event_data[s_date, ]
points(event_data $ Date, rep(-c(0, 2, 4, 1, 3), length=nrow(event_data)) * (xh / 4), pch=21, cex=1.5, bg="#FC7928")
svg_index[["events"]] <- seq(svg_counter, length=nrow(event_data)); inc(svg_counter) <- nrow(event_data)
}
# Legend (SVG is broken for this, do it last)
if(opt_legend) {
party_col <- sapply(incl_parties, function(p) {get_party_data(p); this_colour})
party_bg <- sapply(incl_parties, function(p) {get_party_data(p); this_contrast})
legend("topleft", legend=incl_parties, pch=21, col=party_col, pt.bg=party_bg)
for(p in incl_parties) {
svg_index[[str_c("legend:", p)]] <- svg_counter
inc(svg_counter) <- 1
}
}
if(opt_svg || opt_png) dev.off()
# Bias estimates
if(opt_bc && opt_bcreport) print(round(bias, 1))
###
##
## Fancy SVG stuff.
##
###
if(opt_svg) {
doc <- xmlParse(opt_filename)
svg_points <- unlist(getPlotPoints(doc))
addChildren(xmlChildren(doc) $ svg, kids=list(newXMLNode(name="title", "New Zealand General Election Polling")), at=0)
addECMAScripts(doc, I('
function embiggen(path) {
d = path.getAttributeNS(null, "d").match(/-?\\d+(\\.\\d+)?/ig)
x = ( parseFloat(d[0]) + parseFloat(d[4]) ) / 4;
y = parseFloat(d[1]) / 2;
path.setAttributeNS(null, "transform", "scale(2) translate(-" + x + ", -" + y + ")");
}
function unembiggen(path) {
clicked = path.getAttributeNS(null, "desc")
if(clicked != "clicked") {
path.setAttributeNS(null, "transform", "scale(1)");
}
}
function link(desc, n) {
for(i = 1; i <= n; i++) {
path = document.getElementById(desc + i);
embiggen(path);
}
path = document.getElementById(desc);
bigline(path);
}
function unlink(desc, n) {
for(i = 1; i <= n; i++) {
path = document.getElementById(desc + i);
unembiggen(path);
}
path = document.getElementById(desc);
smallline(path);
}
function clink(desc, n) {
for(i =1; i <= n; i++) {
path = document.getElementById(desc + i);
click(path)
}
path = document.getElementById(desc);
click(path)
}
function click(path) {
desc = path.getAttributeNS(null, "desc")
if(desc == "clicked") {
path.setAttributeNS(null, "desc", "");
}
else {
path.setAttributeNS(null, "desc", "clicked");
}
}
function bigline(path) {
path.style.setProperty("stroke-opacity", "0.5");
}
function smallline(path) {
clicked = path.getAttributeNS(null, "desc")
if(clicked != "clicked") {
path.style.setProperty("stroke-opacity", "0");
}
}
'), at=1)
# Poll results
for(p in incl_parties) {
get_party_data(p)
poll_index <- svg_index[[str_c("pts:", p)]]
pt1 <- format_date_range(x0, x1)
pt2 <- z
pt3 <- sprintf("%s: %s%%", p, y)
poll_text <- str_c(pt1, pt2, pt3, sep=", \n")
addToolTips(doc, text=poll_text, path=svg_points[poll_index])
# for(i in levels(poll_data $ Poll)[-1]) {
for(i in levels(droplevels(z))[-1]) {
if(opt_bc && opt_bccurves && !opt_bcplot) {
sm_index <- svg_index[[sprintf("psm:%s:%s", p, i)]]
link(svg_points[poll_index[z==i]], svg_points[[sm_index]], p, i)
}
else {
link(svg_points[poll_index[z==i]], NULL, p, i)
}
}
elect_index <- svg_index[[str_c("election:", p)]]
et1 <- format_date(x[s_elect])
et2 <- "Election"
et3 <- sprintf("%s: %s%%", p, y[s_elect])
elect_text <- str_c(et1, et2, et3, sep="\n")
addToolTips(doc, text=elect_text, path=svg_points[elect_index])
embiggen(svg_points[elect_index])
if(opt_curvett) {
b <- b_list[[p]]
smpts_date <- format_date(xs)
smpts_pc <- str_c(round(b$fit, 1), '%')
smpts_text <- str_c(smpts_date, smpts_pc, sep='\n')
smpts_index <- svg_index[[str_c("smpts:", p)]]
addToolTips(doc, text=smpts_text, path=svg_points[smpts_index])
}
}
# Events
if(opt_events) {
event_index <- svg_index[["events"]]
event_text <- with(event_data, str_c(format_date(Date), "\n", Event))
addToolTips(doc, text=event_text, path=svg_points[event_index])
embiggen(svg_points[event_index])
}
saveXML(doc, opt_filename)
}
@pitakakariki
Copy link
Author

Testing

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment