Skip to content

Instantly share code, notes, and snippets.

@MichaelChirico
Created June 6, 2025 14:19
Show Gist options
  • Save MichaelChirico/391c340d86c7a4659c1b5f1b2f38eb3f to your computer and use it in GitHub Desktop.
Save MichaelChirico/391c340d86c7a4659c1b5f1b2f38eb3f to your computer and use it in GitHub Desktop.
Code for running bootstraps of ranked-choice voting elections
library(data.table)
library(ggplot2)
# Downloaded from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AMK8PJ
dir="/media/michael/ab3f2700-872c-4b29-95f2-9a700166bc52/dataverse_files"
run_rcv_election = function(ballots) {
# TODO(michaelchirico): there should be a way to safely avoid a full copy, is it worth it?
ballots = copy(ballots[, .(rank, candidate)])
# _don't_ rely on ballots$voterid from input -- in bootstrap, under resampling, we will have
# duplicate voterids. That breaks the candidate[1L] logic below (only one vote will be
# recorded for each "clone" voter). So we redefine a bootstrap-robust voterid here.
# TODO(michaelchirico): this feels like the wrong way to do this, e.g. having the logic for
# bootstrapping leak into this function, and reling on rowid() to work correctly, which
# leaks the logic the other way (since it relies on melt() behavior in the data prep stage).
ballots[, voterid := rowid(rank)]
round = 1L
max_round = uniqueN(ballots$candidate)
progression = NULL
while (round <= max_round) {
# first remaining candidate on each ballot
round_ballots = ballots[!is.na(candidate), .(chosen_candidate = candidate[1L]), by = voterid]
round_tally = round_ballots[, .N, keyby=chosen_candidate]
# NB: sum(N) generally reduces as rounds progress -- those ballots with all ranked candidates
# eliminated by this round are effectively ignored.
round_tally[, c("round", "winner") := .(round, N >= sum(N)/2)]
progression = rbind(progression, round_tally)
if (any(round_tally$winner)) return(progression)
loser = round_tally[N == min(N), chosen_candidate]
ballots[candidate %chin% loser, candidate := NA_character_]
round = round + 1L
}
}
# There's a placeholder column named 'candidate' in one election (Boulder_11072023_Mayor); drop it.
maybe_drop_candidate = function(x) {
if ("candidate" %chin% names(x)) x[, candidate := NULL]
invisible(x)
}
# Annoyingly, the constituent datasets are inconsistent in how ballots are uniquely identified.
# The database is consistent in one-row-one-vote so we can just use row# = ID; here are the
# different styles of unique ID for the various files, including those with no voter ID field at all:
# - Single field: ballotid, cvrnumber, cast vote record, index
# - Two fields: ballot style, ballotid or cast vote record
# - Three fields: tabulator id, batch id, record id
# - No UID, has field names all matching ^(count|ballot style|precinct|source_file|rank[0-9]+)$
set_voterid = function(x) {
x[, voterid := seq_len(.N)]
invisible(x)
}
# Similar to set_voterid, not every data set is structured the same. here, there were only
# two types of schema observed:
# - rank1, rank2, ..., rankN columns giving candidates ranked 1, ..., N
# - 1, 2, ..., N columns giving candidates ranked 1, ..., N
# Standardize on the latter.
set_ranks = function(x) {
if (length(rn <- grep("^[0-9]+", names(x)))) setnames(x, rn, paste0("rank", rn))
invisible(x)
}
bootstrap_rcv_election = function(votes_csv, bootstrap_n = 1000L) {
desc = unlist(strsplit(gsub("[.]csv$", "", basename(votes_csv)), "_"))
desc = c(desc[1:2], paste(tail(desc, -2), collapse = "_"))
cat(sprintf(
"Running bootstrap analysis of election %s in %s, held %s; ",
desc[3L], desc[1L], as.Date(desc[2L], format="%m%d%Y")
))
# TODO(michaelchirico): am I doing exactly the right thing with overvotes/undervotes/"skipped"?
ballots = votes_csv |>
fread(na.strings=c("overvote", "undervote", "skipped", "Undeclared"))
cat(sprintf("%d voters\n", nrow(ballots)))
ballots = ballots |>
# otherwise deal with Index vs. index across files
setnames(tolower) |>
maybe_drop_candidate() |>
set_voterid() |>
set_ranks() |>
melt(measure.vars = patterns("^rank"), variable.name="rank", value.name="candidate")
ballots[, rank := as.integer(gsub("^rank", "", rank))]
setkey(ballots, voterid, rank)
true_results = run_rcv_election(ballots)
true_results[, replica := 0L]
voters = unique(ballots$voterid)
ballots[.(sample(voters, replace=TRUE))] |>
run_rcv_election() |>
replicate(n = bootstrap_n, simplify = FALSE) |>
rbindlist(idcol='replica') |>
rbind(true_results)
}
plot_vote_densities = function(election) {
ggplot(election, aes(x = N, color = chosen_candidate, fill = chosen_candidate)) +
facet_wrap(~ round, scales = "free_x") +
geom_density(alpha = 0.5) +
labs(title = "Distribution of vote totals by Candidate and Round",
x = "Votes", y = "Density", color = "Candidate", fill = "Candidate") +
scale_x_log10()
}
elections = list.files(dir) |>
setdiff(c(
# TODO(michaelchirico): Something weird with these files, fread() returns
# what looks like the wrong data set? e.g. rank1 == 10005 which is not a candidate?
"Portland_06142022_SchoolboardAL.csv",
"Portland_06142022_Schoolboarddistrict5.csv",
NULL
))
names(elections) = gsub("[.]csv$", "", elections)
bootstrap_elections = elections |>
# TODO(michaelchirico): parallelize
lapply(\(f) bootstrap_rcv_election(file.path(dir, f))) |>
rbindlist(idcol='election_id')
bootstrap_elections[, c("where", "date", "race") := {
l = strsplit(election_id, "_", fixed=TRUE)
idx = lengths(l) > 3L
l[idx] = lapply(l[idx], \(x) c(x[1:2], paste(tail(x, -2), collapse="_")))
transpose(l)
}]
bootstrap_elections[, date := as.IDate(date, "%m%d%Y")]
setkey(bootstrap_elections, election_id)
# fwrite(bootstrap_elections, "~/bootstrap_rcv_elections.csv")
# Some examples of elections with multiple potential winners
plot_vote_densities(bootstrap_elections[.("Alaska_11082022_HouseDistrict11")][replica != 0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment