Created
June 6, 2025 14:19
-
-
Save MichaelChirico/391c340d86c7a4659c1b5f1b2f38eb3f to your computer and use it in GitHub Desktop.
Code for running bootstraps of ranked-choice voting elections
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(ggplot2) | |
# Downloaded from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AMK8PJ | |
dir="/media/michael/ab3f2700-872c-4b29-95f2-9a700166bc52/dataverse_files" | |
run_rcv_election = function(ballots) { | |
# TODO(michaelchirico): there should be a way to safely avoid a full copy, is it worth it? | |
ballots = copy(ballots[, .(rank, candidate)]) | |
# _don't_ rely on ballots$voterid from input -- in bootstrap, under resampling, we will have | |
# duplicate voterids. That breaks the candidate[1L] logic below (only one vote will be | |
# recorded for each "clone" voter). So we redefine a bootstrap-robust voterid here. | |
# TODO(michaelchirico): this feels like the wrong way to do this, e.g. having the logic for | |
# bootstrapping leak into this function, and reling on rowid() to work correctly, which | |
# leaks the logic the other way (since it relies on melt() behavior in the data prep stage). | |
ballots[, voterid := rowid(rank)] | |
round = 1L | |
max_round = uniqueN(ballots$candidate) | |
progression = NULL | |
while (round <= max_round) { | |
# first remaining candidate on each ballot | |
round_ballots = ballots[!is.na(candidate), .(chosen_candidate = candidate[1L]), by = voterid] | |
round_tally = round_ballots[, .N, keyby=chosen_candidate] | |
# NB: sum(N) generally reduces as rounds progress -- those ballots with all ranked candidates | |
# eliminated by this round are effectively ignored. | |
round_tally[, c("round", "winner") := .(round, N >= sum(N)/2)] | |
progression = rbind(progression, round_tally) | |
if (any(round_tally$winner)) return(progression) | |
loser = round_tally[N == min(N), chosen_candidate] | |
ballots[candidate %chin% loser, candidate := NA_character_] | |
round = round + 1L | |
} | |
} | |
# There's a placeholder column named 'candidate' in one election (Boulder_11072023_Mayor); drop it. | |
maybe_drop_candidate = function(x) { | |
if ("candidate" %chin% names(x)) x[, candidate := NULL] | |
invisible(x) | |
} | |
# Annoyingly, the constituent datasets are inconsistent in how ballots are uniquely identified. | |
# The database is consistent in one-row-one-vote so we can just use row# = ID; here are the | |
# different styles of unique ID for the various files, including those with no voter ID field at all: | |
# - Single field: ballotid, cvrnumber, cast vote record, index | |
# - Two fields: ballot style, ballotid or cast vote record | |
# - Three fields: tabulator id, batch id, record id | |
# - No UID, has field names all matching ^(count|ballot style|precinct|source_file|rank[0-9]+)$ | |
set_voterid = function(x) { | |
x[, voterid := seq_len(.N)] | |
invisible(x) | |
} | |
# Similar to set_voterid, not every data set is structured the same. here, there were only | |
# two types of schema observed: | |
# - rank1, rank2, ..., rankN columns giving candidates ranked 1, ..., N | |
# - 1, 2, ..., N columns giving candidates ranked 1, ..., N | |
# Standardize on the latter. | |
set_ranks = function(x) { | |
if (length(rn <- grep("^[0-9]+", names(x)))) setnames(x, rn, paste0("rank", rn)) | |
invisible(x) | |
} | |
bootstrap_rcv_election = function(votes_csv, bootstrap_n = 1000L) { | |
desc = unlist(strsplit(gsub("[.]csv$", "", basename(votes_csv)), "_")) | |
desc = c(desc[1:2], paste(tail(desc, -2), collapse = "_")) | |
cat(sprintf( | |
"Running bootstrap analysis of election %s in %s, held %s; ", | |
desc[3L], desc[1L], as.Date(desc[2L], format="%m%d%Y") | |
)) | |
# TODO(michaelchirico): am I doing exactly the right thing with overvotes/undervotes/"skipped"? | |
ballots = votes_csv |> | |
fread(na.strings=c("overvote", "undervote", "skipped", "Undeclared")) | |
cat(sprintf("%d voters\n", nrow(ballots))) | |
ballots = ballots |> | |
# otherwise deal with Index vs. index across files | |
setnames(tolower) |> | |
maybe_drop_candidate() |> | |
set_voterid() |> | |
set_ranks() |> | |
melt(measure.vars = patterns("^rank"), variable.name="rank", value.name="candidate") | |
ballots[, rank := as.integer(gsub("^rank", "", rank))] | |
setkey(ballots, voterid, rank) | |
true_results = run_rcv_election(ballots) | |
true_results[, replica := 0L] | |
voters = unique(ballots$voterid) | |
ballots[.(sample(voters, replace=TRUE))] |> | |
run_rcv_election() |> | |
replicate(n = bootstrap_n, simplify = FALSE) |> | |
rbindlist(idcol='replica') |> | |
rbind(true_results) | |
} | |
plot_vote_densities = function(election) { | |
ggplot(election, aes(x = N, color = chosen_candidate, fill = chosen_candidate)) + | |
facet_wrap(~ round, scales = "free_x") + | |
geom_density(alpha = 0.5) + | |
labs(title = "Distribution of vote totals by Candidate and Round", | |
x = "Votes", y = "Density", color = "Candidate", fill = "Candidate") + | |
scale_x_log10() | |
} | |
elections = list.files(dir) |> | |
setdiff(c( | |
# TODO(michaelchirico): Something weird with these files, fread() returns | |
# what looks like the wrong data set? e.g. rank1 == 10005 which is not a candidate? | |
"Portland_06142022_SchoolboardAL.csv", | |
"Portland_06142022_Schoolboarddistrict5.csv", | |
NULL | |
)) | |
names(elections) = gsub("[.]csv$", "", elections) | |
bootstrap_elections = elections |> | |
# TODO(michaelchirico): parallelize | |
lapply(\(f) bootstrap_rcv_election(file.path(dir, f))) |> | |
rbindlist(idcol='election_id') | |
bootstrap_elections[, c("where", "date", "race") := { | |
l = strsplit(election_id, "_", fixed=TRUE) | |
idx = lengths(l) > 3L | |
l[idx] = lapply(l[idx], \(x) c(x[1:2], paste(tail(x, -2), collapse="_"))) | |
transpose(l) | |
}] | |
bootstrap_elections[, date := as.IDate(date, "%m%d%Y")] | |
setkey(bootstrap_elections, election_id) | |
# fwrite(bootstrap_elections, "~/bootstrap_rcv_elections.csv") | |
# Some examples of elections with multiple potential winners | |
plot_vote_densities(bootstrap_elections[.("Alaska_11082022_HouseDistrict11")][replica != 0]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment