MichaelChirico · June 6, 2025 14:19
diff --git a/rcv_bootstrap.R b/rcv_bootstrap.R
 library(data.table)
 library(ggplot2)

 # Downloaded from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AMK8PJ
 dir="/media/michael/ab3f2700-872c-4b29-95f2-9a700166bc52/dataverse_files"

 run_rcv_election = function(ballots) {
  # TODO(michaelchirico): there should be a way to safely avoid a full copy, is it worth it?
  ballots = copy(ballots[, .(rank, candidate)])
  # _don't_ rely on ballots$voterid from input -- in bootstrap, under resampling, we will have
  #   duplicate voterids. That breaks the candidate[1L] logic below (only one vote will be
  #   recorded for each "clone" voter). So we redefine a bootstrap-robust voterid here.
  # TODO(michaelchirico): this feels like the wrong way to do this, e.g. having the logic for
  #   bootstrapping leak into this function, and reling on rowid() to work correctly, which
  #   leaks the logic the other way (since it relies on melt() behavior in the data prep stage).
  ballots[, voterid := rowid(rank)]
  round = 1L
  max_round = uniqueN(ballots$candidate)
  progression = NULL
  
  while (round <= max_round) {
    # first remaining candidate on each ballot
    round_ballots = ballots[!is.na(candidate), .(chosen_candidate = candidate[1L]), by = voterid]
    round_tally = round_ballots[, .N, keyby=chosen_candidate]
    # NB: sum(N) generally reduces as rounds progress -- those ballots with all ranked candidates
    #   eliminated by this round are effectively ignored.
    round_tally[, c("round", "winner") := .(round, N >= sum(N)/2)]
    progression = rbind(progression, round_tally)

    if (any(round_tally$winner)) return(progression)

    loser = round_tally[N == min(N), chosen_candidate]
    ballots[candidate %chin% loser, candidate := NA_character_]
    round = round + 1L
  }
 }

 # There's a placeholder column named 'candidate' in one election (Boulder_11072023_Mayor); drop it.
 maybe_drop_candidate = function(x) {
  if ("candidate" %chin% names(x)) x[, candidate := NULL]
  invisible(x)
 }

 # Annoyingly, the constituent datasets are inconsistent in how ballots are uniquely identified.
 #   The database is consistent in one-row-one-vote so we can just use row# = ID; here are the
 #   different styles of unique ID for the various files, including those with no voter ID field at all:
 #  - Single field: ballotid, cvrnumber, cast vote record, index
 #  - Two fields: ballot style, ballotid or cast vote record
 #  - Three fields: tabulator id, batch id, record id
 #  - No UID, has field names all matching ^(count|ballot style|precinct|source_file|rank[0-9]+)$
 set_voterid = function(x) {
  x[, voterid := seq_len(.N)]
  invisible(x)
 }

 # Similar to set_voterid, not every data set is structured the same. here, there were only
 #   two types of schema observed:
 #  - rank1, rank2, ..., rankN columns giving candidates ranked 1, ..., N
 #  - 1, 2, ..., N columns giving candidates ranked 1, ..., N
 # Standardize on the latter.
 set_ranks = function(x) {
  if (length(rn <- grep("^[0-9]+", names(x)))) setnames(x, rn, paste0("rank", rn))
  invisible(x)
 }

 bootstrap_rcv_election = function(votes_csv, bootstrap_n = 1000L) {
  desc = unlist(strsplit(gsub("[.]csv$", "", basename(votes_csv)), "_"))
  desc = c(desc[1:2], paste(tail(desc, -2), collapse = "_"))
  cat(sprintf(
    "Running bootstrap analysis of election %s in %s, held %s; ",
    desc[3L], desc[1L], as.Date(desc[2L], format="%m%d%Y")
  ))
  # TODO(michaelchirico): am I doing exactly the right thing with overvotes/undervotes/"skipped"?
  ballots = votes_csv |>
    fread(na.strings=c("overvote", "undervote", "skipped", "Undeclared"))
  cat(sprintf("%d voters\n", nrow(ballots)))
  ballots = ballots |>
    # otherwise deal with Index vs. index across files
    setnames(tolower) |>
    maybe_drop_candidate() |>
    set_voterid() |>
    set_ranks() |>
    melt(measure.vars = patterns("^rank"), variable.name="rank", value.name="candidate")
  ballots[, rank := as.integer(gsub("^rank", "", rank))]
  setkey(ballots, voterid, rank)

  true_results = run_rcv_election(ballots)
  true_results[, replica := 0L]

  voters = unique(ballots$voterid)
  ballots[.(sample(voters, replace=TRUE))] |>
    run_rcv_election() |>
    replicate(n = bootstrap_n, simplify = FALSE) |>
    rbindlist(idcol='replica') |>
    rbind(true_results)
 }

 plot_vote_densities = function(election) {
  ggplot(election, aes(x = N, color = chosen_candidate, fill = chosen_candidate)) +
    facet_wrap(~ round, scales = "free_x") +
    geom_density(alpha = 0.5) +
    labs(title = "Distribution of vote totals by Candidate and Round",
         x = "Votes", y = "Density", color = "Candidate", fill = "Candidate") +
    scale_x_log10()
 }

 elections = list.files(dir) |>
  setdiff(c(
    # TODO(michaelchirico): Something weird with these files, fread() returns
    #   what looks like the wrong data set? e.g. rank1 == 10005 which is not a candidate?
    "Portland_06142022_SchoolboardAL.csv",
    "Portland_06142022_Schoolboarddistrict5.csv",
    NULL
  ))
 names(elections) = gsub("[.]csv$", "", elections)
 bootstrap_elections = elections |>
  # TODO(michaelchirico): parallelize
  lapply(\(f) bootstrap_rcv_election(file.path(dir, f))) |>
  rbindlist(idcol='election_id')
 bootstrap_elections[, c("where", "date", "race") := {
  l = strsplit(election_id, "_", fixed=TRUE)
  idx = lengths(l) > 3L
  l[idx] = lapply(l[idx], \(x) c(x[1:2], paste(tail(x, -2), collapse="_")))
  transpose(l)
 }]
 bootstrap_elections[, date := as.IDate(date, "%m%d%Y")]
 setkey(bootstrap_elections, election_id)

 # fwrite(bootstrap_elections, "~/bootstrap_rcv_elections.csv")

 # Some examples of elections with multiple potential winners
 plot_vote_densities(bootstrap_elections[.("Alaska_11082022_HouseDistrict11")][replica != 0])
	library(data.table)
	library(ggplot2)

	# Downloaded from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AMK8PJ
	dir="/media/michael/ab3f2700-872c-4b29-95f2-9a700166bc52/dataverse_files"

	run_rcv_election = function(ballots) {
	# TODO(michaelchirico): there should be a way to safely avoid a full copy, is it worth it?
	ballots = copy(ballots[, .(rank, candidate)])
	# _don't_ rely on ballots$voterid from input -- in bootstrap, under resampling, we will have
	# duplicate voterids. That breaks the candidate[1L] logic below (only one vote will be
	# recorded for each "clone" voter). So we redefine a bootstrap-robust voterid here.
	# TODO(michaelchirico): this feels like the wrong way to do this, e.g. having the logic for
	# bootstrapping leak into this function, and reling on rowid() to work correctly, which
	# leaks the logic the other way (since it relies on melt() behavior in the data prep stage).
	ballots[, voterid := rowid(rank)]
	round = 1L
	max_round = uniqueN(ballots$candidate)
	progression = NULL

	while (round <= max_round) {
	# first remaining candidate on each ballot
	round_ballots = ballots[!is.na(candidate), .(chosen_candidate = candidate[1L]), by = voterid]
	round_tally = round_ballots[, .N, keyby=chosen_candidate]
	# NB: sum(N) generally reduces as rounds progress -- those ballots with all ranked candidates
	# eliminated by this round are effectively ignored.
	round_tally[, c("round", "winner") := .(round, N >= sum(N)/2)]
	progression = rbind(progression, round_tally)

	if (any(round_tally$winner)) return(progression)

	loser = round_tally[N == min(N), chosen_candidate]
	ballots[candidate %chin% loser, candidate := NA_character_]
	round = round + 1L
	}
	}

	# There's a placeholder column named 'candidate' in one election (Boulder_11072023_Mayor); drop it.
	maybe_drop_candidate = function(x) {
	if ("candidate" %chin% names(x)) x[, candidate := NULL]
	invisible(x)
	}

	# Annoyingly, the constituent datasets are inconsistent in how ballots are uniquely identified.
	# The database is consistent in one-row-one-vote so we can just use row# = ID; here are the
	# different styles of unique ID for the various files, including those with no voter ID field at all:
	# - Single field: ballotid, cvrnumber, cast vote record, index
	# - Two fields: ballot style, ballotid or cast vote record
	# - Three fields: tabulator id, batch id, record id
	# - No UID, has field names all matching ^(count\|ballot style\|precinct\|source_file\|rank[0-9]+)$
	set_voterid = function(x) {
	x[, voterid := seq_len(.N)]
	invisible(x)
	}

	# Similar to set_voterid, not every data set is structured the same. here, there were only
	# two types of schema observed:
	# - rank1, rank2, ..., rankN columns giving candidates ranked 1, ..., N
	# - 1, 2, ..., N columns giving candidates ranked 1, ..., N
	# Standardize on the latter.
	set_ranks = function(x) {
	if (length(rn <- grep("^[0-9]+", names(x)))) setnames(x, rn, paste0("rank", rn))
	invisible(x)
	}

	bootstrap_rcv_election = function(votes_csv, bootstrap_n = 1000L) {
	desc = unlist(strsplit(gsub("[.]csv$", "", basename(votes_csv)), "_"))
	desc = c(desc[1:2], paste(tail(desc, -2), collapse = "_"))
	cat(sprintf(
	"Running bootstrap analysis of election %s in %s, held %s; ",
	desc[3L], desc[1L], as.Date(desc[2L], format="%m%d%Y")
	))
	# TODO(michaelchirico): am I doing exactly the right thing with overvotes/undervotes/"skipped"?
	ballots = votes_csv \|>
	fread(na.strings=c("overvote", "undervote", "skipped", "Undeclared"))
	cat(sprintf("%d voters\n", nrow(ballots)))
	ballots = ballots \|>
	# otherwise deal with Index vs. index across files
	setnames(tolower) \|>
	maybe_drop_candidate() \|>
	set_voterid() \|>
	set_ranks() \|>
	melt(measure.vars = patterns("^rank"), variable.name="rank", value.name="candidate")
	ballots[, rank := as.integer(gsub("^rank", "", rank))]
	setkey(ballots, voterid, rank)

	true_results = run_rcv_election(ballots)
	true_results[, replica := 0L]

	voters = unique(ballots$voterid)
	ballots[.(sample(voters, replace=TRUE))] \|>
	run_rcv_election() \|>
	replicate(n = bootstrap_n, simplify = FALSE) \|>
	rbindlist(idcol='replica') \|>
	rbind(true_results)
	}

	plot_vote_densities = function(election) {
	ggplot(election, aes(x = N, color = chosen_candidate, fill = chosen_candidate)) +
	facet_wrap(~ round, scales = "free_x") +
	geom_density(alpha = 0.5) +
	labs(title = "Distribution of vote totals by Candidate and Round",
	x = "Votes", y = "Density", color = "Candidate", fill = "Candidate") +
	scale_x_log10()
	}

	elections = list.files(dir) \|>
	setdiff(c(
	# TODO(michaelchirico): Something weird with these files, fread() returns
	# what looks like the wrong data set? e.g. rank1 == 10005 which is not a candidate?
	"Portland_06142022_SchoolboardAL.csv",
	"Portland_06142022_Schoolboarddistrict5.csv",
	NULL
	))
	names(elections) = gsub("[.]csv$", "", elections)
	bootstrap_elections = elections \|>
	# TODO(michaelchirico): parallelize
	lapply(\(f) bootstrap_rcv_election(file.path(dir, f))) \|>
	rbindlist(idcol='election_id')
	bootstrap_elections[, c("where", "date", "race") := {
	l = strsplit(election_id, "_", fixed=TRUE)
	idx = lengths(l) > 3L
	l[idx] = lapply(l[idx], \(x) c(x[1:2], paste(tail(x, -2), collapse="_")))
	transpose(l)
	}]
	bootstrap_elections[, date := as.IDate(date, "%m%d%Y")]
	setkey(bootstrap_elections, election_id)

	# fwrite(bootstrap_elections, "~/bootstrap_rcv_elections.csv")

	# Some examples of elections with multiple potential winners
	plot_vote_densities(bootstrap_elections[.("Alaska_11082022_HouseDistrict11")][replica != 0])