Skip to content

Instantly share code, notes, and snippets.

@vankesteren
Last active August 7, 2023 13:04
Show Gist options
  • Save vankesteren/6db89b3670816d5d28c74443e6c5624b to your computer and use it in GitHub Desktop.
Save vankesteren/6db89b3670816d5d28c74443e6c5624b to your computer and use it in GitHub Desktop.
Subsample and aggregate method from https://dl.acm.org/doi/pdf/10.1145/1993636.1993743 for estimation
#' Compute any function of single data vector in a differentially private way.
#'
#' The subsample-and-aggregate method applies the function to O(sqrt(n)) subsamples
#' of the data and then averages the function results using a differentially private
#' aggregation function called the "widened winsorized mean".
#'
#' @param x data, single vector
#' @param FUN function returning a single numeric value
#' @param epsilon differential privacy parameter
#' @param lower theoretical lower bound of interest for the function value
#' @param upper theoretical upper bound of interest for the function value
#'
#' @references Smith, A. (2011, June). Privacy-preserving statistical estimation with optimal convergence rates.
#' In Proceedings of the forty-third annual ACM symposium on Theory of computing (pp. 813-822).
#' https://dl.acm.org/doi/pdf/10.1145/1993636.1993743
#'
#' @export
ssa_dp <- function(x, FUN, epsilon = 1e3, lower = -3, upper = 3) {
# Perform checks on data x
n <- length(x)
if (length(dim(x)) > 1) stop("x should be a single vector.")
if (n < 2) stop("x needs more than 1 observation.")
if (n < 10) warning("x has very few observations: ", n)
# Determine number of groups k for subsampling
eta <- 1e-1 # small positive constant
k <- round(n^(1/2 - eta)) # k = O(sqrt(n))
# subsample
div <- floor(n / k)
rem <- n %% k
grp <- c(rep(seq_len(k), div), seq_len(rem))
grp <- sample(grp)
ssx <- split(x, grp)
# apply function to list of subsamples
z <- vapply(ssx, FUN, numeric(1)) # output should be single number
# aggregate subsamples using widened winsorized mean
# algorithm 1 in Smith (2011)
rad <- k^(1/3)
a_hat <- DPpack::quantileDP(z, 1/4, epsilon/4, lower, upper)
b_hat <- DPpack::quantileDP(z, 3/4, epsilon/4, lower, upper)
mu_crude <- mean(a_hat, b_hat)
iqr_crude <- abs(b_hat - a_hat)
lb <- mu_crude - 4 * rad * iqr_crude
ub <- mu_crude + 4 * rad * iqr_crude
z[z < lb] <- lb
z[z > ub] <- ub
lap_noise <- rexp(1, 2 * epsilon * k / abs(ub - lb)) * sample(c(-1, 1), 1) * sqrt(2)
# custom addition: estimate should not be outside theoretical bounds
# todo: check whether this truncation violates DP in any way (I don't think so)
est <- mean(z) + lap_noise
if (est < lower) est <- lower
if (est > upper) est <- upper
return(est)
}
# subsample-and-aggregate method of estimating mean and sd of samples x
set.seed(45)
N <- 500
mu <- -1.3
sigma <- 1.9
x <- rnorm(N, mu, sigma)
# set seed for reproducibility / demonstration, dp is probabilistic
set.seed(45)
ests <- matrix(c(
mean(x), sd(x),
ssa_dp(x, mean, e = 1e+9), ssa_dp(x, sd, e = 1e+9, lower = 0, upper = 6),
ssa_dp(x, mean, e = 1e+1), ssa_dp(x, sd, e = 1e+1, lower = 0, upper = 6),
ssa_dp(x, mean, e = 1e+0), ssa_dp(x, sd, e = 1e+0, lower = 0, upper = 6),
ssa_dp(x, mean, e = 1e-1), ssa_dp(x, sd, e = 1e-1, lower = 0, upper = 6),
ssa_dp(x, mean, e = 1e-2), ssa_dp(x, sd, e = 1e-2, lower = 0, upper = 6)
), nrow = 2)
# do the same but with contaminated data
x[N] <- 10000
set.seed(45)
ests_cont <- matrix(c(
mean(x), sd(x),
ssa_dp(x, mean, e = 1e+9), ssa_dp(x, sd, e = 1e+9, lower = 0, upper = 6),
ssa_dp(x, mean, e = 1e+1), ssa_dp(x, sd, e = 1e+1, lower = 0, upper = 6),
ssa_dp(x, mean, e = 1e+0), ssa_dp(x, sd, e = 1e+0, lower = 0, upper = 6),
ssa_dp(x, mean, e = 1e-1), ssa_dp(x, sd, e = 1e-1, lower = 0, upper = 6),
ssa_dp(x, mean, e = 1e-2), ssa_dp(x, sd, e = 1e-2, lower = 0, upper = 6)
), nrow = 2)
# compare normal estimates to various DP estimates
results <- rbind(ests, ests_cont)
dimnames(results) <- list(
c("mean", "sd", "mean (contaminated)", "sd (contaminated)"),
c("Standard", "DP(1e9)", "DP(10)", "DP(1)", "DP(.1)", "DP(.01)")
)
round(results, 3)
@vankesteren
Copy link
Author

vankesteren commented Aug 7, 2023

Result:

                    Standard DP(1e9) DP(10)  DP(1) DP(.1) DP(.01)
mean                  -1.402  -1.402 -0.318 -0.922      3      -3
sd                     1.950   1.950  1.839  1.730      0       6
mean (contaminated)   18.599  -0.947  2.725 -0.463      3      -3
sd   (contaminated)  447.281   2.251  3.374  2.489      0       6

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment