Last active
August 7, 2023 13:04
-
-
Save vankesteren/6db89b3670816d5d28c74443e6c5624b to your computer and use it in GitHub Desktop.
Subsample and aggregate method from https://dl.acm.org/doi/pdf/10.1145/1993636.1993743 for estimation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Compute any function of single data vector in a differentially private way. | |
#' | |
#' The subsample-and-aggregate method applies the function to O(sqrt(n)) subsamples | |
#' of the data and then averages the function results using a differentially private | |
#' aggregation function called the "widened winsorized mean". | |
#' | |
#' @param x data, single vector | |
#' @param FUN function returning a single numeric value | |
#' @param epsilon differential privacy parameter | |
#' @param lower theoretical lower bound of interest for the function value | |
#' @param upper theoretical upper bound of interest for the function value | |
#' | |
#' @references Smith, A. (2011, June). Privacy-preserving statistical estimation with optimal convergence rates. | |
#' In Proceedings of the forty-third annual ACM symposium on Theory of computing (pp. 813-822). | |
#' https://dl.acm.org/doi/pdf/10.1145/1993636.1993743 | |
#' | |
#' @export | |
ssa_dp <- function(x, FUN, epsilon = 1e3, lower = -3, upper = 3) { | |
# Perform checks on data x | |
n <- length(x) | |
if (length(dim(x)) > 1) stop("x should be a single vector.") | |
if (n < 2) stop("x needs more than 1 observation.") | |
if (n < 10) warning("x has very few observations: ", n) | |
# Determine number of groups k for subsampling | |
eta <- 1e-1 # small positive constant | |
k <- round(n^(1/2 - eta)) # k = O(sqrt(n)) | |
# subsample | |
div <- floor(n / k) | |
rem <- n %% k | |
grp <- c(rep(seq_len(k), div), seq_len(rem)) | |
grp <- sample(grp) | |
ssx <- split(x, grp) | |
# apply function to list of subsamples | |
z <- vapply(ssx, FUN, numeric(1)) # output should be single number | |
# aggregate subsamples using widened winsorized mean | |
# algorithm 1 in Smith (2011) | |
rad <- k^(1/3) | |
a_hat <- DPpack::quantileDP(z, 1/4, epsilon/4, lower, upper) | |
b_hat <- DPpack::quantileDP(z, 3/4, epsilon/4, lower, upper) | |
mu_crude <- mean(a_hat, b_hat) | |
iqr_crude <- abs(b_hat - a_hat) | |
lb <- mu_crude - 4 * rad * iqr_crude | |
ub <- mu_crude + 4 * rad * iqr_crude | |
z[z < lb] <- lb | |
z[z > ub] <- ub | |
lap_noise <- rexp(1, 2 * epsilon * k / abs(ub - lb)) * sample(c(-1, 1), 1) * sqrt(2) | |
# custom addition: estimate should not be outside theoretical bounds | |
# todo: check whether this truncation violates DP in any way (I don't think so) | |
est <- mean(z) + lap_noise | |
if (est < lower) est <- lower | |
if (est > upper) est <- upper | |
return(est) | |
} | |
# subsample-and-aggregate method of estimating mean and sd of samples x | |
set.seed(45) | |
N <- 500 | |
mu <- -1.3 | |
sigma <- 1.9 | |
x <- rnorm(N, mu, sigma) | |
# set seed for reproducibility / demonstration, dp is probabilistic | |
set.seed(45) | |
ests <- matrix(c( | |
mean(x), sd(x), | |
ssa_dp(x, mean, e = 1e+9), ssa_dp(x, sd, e = 1e+9, lower = 0, upper = 6), | |
ssa_dp(x, mean, e = 1e+1), ssa_dp(x, sd, e = 1e+1, lower = 0, upper = 6), | |
ssa_dp(x, mean, e = 1e+0), ssa_dp(x, sd, e = 1e+0, lower = 0, upper = 6), | |
ssa_dp(x, mean, e = 1e-1), ssa_dp(x, sd, e = 1e-1, lower = 0, upper = 6), | |
ssa_dp(x, mean, e = 1e-2), ssa_dp(x, sd, e = 1e-2, lower = 0, upper = 6) | |
), nrow = 2) | |
# do the same but with contaminated data | |
x[N] <- 10000 | |
set.seed(45) | |
ests_cont <- matrix(c( | |
mean(x), sd(x), | |
ssa_dp(x, mean, e = 1e+9), ssa_dp(x, sd, e = 1e+9, lower = 0, upper = 6), | |
ssa_dp(x, mean, e = 1e+1), ssa_dp(x, sd, e = 1e+1, lower = 0, upper = 6), | |
ssa_dp(x, mean, e = 1e+0), ssa_dp(x, sd, e = 1e+0, lower = 0, upper = 6), | |
ssa_dp(x, mean, e = 1e-1), ssa_dp(x, sd, e = 1e-1, lower = 0, upper = 6), | |
ssa_dp(x, mean, e = 1e-2), ssa_dp(x, sd, e = 1e-2, lower = 0, upper = 6) | |
), nrow = 2) | |
# compare normal estimates to various DP estimates | |
results <- rbind(ests, ests_cont) | |
dimnames(results) <- list( | |
c("mean", "sd", "mean (contaminated)", "sd (contaminated)"), | |
c("Standard", "DP(1e9)", "DP(10)", "DP(1)", "DP(.1)", "DP(.01)") | |
) | |
round(results, 3) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Result: