vankesteren · August 7, 2023 13:04 · vankesteren · Aug 7, 2023
diff --git a/subsample_and_aggregate.R b/subsample_and_aggregate.R
 #' Compute any function of single data vector in a differentially private way.
 #' 
 #' The subsample-and-aggregate method applies the function to O(sqrt(n)) subsamples
 #' of the data and then averages the function results using a differentially private
 #' aggregation function called the "widened winsorized mean".
 #' 
 #' @param x data, single vector
 #' @param FUN function returning a single numeric value
 #' @param epsilon differential privacy parameter
 #' @param lower theoretical lower bound of interest for the function value
 #' @param upper theoretical upper bound of interest for the function value
 #' 
 #' @references Smith, A. (2011, June). Privacy-preserving statistical estimation with optimal convergence rates. 
 #' In Proceedings of the forty-third annual ACM symposium on Theory of computing (pp. 813-822). 
 #' https://dl.acm.org/doi/pdf/10.1145/1993636.1993743
 #' 
 #' @export
 ssa_dp <- function(x, FUN, epsilon = 1e3, lower = -3, upper = 3) {
  # Perform checks on data x
  n <- length(x)
  if (length(dim(x)) > 1) stop("x should be a single vector.")
  if (n < 2) stop("x needs more than 1 observation.")
  if (n < 10) warning("x has very few observations: ", n)
  
  # Determine number of groups k for subsampling
  eta <- 1e-1                   # small positive constant
  k   <- round(n^(1/2 - eta))   # k = O(sqrt(n))
  
  # subsample
  div <- floor(n / k)
  rem <- n %% k
  grp <- c(rep(seq_len(k), div), seq_len(rem)) 
  grp <- sample(grp)
  ssx <- split(x, grp)
  
  # apply function to list of subsamples
  z   <- vapply(ssx, FUN, numeric(1)) # output should be single number
  
  # aggregate subsamples using widened winsorized mean
  # algorithm 1 in Smith (2011)
  rad       <- k^(1/3)
  a_hat     <- DPpack::quantileDP(z, 1/4, epsilon/4, lower, upper)
  b_hat     <- DPpack::quantileDP(z, 3/4, epsilon/4, lower, upper)
  mu_crude  <- mean(a_hat, b_hat)
  iqr_crude <- abs(b_hat - a_hat)
  lb        <- mu_crude - 4 * rad * iqr_crude
  ub        <- mu_crude + 4 * rad * iqr_crude
  z[z < lb] <- lb
  z[z > ub] <- ub
  lap_noise <- rexp(1, 2 * epsilon * k / abs(ub - lb)) * sample(c(-1, 1), 1) * sqrt(2)
  
  # custom addition: estimate should not be outside theoretical bounds
  # todo: check whether this truncation violates DP in any way (I don't think so)
  est <- mean(z) + lap_noise
  if (est < lower) est <- lower
  if (est > upper) est <- upper
  return(est)
 }

 # subsample-and-aggregate method of estimating mean and sd of samples x
 set.seed(45)
 N     <- 500
 mu    <- -1.3
 sigma <- 1.9
 x     <- rnorm(N, mu, sigma)

 # set seed for reproducibility / demonstration, dp is probabilistic
 set.seed(45)
 ests <- matrix(c(
  mean(x),                   sd(x), 
  ssa_dp(x, mean, e = 1e+9), ssa_dp(x, sd, e = 1e+9, lower = 0, upper = 6), 
  ssa_dp(x, mean, e = 1e+1), ssa_dp(x, sd, e = 1e+1, lower = 0, upper = 6), 
  ssa_dp(x, mean, e = 1e+0), ssa_dp(x, sd, e = 1e+0, lower = 0, upper = 6), 
  ssa_dp(x, mean, e = 1e-1), ssa_dp(x, sd, e = 1e-1, lower = 0, upper = 6), 
  ssa_dp(x, mean, e = 1e-2), ssa_dp(x, sd, e = 1e-2, lower = 0, upper = 6)
 ), nrow = 2)

 # do the same but with contaminated data 
 x[N] <- 10000
 set.seed(45)
 ests_cont <- matrix(c(
  mean(x),                   sd(x), 
  ssa_dp(x, mean, e = 1e+9), ssa_dp(x, sd, e = 1e+9, lower = 0, upper = 6), 
  ssa_dp(x, mean, e = 1e+1), ssa_dp(x, sd, e = 1e+1, lower = 0, upper = 6), 
  ssa_dp(x, mean, e = 1e+0), ssa_dp(x, sd, e = 1e+0, lower = 0, upper = 6), 
  ssa_dp(x, mean, e = 1e-1), ssa_dp(x, sd, e = 1e-1, lower = 0, upper = 6), 
  ssa_dp(x, mean, e = 1e-2), ssa_dp(x, sd, e = 1e-2, lower = 0, upper = 6)
 ), nrow = 2)


 # compare normal estimates to various DP estimates
 results <- rbind(ests, ests_cont)
 dimnames(results) <- list(
  c("mean", "sd", "mean (contaminated)", "sd   (contaminated)"), 
  c("Standard", "DP(1e9)", "DP(10)", "DP(1)", "DP(.1)", "DP(.01)")
 )
 round(results, 3)
	#' Compute any function of single data vector in a differentially private way.
	#'
	#' The subsample-and-aggregate method applies the function to O(sqrt(n)) subsamples
	#' of the data and then averages the function results using a differentially private
	#' aggregation function called the "widened winsorized mean".
	#'
	#' @param x data, single vector
	#' @param FUN function returning a single numeric value
	#' @param epsilon differential privacy parameter
	#' @param lower theoretical lower bound of interest for the function value
	#' @param upper theoretical upper bound of interest for the function value
	#'
	#' @references Smith, A. (2011, June). Privacy-preserving statistical estimation with optimal convergence rates.
	#' In Proceedings of the forty-third annual ACM symposium on Theory of computing (pp. 813-822).
	#' https://dl.acm.org/doi/pdf/10.1145/1993636.1993743
	#'
	#' @export
	ssa_dp <- function(x, FUN, epsilon = 1e3, lower = -3, upper = 3) {
	# Perform checks on data x
	n <- length(x)
	if (length(dim(x)) > 1) stop("x should be a single vector.")
	if (n < 2) stop("x needs more than 1 observation.")
	if (n < 10) warning("x has very few observations: ", n)

	# Determine number of groups k for subsampling
	eta <- 1e-1 # small positive constant
	k <- round(n^(1/2 - eta)) # k = O(sqrt(n))

	# subsample
	div <- floor(n / k)
	rem <- n %% k
	grp <- c(rep(seq_len(k), div), seq_len(rem))
	grp <- sample(grp)
	ssx <- split(x, grp)

	# apply function to list of subsamples
	z <- vapply(ssx, FUN, numeric(1)) # output should be single number

	# aggregate subsamples using widened winsorized mean
	# algorithm 1 in Smith (2011)
	rad <- k^(1/3)
	a_hat <- DPpack::quantileDP(z, 1/4, epsilon/4, lower, upper)
	b_hat <- DPpack::quantileDP(z, 3/4, epsilon/4, lower, upper)
	mu_crude <- mean(a_hat, b_hat)
	iqr_crude <- abs(b_hat - a_hat)
	lb <- mu_crude - 4 * rad * iqr_crude
	ub <- mu_crude + 4 * rad * iqr_crude
	z[z < lb] <- lb
	z[z > ub] <- ub
	lap_noise <- rexp(1, 2 * epsilon * k / abs(ub - lb)) * sample(c(-1, 1), 1) * sqrt(2)

	# custom addition: estimate should not be outside theoretical bounds
	# todo: check whether this truncation violates DP in any way (I don't think so)
	est <- mean(z) + lap_noise
	if (est < lower) est <- lower
	if (est > upper) est <- upper
	return(est)
	}

	# subsample-and-aggregate method of estimating mean and sd of samples x
	set.seed(45)
	N <- 500
	mu <- -1.3
	sigma <- 1.9
	x <- rnorm(N, mu, sigma)

	# set seed for reproducibility / demonstration, dp is probabilistic
	set.seed(45)
	ests <- matrix(c(
	mean(x), sd(x),
	ssa_dp(x, mean, e = 1e+9), ssa_dp(x, sd, e = 1e+9, lower = 0, upper = 6),
	ssa_dp(x, mean, e = 1e+1), ssa_dp(x, sd, e = 1e+1, lower = 0, upper = 6),
	ssa_dp(x, mean, e = 1e+0), ssa_dp(x, sd, e = 1e+0, lower = 0, upper = 6),
	ssa_dp(x, mean, e = 1e-1), ssa_dp(x, sd, e = 1e-1, lower = 0, upper = 6),
	ssa_dp(x, mean, e = 1e-2), ssa_dp(x, sd, e = 1e-2, lower = 0, upper = 6)
	), nrow = 2)

	# do the same but with contaminated data
	x[N] <- 10000
	set.seed(45)
	ests_cont <- matrix(c(
	mean(x), sd(x),
	ssa_dp(x, mean, e = 1e+9), ssa_dp(x, sd, e = 1e+9, lower = 0, upper = 6),
	ssa_dp(x, mean, e = 1e+1), ssa_dp(x, sd, e = 1e+1, lower = 0, upper = 6),
	ssa_dp(x, mean, e = 1e+0), ssa_dp(x, sd, e = 1e+0, lower = 0, upper = 6),
	ssa_dp(x, mean, e = 1e-1), ssa_dp(x, sd, e = 1e-1, lower = 0, upper = 6),
	ssa_dp(x, mean, e = 1e-2), ssa_dp(x, sd, e = 1e-2, lower = 0, upper = 6)
	), nrow = 2)


	# compare normal estimates to various DP estimates
	results <- rbind(ests, ests_cont)
	dimnames(results) <- list(
	c("mean", "sd", "mean (contaminated)", "sd (contaminated)"),
	c("Standard", "DP(1e9)", "DP(10)", "DP(1)", "DP(.1)", "DP(.01)")
	)
	round(results, 3)