Last active
October 12, 2022 14:30
-
-
Save bschneidr/6dd967643c9a829ec59e09a2954f920a to your computer and use it in GitHub Desktop.
Wilson's confidence interval for complex surveys
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @title Wilson's confidence interval for complex survey designs | |
#' @description Calculate Wilson's confidence interval for a proportion, | |
#' with the effective sample size determined using a design-unbiased | |
#' estimate of the complex survey design effect. | |
#' | |
#' @param x A formula, vector, or matrix. | |
#' @param design A survey.design or svyrep.design object | |
#' @param na.rm Should cases with missing values be dropped? | |
#' @param level The confidence level required | |
#' | |
#' @return An object of class 'svyciprop'. | |
#' Use \code{as.numeric(coef(result))} to extract the point estimates, | |
#' and use \code{confint()} to extract the confidence intervals. | |
#' The variance-covariance matrix, standard errors, and design effects | |
#' can be extracted using the usual methods. | |
#' @export | |
#' | |
#' @examples | |
#' library(survey) | |
#' | |
#' data(api) | |
#' dclus1 <- svydesign(id=~dnum, fpc=~fpc, data=apiclus1) | |
#' | |
#' estimates <- svy_prop_with_wilson_ci(x = ~ stype, | |
#' na.rm = TRUE, | |
#' design = dclus1, | |
#' level = 0.95) | |
#' confint(estimates) | |
#' as.numeric(coef(estimates)) | |
#' SE(estimates) | |
#' vcov(estimates) | |
#' deff(estimates) | |
#' | |
#' # Example with `svyby()` | |
#' svyby(~ awards, by = ~ stype, design = dclus1, | |
#' FUN = svy_prop_with_wilson_ci, vartype = c("ci", "se")) | |
#' | |
svy_prop_with_wilson_ci <- function(x, design, na.rm = FALSE, level = 0.95) { | |
# Extract the matrix of variables to use for the estimate | |
# (This code is taken directly from the 'svymean()' function) | |
# It's complicated because x can be a formula, a vector, a data frame, etc. | |
if (inherits(x, "formula")) { | |
mf <- model.frame(x, design$variables, na.action = na.pass) | |
xx <- lapply( | |
attr(terms(x), "variables")[-1], | |
function(tt) model.matrix(eval(bquote(~0 + .(tt))), mf) | |
) | |
cols <- sapply(xx, NCOL) | |
x <- matrix(nrow = NROW(xx[[1]]), ncol = sum(cols)) | |
scols <- c(0, cumsum(cols)) | |
for (i in 1:length(xx)) { | |
x[, scols[i] + 1:cols[i]] <- xx[[i]] | |
} | |
colnames(x) <- do.call("c", lapply(xx, colnames)) | |
} else { | |
if (typeof(x) %in% c("expression", "symbol")) | |
x <- eval(x, design$variables) | |
else if (is.data.frame(x) && any(sapply(x, is.factor))) { | |
xx <- lapply(x, function(xi) { | |
if (is.factor(xi)) | |
0 + (outer(xi, levels(xi), "==")) | |
else xi | |
}) | |
cols <- sapply(xx, NCOL) | |
scols <- c(0, cumsum(cols)) | |
cn <- character(sum(cols)) | |
for (i in 1:length(xx)) cn[scols[i] + 1:cols[i]] <- paste(names(x)[i], | |
levels(x[[i]]), sep = "") | |
x <- matrix(nrow = NROW(xx[[1]]), ncol = sum(cols)) | |
for (i in 1:length(xx)) { | |
x[, scols[i] + 1:cols[i]] <- xx[[i]] | |
} | |
colnames(x) <- cn | |
} | |
} | |
if (na.rm) { | |
nas <- rowSums(is.na(x)) | |
design <- design[nas == 0, ] | |
if (length(nas) > length(design$prob)) | |
x <- x[nas == 0, , drop = FALSE] | |
else x[nas > 0, ] <- 0 | |
} | |
# Point estimates, standard errors, and design effects | |
estimate <- survey::svymean(x = x, na.rm = TRUE, | |
design = design, | |
deff = TRUE) | |
# Determine the sample size used for the estimate | |
n_for_estimate <- nrow(x) | |
# Obtain design-based estimate of the design effect | |
# and the effective sample size | |
design_effects <- survey::deff(estimate) | |
if (is.matrix(design_effects) && isSymmetric(design_effects)) { | |
design_effects <- diag(design_effects) | |
} | |
n_eff <- n_for_estimate / design_effects | |
# Calculate "effective number of successes" | |
p_hat <- coef(estimate) | |
q_hat <- 1 - p_hat | |
n_successes_eff <- p_hat * n_eff | |
# Determine z statistic to use | |
alpha <- (1 - level) | |
half_alpha <- alpha/2 | |
z <- qnorm(p = half_alpha, lower.tail = FALSE) | |
# Calculate confidence interval | |
center <- (n_successes_eff + (z^2)/2)/(n_eff + (z^2)) | |
width <- (z * sqrt(n_eff))/(n_eff + (z^2)) | |
width <- width * sqrt((p_hat*q_hat) + (z^2)/(4*n_eff)) | |
lower <- center - width | |
upper <- center + width | |
ci <- rbind(lower, upper) | |
rownames(ci) <- paste(round(c(half_alpha, level + half_alpha) * | |
100, 1), "%", sep = "") | |
colnames(ci) <- NULL | |
# Wrap up the result as an object of class 'svyciprop' | |
# so that S3 methods from the survey package can be used | |
# (e.g. print(), confint(), SE(), deff(), etc.) | |
result <- p_hat | |
attr(result, "var") <- vcov(estimate) | |
attr(result, 'deff') <- attr(estimate, 'deff') | |
attr(result, 'ci') <- ci | |
class(result) <- 'svyciprop' | |
return(result) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment