Get unadjusted and adjusted p-values from LCT-B and LCT-N (see [comment](https://gist.github.com/eschen42/ce226d4f924dae9e0e4bbf87ca8f6b0b?permalink_comment_id=4653172#gistcomment-4653172))
# LCTboot_inspect - TestCor::LCTboot modified:
# - to return a "reportables" data.frame as an attribute of the result
#   (for estimating the adjusted and unadjusted p-values)
# - to return the function environment as an attribute of the result
#   (for debugging reportables)
# - to use the normal approximation when Nboot == 0
#   (so that the same code base may be used for LCT-N and LCT-B)
# ref: https://github.com/cran/TestCor/blob/9d5d2a9a1ac284e3346bd144776559cbd82a8b52/R/FdrMethods.R
#' Bootstrap procedure LCT-B proposed by Cai & Liu (2016) for correlation testing.
#'
#' @param input_data matrix of observations
#' @param alpha level of multiple testing
#' @param stat_test
#' \describe{
#'   \item{'empirical'}{\eqn{\sqrt{n}*abs(corr)}}
#'   \item{'fisher'}{\eqn{\sqrt{n-3}*1/2*\log( (1+corr)/(1-corr) )}}
#'   \item{'student'}{\eqn{\sqrt{n-2}*abs(corr)/\sqrt{1-corr^2}}}
#'   \item{'2nd.order'}{\eqn{\sqrt{n}*mean(Y)/sd(Y)} with \eqn{Y=(X_i-mean(X_i))(X_j-mean(X_j))}}
#' }
#' @param Nboot number of iterations for bootstrap quantile evaluation
#' @param vect if TRUE, returns a vector of TRUE/FALSE values corresponding to \code{vectorize(cor(data))};
#'   if FALSE, returns an array containing TRUE/FALSE values for each entry of the correlation matrix
#' @param arr.ind if TRUE, returns the indexes of the significant correlations, with respect to level alpha
#'
#' @return Returns \itemize{\item{logicals, equal to TRUE if the corresponding element of the statistic vector is rejected, as a vector or a matrix depending on the value of \code{vect},} \item{an array containing indexes \eqn{\lbrace(i,j),\,i<j\rbrace} for which the correlation between variables \eqn{i} and \eqn{j} is significant, if \code{arr.ind=TRUE}.}}
#'
#' @importFrom stats cor quantile var pnorm median mad sd
#'
#' @export
#' @seealso TestCor::ApplyFdrCor, TestCor::LCTNorm
#' @references Cai, T. T., & Liu, W. (2016). Large-scale multiple testing of correlations. Journal of the American Statistical Association, 111(513), 229-240.
#'
#' @examples
#' n <- 100
#' p <- 10
#' corr_theo <- diag(1, p)
#' corr_theo[1, 3] <- 0.5
#' corr_theo[3, 1] <- 0.5
#' data <- MASS::mvrnorm(n, rep(0, p), corr_theo)
#' alpha <- 0.05
#' # significant correlations:
#' LCTboot_inspect(data, alpha, stat_test = 'empirical', Nboot = 100, arr.ind = TRUE)
LCTboot_inspect <-
  function( # requires `TestCor` and `reshape2`
    input_data,
    alpha = 0.05,
    stat_test = '2nd.order',
    Nboot = 100,
    vect = FALSE,
    arr.ind = FALSE
  ) {
    if (!(stat_test %in% c('empirical', 'fisher', 'student', '2nd.order'))) {
      stop('Wrong value for stat_test.')
    }
    require(TestCor)
    n <- nrow(input_data) # n = number of samples
    p <- ncol(input_data) # p = number of proteins (variables)
    p_names <- colnames(input_data)
    # test statistic
    stat_sign <- TestCor::eval_stat(input_data, stat_test)
    stat <- abs(stat_sign)
    stat_sign <- sign(stat_sign)
    m <- length(stat)
    # ------------------------------------------------------------------------
    # These lines are the only changes to the original code (apart from
    # formatting) and do not change the result apart from preparing an
    # attribute. This is a usability hack for the context attribute.
    stat_unvectorized <- TestCor::unvectorize(stat)
    rownames(stat_unvectorized) <-
      colnames(stat_unvectorized) <- colnames(input_data)
    # The following hack creates a map from index of unsorted stat to protein
    stat_names_casted <- TestCor::unvectorize(seq_along(stat))
    rownames(stat_names_casted) <-
      colnames(stat_names_casted) <- colnames(input_data)
    stat_names <- reshape2::melt(stat_names_casted)
    trt_cor <- cor(input_data)
    trt_cor_unvectorized <- trt_cor * (stat_unvectorized > 0)
    rownames(trt_cor_unvectorized) <-
      colnames(trt_cor_unvectorized) <- colnames(input_data)
    stat_names$trt_cor <- reshape2::melt(trt_cor_unvectorized)$value
    # eliminate the degenerate rows (i.e., having zero values)
    stat_names <- stat_names[stat_names$value > 0, ]
    stat_names <- stat_names[order(stat_names$value), , drop = FALSE]
    # ------------------------------------------------------------------------
    # sort the results in decreasing order of magnitude of test statistic
    # this was: stat_sort <- sort(stat, index.return = TRUE, decreasing = TRUE)
    stat_sort <-
      data.frame(
        T = stat,
        ix = seq_len(m),
        name = stat_names$value,
        trt_cor = stat_names$trt_cor,
        pval_alpha = rep.int(0, m),
        sqrt_etc = rep.int(0, m),
        res = rep.int(0, m),
        stat_sign = stat_sign
      )
    stat_sort <- stat_sort[order(-stat), ]
    if (Nboot > 0) {
      # evaluation of p-value by bootstrap
      prop <- matrix(0, nrow = Nboot, ncol = m)
      for (nboot in 1:Nboot) {
        # resample the n observations with replacement
        indb <- sample(seq_len(n), replace = TRUE)
        datab <- input_data[indb, ]
        statb <- abs(TestCor::eval_stat(datab, stat_test))
        # comparison with stat test
        for (i in 1:m) {
          prop[nboot, i] <- mean(statb > stat_sort$T[i])
        }
      }
      cd_of_T <- colMeans(prop)
    } else {
      # evaluation of p-value by normal approximation
      cd_of_T <- 2 * (1 - pnorm(stat_sort$T))
    }
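    # Explanatory comment (not in the original code): under the null, each of
    # these test statistics is asymptotically standard normal (Cai & Liu,
    # 2016), so when Nboot == 0 the two-sided tail probability 2*(1 - pnorm(T))
    # estimates the same null CDF term that the bootstrap estimates
    # empirically above; this is what makes the LCT-N/LCT-B code sharing work.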
    # ------------------------------------------------------------------------
    # These lines are the only changes to the original code (apart from
    # formatting) and do not change the result apart from preparing an
    # attribute.
    stat_rank <- seq(1, m, 1)
    stat_quantile <- stat_rank / m
    stat_sort$cd_of_T <- cd_of_T
    stat_sort$pval_alpha <- alpha - cd_of_T * stat_rank
    stat_sort$sqrt_etc <- sqrt(4 * log(p) - 2 * log(log(p))) - stat_sort$T
    # The next line is a paraphrase of:
    # https://github.com/cran/TestCor/blob/9d5d2a9a1ac284e3346bd144776559cbd82a8b52/R/FdrMethods.R#L52C5-L52C98
    stat_sort$ind <-
      (cd_of_T <= alpha * stat_quantile) &
      (stat_sort$T <= sqrt(4 * log(p) - 2 * log(log(p))))
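    # Explanatory comment (not in the original code): per Cai & Liu (2016),
    # the data-driven threshold is searched only over the range
    # [0, sqrt(4*log(p) - 2*log(log(p)))]; when no statistic in that range
    # satisfies the criterion, the fallback cutoff 2*sqrt(log(p)) is used
    # below when stat_cutoff is computed.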
    # The unadjusted p-value is coded by reverse engineering the previous line:
    if (FALSE) { # not executed but not flagged by the code-in-comment linter
      # Simplification for compound inequality:
      # (1) Original expression, where:
      #     - n = number of samples
      #     - p = number of proteins (variables)
      #     - m = number of nondegenerate test statistics in upper triangle of
      #       the correlation matrix, i.e., (p ^ 2 - p) / 2
      #     - T = ranked test-statistic computed by TestCor::eval_stat
      #     - cd_T = cumulative distribution for (ranked) test-statistic T
      #     - stat_rank = rank of T
      (cd_T < alpha * stat_quantile) & (T <= sqrt(4 * log(p) - 2 * log(log(p))))
      # (2) Adjust for real cd_T and assumption that this is logical conjunction
      (cd_T <= alpha * stat_quantile) && (T <= sqrt(4 * log(p) - 2 * log(log(p))))
      # (3) Isolate alpha
      (cd_T * m / stat_rank <= alpha) && (T <= sqrt(4 * log(p) - 2 * log(log(p))))
      # (4) Product of lesser terms <= product of greater terms,
      #     because T > 0 and sqrt(4 * log(p) - 2 * log(log(p))) > 0
      T * cd_T * m / stat_rank <= alpha * sqrt(4 * log(p) - 2 * log(log(p)))
      # (5) Solve for alpha
      alpha >= T * cd_T * m / (stat_rank * sqrt(4 * log(p) - 2 * log(log(p))))
    }
    stat_sort$p_unadj <-
      with(
        stat_sort,
        T * cd_of_T * m / (stat_rank * sqrt(4 * log(p) - 2 * log(log(p))))
      )
    stat_sort$stat_median <- with(stat_sort, median(T))
    stat_sort$stat_mad <- with(stat_sort, mad(T))
    z_mean <- mean(stat_sort$T)
    z_sd <- sd(stat_sort$T)
    stat_sort$z_score <- (stat_sort$T - z_mean) / z_sd
    stat_sort$p_adj <-
      2 * pnorm(stat_sort$T, mean = z_mean, sd = z_sd, lower.tail = FALSE)
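    # Explanatory comment (not in the original code): p_adj here is an ad hoc
    # two-sided tail probability computed from a normal distribution fitted to
    # the observed statistics (mean z_mean, sd z_sd); i.e., it treats the
    # empirical distribution of T as if it were the null distribution, so it
    # should be interpreted as a relative ranking rather than a calibrated
    # error rate.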
    # ------------------------------------------------------------------------
    # truncated BH threshold
    ind <- which(stat_sort$ind)
    stat_cutoff <-
      if (length(ind) == 0) {
        2 * sqrt(log(p))      # use normal approx. when forced by low alpha
      } else {
        stat_sort$T[max(ind)] # use T from distrib. when alpha large enough
      }
    res <- (stat >= stat_cutoff)
    stat_sort$res <- (stat_sort$T >= stat_cutoff)
    if (arr.ind) {
      vect <- FALSE
    }
    if (!vect) {
      res <- (TestCor::unvectorize(res) > 0)
    }
    if (arr.ind) {
      res <- TestCor::whichCor(res)
    }
    # ------------------------------------------------------------------------
    # These lines are the only changes to the original code (apart from
    # formatting) and do not change the result apart from preparing an
    # attribute.
    stat_resort <- stat_sort[order(stat_sort$ix), ]
    p_unadj <- stat_resort$p_unadj
    p_adj <- stat_resort$p_adj
    if (is.null(p_unadj)) {
      cat("stat_resort$p_unadj unexpectedly produced null\n", file = stderr())
    }
    # The "adjustment factor" relates the unadjusted p-value to the cutoff
    reportables <-
      data.frame(
        var1 = as.character(stat_names[, "Var1"]),
        var2 = as.character(stat_names[, "Var2"]),
        ix = stat_resort$ix,
        trt_cor = stat_resort$trt_cor,
        test_statistic = stat_resort$T,
        test_sign = stat_resort$stat_sign,
        z_score = stat_resort$z_score,
        z_relative = stat_resort$z_score / stat_resort$stat_median,
        stat_median = stat_resort$stat_median,
        stat_mad = stat_resort$stat_mad,
        stat_cutoff = stat_cutoff,
        p_unadj = p_unadj,
        p_adj = p_adj,
        significant = stat_resort$res,
        ind = stat_resort$ind
      )
    # The significance determination has already been made; therefore, the
    # following handling of the adjusted p-value seems as valid as adjusting
    # a p-value in the first place ...
    reportables <-
      with(d <- reportables, d[order(-d$significant, -d$test_statistic), ])
    if (FALSE) {
      sig_pval <- with(d <- reportables, d[d$significant, ]$p_unadj)
      if (sum(is.na(sig_pval)) > 0)
        sig_pval[is.na(sig_pval)] <- 1.0
      nonsig_pval <- with(d <- reportables, d[!d$significant, ]$p_unadj)
      if (sum(is.na(nonsig_pval)) > 0)
        nonsig_pval[is.na(nonsig_pval)] <- 1.0
      # scale p-value to fit within decision threshold
      if (length(sig_pval) > 0 && max(sig_pval) > alpha) {
        sig_pval <- 1.0001 * alpha * sig_pval / max(sig_pval)
      }
      # scale p-value to fall outside decision threshold
      plen <- min(5, length(nonsig_pval))
      pseudo_min <- min(nonsig_pval[1:plen])
      if (pseudo_min < alpha) {
        cat("initial min(nonsig_pval[1:n]) is ", pseudo_min, "\n",
            file = stderr())
        # raise min to alpha, i.e., lower 1 - p to 1 - alpha
        nonsig_pval <- 1 - nonsig_pval
        nonsig_pval <- nonsig_pval * 0.9999 * pseudo_min / alpha
        nonsig_pval <- 1 - nonsig_pval
        cat("final min(nonsig_pval[1:n]) is ", nonsig_pval[1:plen], "\n",
            file = stderr())
      }
      reportables$p_adj <- c(sig_pval, nonsig_pval)
    }
    # restore order of reportables
    reportables <- with(d <- reportables, d[order(d$ix), ])
    attr(res, "reportables") <- reportables
    # no need to duplicate the reportables in the context
    #rm(reportables)
    # return the environment in the event that future inspection is needed
    attr(res, "context") <- environment()
    # End of changes to the original code apart from formatting
    # ------------------------------------------------------------------------
    # return result `res`
    res
  }
I needed a version that would report unadjusted and adjusted p-values.
Demonstration code:
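A minimal sketch of such a demonstration (mine, not from the original gist; it assumes the `LCTboot_inspect()` definition above has been sourced and that the `TestCor`, `MASS`, and `reshape2` packages are installed; the column names and seed are arbitrary):

```r
# Simulate data with one true off-diagonal correlation, as in the roxygen
# example above.
set.seed(42)
n <- 100
p <- 10
corr_theo <- diag(1, p)
corr_theo[1, 3] <- corr_theo[3, 1] <- 0.5
data <- MASS::mvrnorm(n, rep(0, p), corr_theo)
colnames(data) <- paste0("protein_", seq_len(p)) # hypothetical variable names

# LCT-B: bootstrap evaluation of the null CDF (Nboot > 0)
res_b <- LCTboot_inspect(data, alpha = 0.05, stat_test = "empirical", Nboot = 100)
# LCT-N: normal approximation (Nboot == 0)
res_n <- LCTboot_inspect(data, alpha = 0.05, stat_test = "empirical", Nboot = 0)

# The unadjusted and adjusted p-values travel in the "reportables" attribute.
rpt <- attr(res_b, "reportables")
head(rpt[order(rpt$p_unadj),
         c("var1", "var2", "trt_cor", "p_unadj", "p_adj", "significant")])
```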