Created
January 5, 2016 16:00
-
-
Save DarwinAwardWinner/61dd28e6fda8281369cb to your computer and use it in GitHub Desktop.
Example of running multiple variants of an analysis in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
library(matrixStats) | |
library(plyr) | |
library(lubridate) | |
library(reshape2) | |
library(minfi) | |
library(limma) | |
library(sva) | |
library(celltypes450) | |
library(ggbiplot) | |
library(dendextend) | |
library(evaluate) | |
library(ggplot2) | |
library(stringr) | |
library(magrittr) | |
library(dplyr) | |
library(rms) | |
library(parallel) | |
options(cores=8, mc.cores=8) | |
try_or_err <- function(expr) { | |
try_capture_stack(substitute(expr), parent.frame()) | |
} | |
suppressPlot <- function(arg) { | |
png("/dev/null") | |
on.exit(dev.off()) | |
result <- arg | |
result | |
} | |
add.numbered.colnames <- function(x, prefix="C") { | |
x %>% set_colnames(sprintf("%s%i", prefix, seq(from=1, length.out=ncol(x)))) | |
} | |
tsmsg <- function(...) message(date(), ": ", ...) | |
human.readable.duration <- function(x, ...) { | |
if (length(x) == 0) | |
return(character(0)) | |
x <- duration(x, ...) | |
use.secs <- is.na(x) | [email protected] < 120 | |
res <- sapply([email protected], lubridate:::compute_estimate) | |
res %<>% str_replace("^~", "") | |
res[use.secs] <- sprintf("%0.3g seconds", [email protected][use.secs]) | |
approx <- res %>% str_split(" +", 2) %>% | |
sapply(. %>% { duration(as.numeric(.[1]), .[2])}) | |
not.exact <- approx != x | |
res[str_detect(res, "^-?1 ")] %<>% str_replace("s$", "") | |
res[not.exact] %<>% str_c("~", .) | |
return(res) | |
} | |
timeTask <- function(name, expr) { | |
tsmsg("Starting task '", name, "'") | |
timing <- system.time(result <- withVisible(expr)) | |
tsmsg(sprintf("Completed task '%s' in %s.", name, | |
human.readable.duration(timing["elapsed"], "seconds"))) | |
print(timing) | |
if (result$visible) { | |
return(result$value) | |
} else { | |
return(invisible(result$value)) | |
} | |
} | |
plotCellProportions <- function(cellprop, scale=FALSE) { | |
tidy.cellprop <- cellprop %>% | |
melt(id.vars="Sample", | |
varnames = c("Sample", "CellType"), | |
value.name = "Proportion") | |
ggplot(tidy.cellprop) + | |
aes(x=Sample, y=Proportion, fill=CellType) + | |
geom_bar(stat="identity", position=if (scale) "fill" else "stack") + | |
scale_fill_brewer(palette="Set1") | |
} | |
reg.logit2 <- function(x, eps=0) { | |
log2(x + eps) - log2(1 - x + eps) | |
} | |
as.numeric.index <- function(i, names) { | |
if (is.character(i)) { | |
return(match(i, names)) | |
} else if (is.logical(i)) { | |
return(which(i)) | |
} else if (is.numeric(i)) { | |
return(i) | |
} else { | |
stop(sprintf("Unknown index type: '%s'", class(i))) | |
} | |
} | |
## Return an index that selects all the elements that the given index | |
## i would NOT select. In other words, the complementary index of i | |
invert.index <- function(i, ...) { | |
-as.numeric.index(i, ...) | |
} | |
make.centered.cell.proportion.design <- function(cellprop, | |
intcol=which.max(colMeans(cellprop)), | |
includeIntercept=FALSE) { | |
selected.cols <- invert.index(intcol, colnames(cellprop)) | |
reduced <- cellprop[,selected.cols] | |
if (ncol(cellprop) - ncol(reduced) != 1) { | |
stop("Must select exactly one column as the intercept") | |
} | |
centered <- scale(reduced, center=TRUE, scale=FALSE) | |
if (includeIntercept) { | |
centered %<>% cbind(`(Intercept)`=1, .) | |
} | |
centered | |
} | |
clamp.values <- function(x, floor=0, ceiling=1, epsilon=0) { | |
floor <- floor + epsilon | |
ceiling <- ceiling - epsilon | |
stopifnot(floor <= ceiling) | |
x %>% pmax(floor) %>% pmin(ceiling) | |
} | |
gc.after <- function(expr) { | |
on.exit(gc()) | |
expr | |
} | |
## Version of vooma with all the relevant features of voom | |
vooma2 <- function (y, design = NULL, normalize.method = "none", | |
plot = FALSE, span = NULL, ...) | |
{ | |
out <- list() | |
ngenes <- nrow(y) | |
isExpressionSet <- suppressPackageStartupMessages( | |
is(y, "ExpressionSet")) | |
if (isExpressionSet) { | |
if (length(Biobase::fData(y))) | |
out$genes <- Biobase::fData(y) | |
if (length(Biobase::pData(y))) | |
out$targets <- Biobase::pData(y) | |
y <- Biobase::exprs(y) | |
} | |
else { | |
y <- as.matrix(y) | |
} | |
if (is.null(design)) { | |
design <- matrix(1, ncol(y), 1) | |
rownames(design) <- colnames(y) | |
colnames(design) <- "GrandMean" | |
} | |
y <- normalizeBetweenArrays(y, method = normalize.method) | |
fit <- lmFit(y, design, ...) | |
if (is.null(fit$Amean)) | |
fit$Amean <- rowMeans(y, na.rm = TRUE) | |
sx <- rowMeans(y) | |
sy <- sqrt(fit$sigma) | |
if (is.null(span)) | |
if (ngenes <= 10) | |
span <- 1 | |
else span <- 0.3 + 0.7 * (10/ngenes)^0.5 | |
l <- lowess(sx, sy, f = span) | |
if (plot) { | |
plot(sx, sy, xlab = "Average log2 expression", ylab = "Sqrt( standard deviation )", | |
pch = 16, cex = 0.25) | |
title("vooma: Mean-variance trend") | |
lines(l, col = "red") | |
} | |
f <- approxfun(l, rule = 2) | |
if (fit$rank < ncol(design)) { | |
j <- fit$pivot[1:fit$rank] | |
fitted.values <- fit$coef[, j, drop = FALSE] %*% t(fit$design[, | |
j, drop = FALSE]) | |
} | |
else { | |
fitted.values <- fit$coef %*% t(fit$design) | |
} | |
w <- 1/f(fitted.values)^4 | |
dim(w) <- dim(y) | |
colnames(w) <- colnames(y) | |
rownames(w) <- rownames(y) | |
out$E <- y | |
out$other <- list(fitted.values=fitted.values) | |
out$meanvar.trend <- data.frame(x = sx, y = sy) | |
out$weights <- w | |
out$design <- design | |
out$span <- span | |
new("EList", out) | |
} | |
voomaWithQualityWeights <- | |
function (y, design = NULL, normalize.method = "none", | |
plot = FALSE, span = NULL, var.design = NULL, | |
method = "genebygene", maxiter = 50, tol = 1e-10, | |
trace = FALSE, replace.weights = TRUE, col = NULL, ...) | |
{ | |
if (plot) { | |
oldpar <- par(mfrow = c(1, 2)) | |
on.exit(par(oldpar)) | |
} | |
v <- vooma2(y, design = design, normalize.method = normalize.method, | |
plot = FALSE, span = span, ...) | |
aw <- arrayWeights(v, design = design, method = method, maxiter = maxiter, | |
tol = tol, var.design = var.design) | |
v <- vooma2(y, design = design, weights = aw, | |
normalize.method = normalize.method, plot = plot, span = span, | |
...) | |
aw <- arrayWeights(v, design = design, method = method, maxiter = maxiter, | |
tol = tol, trace = trace, var.design = var.design) | |
wts <- asMatrixWeights(aw, dim(v)) * v$weights | |
attr(wts, "arrayweights") <- NULL | |
if (plot) { | |
barplot(aw, names = 1:length(aw), main = "Sample-specific weights", | |
ylab = "Weight", xlab = "Sample", col = col) | |
abline(h = 1, col = 2, lty = 2) | |
} | |
if (replace.weights) { | |
v$weights <- wts | |
v$sample.weights <- aw | |
return(v) | |
} | |
else { | |
return(wts) | |
} | |
} | |
timeTask("Loading data", { | |
RGSet <- readRDS("dataProcessQC/RGSet.RDS") | |
targets <- pData(RGSet) %>% | |
dplyr::select( | |
Sample_Name, | |
TStatus, | |
Diabetes, | |
Sex, Age, | |
Replicate=Replcate, | |
Well=Sample_Well, Plate=Sample_Plate, | |
Array, Slide, | |
Basename) %>% | |
mutate(Basename=basename(Basename), | |
Sample_Name=as.character(Sample_Name), | |
TStatus=TStatus %>% factor %>% | |
relevel(ref="TX") %>% | |
revalue(c(`CAN/IFTA`="CAN")), | |
Sex=factor(Sex), | |
Diabetes=factor(Diabetes), | |
Well=factor(Well), | |
Plate=factor(Plate), | |
Array=factor(Array), | |
Slide=factor(Slide), | |
Replicate=factor(Replicate)) | |
}) | |
timeTask("Getting SNPs from RGSet", { | |
sample.snpbeta <- getSnpBeta(RGSet) | |
sample.snps <- (round(sample.snpbeta * 2)/2) | |
## Make dendrogram | |
snpdist.cor <- (1-abs(cor(t(sample.snps)))) %>% as.dist | |
snpdist.manh <- dist(sample.snps, method="manhattan") | |
snpclust.cor <- hclust(snpdist.cor) | |
snpclust.manh <- hclust(snpdist.manh) | |
snpdendro.cor <- snpclust.cor %>% as.dendrogram %>% hang.dendrogram | |
snpdendro.manh <- snpclust.manh %>% as.dendrogram %>% hang.dendrogram | |
dir.create("snpplots", FALSE, TRUE) | |
pdf("snpplots/snpdendro.pdf", height=12, width=8) | |
plot(snpdendro.manh, horiz = TRUE, main="SNP dendrogram (manhattan distance)") | |
plot(snpdendro.cor, horiz = TRUE, main="SNP dendrogram (dist=1-abs(corr))") | |
invisible(dev.off()) | |
}) | |
timeTask("Proprocessing RGSet to GRatioSet", { | |
GRatioSet <- RGSet %>% preprocessSWAN %>% mapToGenome %>% ratioConvert %>% | |
dropLociWithSnps(snps=c("SBE","CpG"), maf=0) | |
}) | |
timeTask("Estimating cell type proportions", { | |
dir.create("plots", FALSE, TRUE) | |
pdf("plots/cellprop-qc.pdf", height=12, width=12) | |
RGTemp <- RGSet | |
pData(RGTemp)$Sample_Name %<>% as.character | |
pData(RGTemp)$Slide %<>% as.numeric | |
cellprop <- gc.after(estimateCellCounts( | |
RGTemp, | |
cellTypes = c("CD8T","CD4T", "NK","Bcell","Mono"), | |
meanPlot=TRUE)) %>% | |
pmax(0) | |
print(plotCellProportions(cellprop, TRUE) + | |
coord_flip()) | |
print(ggbiplot(princomp(cellprop)) + | |
aes(color=targets$TStatus) + | |
scale_color_brewer(palette="Set2")) | |
dev.off() | |
rm(RGTemp) | |
}) | |
{ | |
tsmsg("Conducting ANOVA of cell proportions against covariates") | |
print(anova(lm(cellprop ~ TStatus + Diabetes + Sex + Age, data=targets))) | |
tsmsg("Conducting ANOVA of logit(cell proportions) against covariates") | |
print(anova(lm(reg.logit2(cellprop, 0.01) ~ TStatus + Diabetes + Sex + Age, data=targets))) | |
} | |
timeTask("Computing adjusted betas by fixed effects model", { | |
eps <- getBeta(GRatioSet) %>% {min(., 1-.)} / 2 | |
assay(GRatioSet, "Beta.Adjusted.FE") <- | |
getBeta(GRatioSet) %>% | |
removeBatchEffect(covariates = make.centered.cell.proportion.design(cellprop)) %>% | |
clamp.values(floor=0, ceiling=1, epsilon=eps) | |
}) | |
timeTask("Computing adjusted betas by mixed effects model", { | |
assay(GRatioSet, "Beta.Adjusted.LME") <- | |
adjust.beta(getBeta(GRatioSet), omega.mix=cellprop, mc.cores=8) | |
}) | |
x <- cbind(Unadj=as.vector(assay(GRatioSet, "Beta")), | |
FE=as.vector(assay(GRatioSet, "Beta.Adjusted.FE")), | |
LME=as.vector(assay(GRatioSet, "Beta.Adjusted.LME"))) | |
## Select SNP sets | |
timeTask("Removing known covariates from M values for SNP selection", { | |
dupfilter <- !duplicated(targets$Sample) | |
design.snp <- model.matrix(~Diabetes + Sex + Age, targets[dupfilter,]) | |
preserve.cols <- 1 | |
## Raw M-values | |
Mvals <- list( | |
Unadj=getM(GRatioSet)[,dupfilter], | |
FEAdj=logit2(assay(GRatioSet, "Beta.Adjusted.FE"))[,dupfilter], | |
LMEAdj=logit2(assay(GRatioSet, "Beta.Adjusted.LME"))[,dupfilter]) | |
## M-values after removing batch effects | |
Mvals.bs <- | |
mclapply(Mvals, | |
. %>% | |
removeBatchEffect( | |
design=design.snp[,preserve.cols, drop=FALSE], | |
covariates=design.snp[,-preserve.cols, drop=FALSE])) | |
allMvals <- c(Mvals, Mvals.bs %>% setNames(str_c(names(.), ".bs"))) | |
}) | |
timeTask("Computing PCoA for SNP selection", { | |
dmats <- suppressPlot(mclapply(allMvals, . %>% plotMDS %$% distance.matrix %>% as.dist, mc.cores=4)) | |
mds <- suppressWarnings(lapply(dmats, . %>% cmdscale(k=attr(., "Size") - 1, eig=TRUE))) | |
mdsdims <- lapply(mds, . %$% points %>% add.numbered.colnames("PC")) | |
}) | |
timeTask("Performing logistic regression of SNPs against PCs", { | |
snps.rmdup <- sample.snps[,dupfilter] | |
ndims <- 10 | |
snpframes <- lapply(mdsdims, function(m) { | |
apply(snps.rmdup, 1, function(snp) { | |
data.frame(snp=factor(snp, ordered=TRUE), m[,seq_len(ndims), drop=FALSE]) | |
}) | |
}) | |
snp.pvals <- sapply(snpframes, function(dflist) { | |
lapply(dflist, function(df) { | |
depvar <- colnames(df)[1] | |
covars <- colnames(df)[-1] | |
altformula <- as.formula(str_c(depvar, " ~ ", str_c(covars, collapse="+"))) | |
nullformula <- as.formula(str_c(depvar, " ~ 1")) | |
## Use penalized regression to deal with perfect separation problem | |
lrm(altformula, data=df, penalty=1) %>% anova %>% .["TOTAL", "P"] | |
## fit1 <- polr(altformula, data=df, Hess=TRUE) | |
## fit0 <- polr(nullformula, data=df, Hess=TRUE) | |
## atest <- anova(fit1, fit0) | |
## atest$`Pr(Chi)`[2] | |
}) %>% unlist | |
}) | |
snp.fdr <- apply(snp.pvals, 2, p.adjust, method="BH") | |
snp.ranks <- apply(snp.pvals, 2, function(p) {seq_len(length(p))[order(order(p))]}) %>% | |
set_rownames(rownames(snp.pvals)) | |
## Plot SNP dendrograms with ranks for each version | |
pdf("snpplots/snpdendro-ranked.pdf", height=14, width=10) | |
for (i in colnames(snp.pvals)) local({ | |
title <- sprintf("SNP Dendrogram (ranked by logit reg vs %s data)", i) | |
tempclust <- snpclust.manh | |
snplabels <- sprintf("%s: %s, FDR = %0.3g", | |
snp.ranks[tempclust$labels,i], | |
tempclust$labels, | |
snp.fdr[tempclust$labels,i]) | |
tempclust$labels <- snplabels | |
tempdendro <- tempclust %>% as.dendrogram %>% hang.dendrogram | |
plot(tempdendro, horiz = TRUE, main=title) | |
}) | |
dev.off() | |
## Make SNP MDS plots | |
for (i in colnames(snp.pvals)) local({ | |
o <- order(snp.ranks[,i]) | |
snpnames <- rownames(snp.pvals)[o] | |
snpfdrs <- snp.fdr[o,i] | |
snpranks <- snp.ranks[o,i] | |
pdf(sprintf("snpplots/SNP_MDS_%s.pdf", i)) | |
for (s in seq_len(length(snpnames))) { | |
frame <- snpframes[[i]][[snpnames[s]]] %>% cbind(targets[dupfilter,]) %>% | |
mutate(Sex=factor(Sex)) | |
frame$SNP <- frame$snp %>% as.character %>% as.numeric | |
title <- sprintf("MDS plot colored by SNP %s; Rank %s, FDR = %0.3g", | |
snpnames[s], snpranks[s], snpfdrs[s]) | |
print(ggplot(frame) + | |
aes(x=PC1, y=PC2, color=SNP, shape=TStatus:Sex) + | |
geom_point(size=5) + | |
scale_color_gradient2(midpoint = 0.5, mid="gray40") + | |
scale_shape_manual( | |
values=c(16, 1, | |
15, 0, | |
17, 2, | |
18, 5)) + | |
coord_fixed() + | |
ggtitle(title)) | |
} | |
dev.off() | |
}) | |
}) | |
## Determined by looking at the above MDS plots and dendrogram | |
## (Theoretically I should revisit them after switching to penalized | |
## regression, but it seems the SNP idea is garbage anyway, so never | |
## mind.) | |
snpsets <- list( | |
origsnps=c( | |
SNP1="rs6426327", | |
SNP2="rs472920", | |
SNP3="rs2468330", | |
SNP4="rs133860", | |
SNP5="rs1467387"), | |
batchsub=c( | |
SNP1="rs6426327", | |
SNP2="rs472920" , | |
SNP4="rs5987737", | |
SNP7="rs133860", | |
SNP9="rs1484127"), | |
adj.batchsub=c( | |
SNP1="rs10033147", | |
SNP2="rs9292570", | |
SNP3="rs2804694", | |
SNP6="rs1484127", | |
SNP7="rs1941955")) | |
save.image("saved.RData") | |
run.variant <- function(adjustForCellProportions=c("none", "adj.beta.fe", "adj.beta.lme", "adj.M"), | |
regressionScale=c("M", "beta"), | |
handleDuplicates=c("dupcor", "remove", "none"), | |
snps=NULL, | |
handleVarTrend=c("vooma", "trend", "none"), | |
handleHiddenVar=c("sva", "peer", "none"), | |
useArrayWeights=FALSE, | |
randomSeed) { | |
adjustForCellProportions <- match.arg(adjustForCellProportions) | |
regressionScale <- match.arg(regressionScale) | |
handleVarTrend <- match.arg(handleVarTrend) | |
handleDuplicates <- match.arg(handleDuplicates) | |
handleHiddenVar <- match.arg(handleHiddenVar) | |
selected.options <- | |
mget(c("adjustForCellProportions", "regressionScale", "handleDuplicates", | |
"snps", "handleVarTrend", "handleHiddenVar", "useArrayWeights", | |
"randomSeed")) | |
if (regressionScale == "beta" && adjustForCellProportions == "adj.M") { | |
stop("M-value adjustment makes no sense when performing analysis on beta values.") | |
} | |
vars.to.return <- character(0) | |
add.return.var <- function(...) { | |
vars.to.return <<- c(vars.to.return, ...) | |
} | |
## Return all the inputs | |
add.return.var( | |
"adjustForCellProportions", "regressionScale", "handleDuplicates", | |
"snps", "handleVarTrend", "handleHiddenVar", "useArrayWeights", | |
"randomSeed") | |
## Allow reproducibility | |
if (!missing(randomSeed) && !is.null(randomSeed)) { | |
set.seed(randomSeed) | |
add.return.var("randomSeed") | |
tsmsg("Setting random seed to ", deparse(randomSeed)) | |
} | |
## Cell proportion adjustment | |
if (adjustForCellProportions == "adj.beta.lme") { | |
tsmsg("Using betas adjusted for cell proportions via linear mixed effects model") | |
cellprop <- cellprop | |
add.return.var("cellprop") | |
## It's already pre-computed | |
beta <- assay(GRatioSet, "Beta.Adjusted.LME") | |
} else if (adjustForCellProportions == "adj.beta.fe" && regressionScale == "M") { | |
tsmsg("Using betas adjusted for cell proportions via ordinary (fixed effect) linear model") | |
cellprop <- cellprop | |
add.return.var("cellprop") | |
## It's already pre-computed | |
beta <- assay(GRatioSet, "Beta.Adjusted.FE") | |
} else if (adjustForCellProportions == "adj.beta.fe" && regressionScale == "beta") { | |
tsmsg("Using unadjusted betas, since cell-type adjustment will happen during model fitting") | |
beta <- getBeta(GRatioSet) | |
} else { | |
tsmsg("Using unadjusted betas") | |
beta <- getBeta(GRatioSet) | |
} | |
add.return.var("beta") | |
## Make a local copy of targets | |
targets <- targets | |
add.return.var("targets") | |
## Dup removal | |
if (handleDuplicates == "remove") { | |
tsmsg("Removing duplicates") | |
dupfilter <- !duplicated(targets$Sample) | |
targets <- targets[dupfilter,] | |
beta <- beta[,dupfilter] | |
} | |
if (regressionScale == "M") { | |
timeTask("Transforming betas to M-values for regression", { | |
elist <- new("EList", list(E = logit2(beta))) | |
}) | |
} else { | |
tsmsg("Performing regression on beta values") | |
elist <- new("EList", list(E = beta)) | |
} | |
add.return.var("elist") | |
## Base designs | |
design1 <- model.matrix(~TStatus + Diabetes + Sex + Age, targets) | |
design0 <- model.matrix(~Diabetes + Sex + Age, targets) | |
add.return.var("design1", "design0") | |
## SNP | |
if (length(snps) == 0) { | |
snps <- NULL | |
} | |
if (is.vector(snps)) { | |
snps <- sample.snps[snps,] | |
add.return.var("snps") | |
} | |
if (is.matrix(snps)) { | |
if (handleDuplicates == "remove") { | |
snps <- snps[dupfilter,] | |
} | |
tsmsg("Adding selected SNPs to design matrix: ", deparse(rownames(snps))) | |
design1 %<>% cbind(t(snps)) | |
design0 %<>% cbind(t(snps)) | |
} else if (!is.null(snps)) { | |
stop("Invalid snps argument") | |
} | |
## cell proportions as covariates | |
if (adjustForCellProportions == "adj.M" || | |
(adjustForCellProportions == "adj.beta.fe" && regressionScale == "beta")) { | |
## Make a local copy | |
cellprop <- cellprop | |
add.return.var("cellprop") | |
## Normalize row sums to exactly one, since | |
## estimateCellCounts doesn't guarantee this. | |
cellprop %<>% divide_by(rowSums(.)) | |
## TODO make this threshold a parameter | |
present.celltypes <- colMaxs(cellprop) >= 0.05 | |
if (any(!present.celltypes)) { | |
cellprop <- cellprop[,present.celltypes] | |
cellprop %<>% divide_by(rowSums(.)) | |
} | |
if (ncol(cellprop) > 1) { | |
tsmsg("Adding cell type proportions as covariates in design matrix") | |
## Delete the column with the smallest overall proportion | |
cellpropdesign = make.centered.cell.proportion.design(cellprop, includeIntercept = FALSE) | |
design1 %<>% cbind(cellpropdesign) | |
design0 %<>% cbind(cellpropdesign) | |
} else{ | |
tsmsg("Not adding cell type proportions as covariates in design matrix because multiple cell types are not present") | |
} | |
} | |
## sva/PEER | |
svobj <- NULL | |
switch(handleHiddenVar, | |
sva=timeTask("Running sva", { | |
svobj <- sva(as.matrix(elist), design1, design0) | |
svobj$sv %<>% set_colnames(sprintf("SV%i", seq(ncol(.)))) | |
tsmsg(sprintf("Got %i significant surrogate variables", ncol(svobj$sv))) | |
design1 %<>% cbind(svobj$sv) | |
design0 %<>% cbind(svobj$sv) | |
add.return.var("svobj") | |
}), | |
peer=timeTask("Running PEER", { | |
stop("Unimplemented") | |
}), | |
none={ | |
tsmsg("Not adjusting for hidden variables") | |
}) | |
## arrayWeights and/or vooma | |
if (useArrayWeights) { | |
if (handleVarTrend == "vooma") { | |
timeTask("Computing sample and observation weights", { | |
elist <- voomaWithQualityWeights(elist, design1) | |
}) | |
} else { | |
timeTask("Computing sample weights", { | |
elist$sample.weights <- arrayWeights(elist, design1) | |
elist$weights <- elist$sample.weights %>% asMatrixWeights(dim(elist)) | |
}) | |
} | |
} | |
else { | |
if (handleVarTrend == "vooma") { | |
timeTask("Computing vooma observation weights",{ | |
elist <- vooma2(elist, design1) | |
}) | |
} else { | |
tsmsg("Not using sample weights") | |
} | |
} | |
## dupCor | |
if (handleDuplicates == "dupcor") { | |
timeTask("Running duplicateCorrelation", { | |
corfit <- suppressWarnings(duplicateCorrelation(elist, design1, block=targets$Sample)) | |
tsmsg("Got consensus correlation of ", corfit$consensus.correlation) | |
}) | |
add.return.var("corfit") | |
## Need to re-run vooma with correlation and block | |
if (handleVarTrend == "vooma") { | |
if (useArrayWeights) { | |
timeTask("Re-computing sample and observation weights after duplicateCorrelation", { | |
elist <- voomaWithQualityWeights( | |
elist, design1, | |
correlation = corfit$consensus.correlation, | |
block=targets$Sample) | |
}) | |
} else { | |
timeTask("Re-computing vooma observation weights after duplicateCorrelation", { | |
elist <- vooma2( | |
elist, design1, | |
correlation = corfit$consensus.correlation, | |
block=targets$Sample) | |
}) | |
} | |
} | |
} | |
fit.args <- list(object=elist, design=design1) | |
if (handleDuplicates == "dupcor") { | |
fit.args$correlation <- corfit$consensus.correlation | |
fit.args$block <- targets$Sample | |
} | |
timeTask("Fitting linear model", { | |
fit <- do.call(lmFit, fit.args) %>% eBayes(robust=TRUE, trend=handleVarTrend == "trend") | |
}) | |
add.return.var("fit") | |
coef.tests <- list(ARvsTX="TStatusAR", ADNRvsTX="TStatusADNR", CANvsTX="TStatusCAN", | |
DiabetesT2vsT1="DiabetesT2D", MvsF="SexM", Age="Age") | |
if (!is.null(snps)) { | |
coef.tests$SNPs <- rownames(snps) | |
} | |
if (adjustForCellProportions == "adj.M" || | |
(adjustForCellProportions == "adj.beta.fe" && regressionScale == "beta")) { | |
coef.tests$CellType <- colnames(cellpropdesign) | |
} | |
if (!is.null(svobj)) { | |
coef.tests$SV <- colnames(svobj$sv) | |
} | |
add.return.var("coef.tests") | |
## TODO PEER | |
timeTask("Generating results tables", { | |
results <- lapply(coef.tests, . %>% topTable(fit=fit, coef=., n=Inf)) | |
}) | |
add.return.var("results") | |
## Collect a summary of all relevant results | |
results.summary <- list() | |
add.return.var("results.summary") | |
results.summary$SignifCounts <- | |
data.frame( | |
ESTDM=results %>% | |
sapply(. %$% P.Value %>% propTrueNull %>% {1 - .} %>% | |
multiply_by(nrow(results[[1]]))) %>% | |
round, | |
FDR10=results %>% | |
sapply(. %$% adj.P.Val %>% is_weakly_less_than(0.1) %>% sum), | |
FDR15=results %>% | |
sapply(. %$% adj.P.Val %>% is_weakly_less_than(0.15) %>% sum)) | |
results.summary$ModelSummary <- list( | |
DesignDF=ncol(fit$design), | |
ResidDF=max(fit$df.residual), | |
PriorDF=fit$df.prior %>% { if (length(.) <= 1) . else summary(.) }, | |
ModelVariance=summary(fit$s2.post)) | |
if (handleDuplicates == "dupcor") { | |
results.summary$DuplicateCorrelation <- corfit$consensus.correlation | |
} | |
if (handleHiddenVar == "sva") { | |
results.summary$NumSV <- ncol(svobj$sv) | |
} | |
if (useArrayWeights) { | |
results.summary$ArrayWeights <- summary(elist$sample.weights) | |
} | |
results.summary$Options <- selected.options | |
## Save sessionInfo in results | |
session.info <- sessionInfo() | |
add.return.var("session.info") | |
return(mget(vars.to.return)) | |
} | |
arglists <- list( | |
## proto=list( | |
## adjustForCellProportions=c("none", "adj.beta", "adj.beta.lme", "adj.M"), | |
## regressionScale=c("M", "beta"), | |
## handleDuplicates=c("dupcor", "remove", "none"), | |
## snps=NULL, | |
## handleVarTrend=c("vooma", "trend", "none"), | |
## handleHiddenVar=c("sva", "peer", "none"), | |
## useArrayWeights=FALSE, | |
## randomSeed), | |
unadj.naive=list( | |
adjustForCellProportions="none", | |
regressionScale="M", | |
handleDuplicates="none", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="none", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
unadj.rmdups=list( | |
adjustForCellProportions="none", | |
regressionScale="M", | |
handleDuplicates="remove", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="none", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
unadj.dupcor=list( | |
adjustForCellProportions="none", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="none", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
unadj.dupcor.sva=list( | |
adjustForCellProportions="none", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="sva", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
unadj.dupcor.sva.aw=list( | |
adjustForCellProportions="none", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
unadj.dupcor.sva.vooma=list( | |
adjustForCellProportions="none", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
unadj.dupcor.snps.sva.voomaw=list( | |
adjustForCellProportions="none", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=snpsets$batchsub, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
unadj.dupcor.sva.voomaw=list( | |
adjustForCellProportions="none", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
adjBetaLME.dupcor=list( | |
adjustForCellProportions="adj.beta.lme", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="none", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
adjBetaLME.dupcor.sva=list( | |
adjustForCellProportions="adj.beta.lme", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="sva", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
adjBetaLME.dupcor.sva.aw=list( | |
adjustForCellProportions="adj.beta.lme", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
adjBetaLME.dupcor.sva.vooma=list( | |
adjustForCellProportions="adj.beta.lme", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
adjBetaLME.dupcor.sva.voomaw=list( | |
adjustForCellProportions="adj.beta.lme", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
adjM.dupcor=list( | |
adjustForCellProportions="adj.M", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="none", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
adjM.dupcor.snps=list( | |
adjustForCellProportions="adj.M", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=snpsets$batchsub, | |
handleVarTrend="trend", | |
handleHiddenVar="none", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
adjM.dupcor.sva=list( | |
adjustForCellProportions="adj.M", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="sva", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
adjM.dupcor.sva.aw=list( | |
adjustForCellProportions="adj.M", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="trend", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
adjM.dupcor.sva.vooma=list( | |
adjustForCellProportions="adj.M", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=FALSE, | |
randomSeed=1986), | |
adjM.dupcor.sva.voomaw=list( | |
adjustForCellProportions="adj.M", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
adjBetaFE.dupcor.voomaw=list( | |
adjustForCellProportions="adj.beta.fe", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="none", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
adjBetaFE.dupcor.sva.voomaw=list( | |
adjustForCellProportions="adj.beta.fe", | |
regressionScale="M", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
betascale.dupcor.sva.voomaw=list( | |
adjustForCellProportions="none", | |
regressionScale="beta", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
betascale.dupcor.cellprop.voomaw=list( | |
adjustForCellProportions="adj.beta.fe", | |
regressionScale="beta", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="none", | |
useArrayWeights=TRUE, | |
randomSeed=1986), | |
betascale.dupcor.cellprop.sva.voomaw=list( | |
adjustForCellProportions="adj.beta.fe", | |
regressionScale="beta", | |
handleDuplicates="dupcor", | |
snps=NULL, | |
handleVarTrend="vooma", | |
handleHiddenVar="sva", | |
useArrayWeights=TRUE, | |
randomSeed=1986)) | |
variants.to.run <- names(arglists) | |
dir.create("analyses", FALSE, TRUE) | |
for (i in variants.to.run) local({ | |
on.exit(gc()) | |
save.file <- file.path("analyses", sprintf("analysis-%s.RDS", i)) | |
summary.file <- file.path("analyses", sprintf("summary-%s.RDS", i)) | |
error.file <- file.path("analyses", sprintf("error-%s.RDS", i)) | |
if (file.exists(save.file)) { | |
tsmsg("Skipping already done analysis ", i) | |
tsmsg("Printing previous results for this analysis:") | |
print(readRDS(summary.file)) | |
} else { | |
timeTask(str_c("Running full pipeline for ", i), { | |
result <- try_or_err({ | |
analysis <- do.call(run.variant, arglists[[i]]) | |
timeTask("Saving results", { | |
saveRDS(analysis, save.file) | |
saveRDS(analysis$results.summary, summary.file) | |
}) | |
tsmsg(sprintf("Summary for %s:", i)) | |
print(analysis$results.summary) | |
}) | |
if (is(result, "error")) { | |
tsmsg("ERROR: Failed during ", i, " analysis. Saving error (including stacktrace) to ", deparse(error.file), ". Also printing stack trace:") | |
print(result) | |
print(result$calls) | |
saveRDS(result, error.file) | |
} | |
}) | |
} | |
}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment