jnpaulson · November 5, 2019 08:01
diff --git a/gtex.R b/gtex.R
 library(yarn)
 library(dplyr)
 library(rafalib)

 obj = downloadGTEx()

 lowSampleSizes = c("Bladder","Cells - Leukemia cell line (CML)","Cervix - Ectocervix","Cervix - Endocervix","Fallopian Tube")
 obj = filterSamples(obj,lowSampleSizes,"SMTSD") %>%
      filterMissingGenes

 our_subtypes = gsub(" - "," ",as.character(pData(obj)$SMTSD))
 our_subtypes = gsub(" ","_",our_subtypes)
 our_subtypes = tolower(our_subtypes)
 brain_0 = which(pData(obj)$SMTSD == "Brain - Hypothalamus")
 brain_01 = which(pData(obj)$SMTSD == "Brain - Amygdala")
 brain_02 = which(pData(obj)$SMTSD == "Brain - Anterior cingulate cortex (BA24)")
 brain_03 = which(pData(obj)$SMTSD == "Brain - Hippocampus")
 brain_04 = which(pData(obj)$SMTSD == "Brain - Cortex")
 brain_05 = which(pData(obj)$SMTSD == "Brain - Substantia nigra")
 brain_06 = which(pData(obj)$SMTSD == "Brain - Spinal cord (cervical c-1)")
 brain_07 = which(pData(obj)$SMTSD == "Brain - Frontal Cortex (BA9)")
 brain_0 = union(brain_0,c(brain_01,brain_02,brain_03,brain_04,brain_05,brain_06,brain_07))
 our_subtypes[brain_0] = "Brain-0"
 brain_1 = c(which(pData(obj)$SMTSD=="Brain - Cerebellum"),which(pData(obj)$SMTSD=="Brain - Cerebellar Hemisphere"))
 our_subtypes[brain_1] = "Brain-1"
 brain_2 = c(which(pData(obj)$SMTSD=="Brain - Caudate (basal ganglia)"),which(pData(obj)$SMTSD=="Brain - Nucleus accumbens (basal ganglia)"),which(pData(obj)$SMTSD=="Brain - Putamen (basal ganglia)"))
 our_subtypes[brain_2] = "Brain-2"
 skin = grep("sun",our_subtypes)
 our_subtypes[skin] = "skin"
 our_subtypes = factor(tolower(our_subtypes))
 pData(obj)$our_subtypes = our_subtypes

 message("smallest sample size is:")
 show(min(table(pData(obj)$our_subtypes)))
 obj = filterLowGenes(obj,"our_subtypes")

 ### Normalize using qsmooth
 obj = normalizeTissueAware(obj,"our_subtypes")


 saveRDS(obj,file=("gtex_portal_normalized.rds")
diff --git a/voomWeightsCustomized.R b/voomWeightsCustomized.R
 #' Compute voom weights using customized log2 counts
 #' Modified from: https://raw.githubusercontent.com/jhsiao999/Humanzee/916ee1b3e0e213839c17a99280211d68edaca161/R/voomWeightsCustomized.r
 #' and of course the voom function
 #'
 #' @param log2counts counts on log2 scale 
 #' @param design Experimental design of the data. Required to be an R 
 #'                design.matrix object 
 #' @param lib.size Library size.
 #' @param is.cpm if the data is CPM normalized.
 #' 
 #' @export
 #'
 # example:
 # gender = pData(objsubset)$gender
 # batch  = factor(as.character(pData(objsubset)$SMNABTCHT))
 # design = model.matrix(~batch+gender)
 # transformedCounts = assayData(objsubset)[["qsmooth"]]

 # voomOutput = voomWeightsCustomized(transformedCounts,design)
 # fitGood = lmFit(voomOutput,design)
 # fitGood = eBayes(fitGood)
 # gl  = fData(objsubset)$geneNames
 # tt  = topTable(fit,number=Inf,genelist=gl,coef="genderMALE")

 voomWeightsCustomized <- function(log2counts,design,lib.size = NULL,is.cpm = FALSE) {
  if (is.null(design)) {
    design <- model.matrix(~ 1 )
  }
  out<-list()

  if (is.cpm == TRUE) {
    fit <- lmFit(log2counts, design)
    xx <- fit$Amean + mean(log2(lib.size + 1)) - log2(1e+06)
    yy <- sqrt(fit$sigma)
    l <- lowess(xx, yy, f = .5)
    f <- approxfun(l, rule = 2)
    fitted.values <- fit$coef %*% t(fit$design)
    fitted.cpm <- 2^fitted.values
    fitted.count <- 1e-06 * t(t(fitted.cpm) * (lib.size + 1))
    fitted.logcount <- log2(fitted.count)
    w <- 1/f(fitted.logcount)^4
    dim(w) <- dim(fitted.logcount)
  }

  if (is.cpm == FALSE) {
    fit <- lmFit(log2counts, design)
    xx <- fit$Amean 
    yy <- sqrt(fit$sigma)
    l <- lowess(xx, yy, f = .5)
    f <- approxfun(l, rule = 2)
    fitted.values <- fit$coef %*% t(fit$design)
    fitted.logcount <- log2(fitted.values)
    w <- 1/f(fitted.logcount)^4
    dim(w) <- dim(fitted.logcount)
  }
  out$E = log2counts
  out$weights = w
  out$design = design
  new("EList",out)
 }
	library(yarn)
	library(dplyr)
	library(rafalib)

	obj = downloadGTEx()

	lowSampleSizes = c("Bladder","Cells - Leukemia cell line (CML)","Cervix - Ectocervix","Cervix - Endocervix","Fallopian Tube")
	obj = filterSamples(obj,lowSampleSizes,"SMTSD") %>%
	filterMissingGenes

	our_subtypes = gsub(" - "," ",as.character(pData(obj)$SMTSD))
	our_subtypes = gsub(" ","_",our_subtypes)
	our_subtypes = tolower(our_subtypes)
	brain_0 = which(pData(obj)$SMTSD == "Brain - Hypothalamus")
	brain_01 = which(pData(obj)$SMTSD == "Brain - Amygdala")
	brain_02 = which(pData(obj)$SMTSD == "Brain - Anterior cingulate cortex (BA24)")
	brain_03 = which(pData(obj)$SMTSD == "Brain - Hippocampus")
	brain_04 = which(pData(obj)$SMTSD == "Brain - Cortex")
	brain_05 = which(pData(obj)$SMTSD == "Brain - Substantia nigra")
	brain_06 = which(pData(obj)$SMTSD == "Brain - Spinal cord (cervical c-1)")
	brain_07 = which(pData(obj)$SMTSD == "Brain - Frontal Cortex (BA9)")
	brain_0 = union(brain_0,c(brain_01,brain_02,brain_03,brain_04,brain_05,brain_06,brain_07))
	our_subtypes[brain_0] = "Brain-0"
	brain_1 = c(which(pData(obj)$SMTSD=="Brain - Cerebellum"),which(pData(obj)$SMTSD=="Brain - Cerebellar Hemisphere"))
	our_subtypes[brain_1] = "Brain-1"
	brain_2 = c(which(pData(obj)$SMTSD=="Brain - Caudate (basal ganglia)"),which(pData(obj)$SMTSD=="Brain - Nucleus accumbens (basal ganglia)"),which(pData(obj)$SMTSD=="Brain - Putamen (basal ganglia)"))
	our_subtypes[brain_2] = "Brain-2"
	skin = grep("sun",our_subtypes)
	our_subtypes[skin] = "skin"
	our_subtypes = factor(tolower(our_subtypes))
	pData(obj)$our_subtypes = our_subtypes

	message("smallest sample size is:")
	show(min(table(pData(obj)$our_subtypes)))
	obj = filterLowGenes(obj,"our_subtypes")

	### Normalize using qsmooth
	obj = normalizeTissueAware(obj,"our_subtypes")


	saveRDS(obj,file=("gtex_portal_normalized.rds")
	#' Compute voom weights using customized log2 counts
	#' Modified from: https://raw.githubusercontent.com/jhsiao999/Humanzee/916ee1b3e0e213839c17a99280211d68edaca161/R/voomWeightsCustomized.r
	#' and of course the voom function
	#'
	#' @param log2counts counts on log2 scale
	#' @param design Experimental design of the data. Required to be an R
	#' design.matrix object
	#' @param lib.size Library size.
	#' @param is.cpm if the data is CPM normalized.
	#'
	#' @export
	#'
	# example:
	# gender = pData(objsubset)$gender
	# batch = factor(as.character(pData(objsubset)$SMNABTCHT))
	# design = model.matrix(~batch+gender)
	# transformedCounts = assayData(objsubset)[["qsmooth"]]

	# voomOutput = voomWeightsCustomized(transformedCounts,design)
	# fitGood = lmFit(voomOutput,design)
	# fitGood = eBayes(fitGood)
	# gl = fData(objsubset)$geneNames
	# tt = topTable(fit,number=Inf,genelist=gl,coef="genderMALE")

	voomWeightsCustomized <- function(log2counts,design,lib.size = NULL,is.cpm = FALSE) {
	if (is.null(design)) {
	design <- model.matrix(~ 1 )
	}
	out<-list()

	if (is.cpm == TRUE) {
	fit <- lmFit(log2counts, design)
	xx <- fit$Amean + mean(log2(lib.size + 1)) - log2(1e+06)
	yy <- sqrt(fit$sigma)
	l <- lowess(xx, yy, f = .5)
	f <- approxfun(l, rule = 2)
	fitted.values <- fit$coef %*% t(fit$design)
	fitted.cpm <- 2^fitted.values
	fitted.count <- 1e-06 * t(t(fitted.cpm) * (lib.size + 1))
	fitted.logcount <- log2(fitted.count)
	w <- 1/f(fitted.logcount)^4
	dim(w) <- dim(fitted.logcount)
	}

	if (is.cpm == FALSE) {
	fit <- lmFit(log2counts, design)
	xx <- fit$Amean
	yy <- sqrt(fit$sigma)
	l <- lowess(xx, yy, f = .5)
	f <- approxfun(l, rule = 2)
	fitted.values <- fit$coef %*% t(fit$design)
	fitted.logcount <- log2(fitted.values)
	w <- 1/f(fitted.logcount)^4
	dim(w) <- dim(fitted.logcount)
	}
	out$E = log2counts
	out$weights = w
	out$design = design
	new("EList",out)
	}