mschubert · August 29, 2015 14:23
diff --git a/diff_expr.r b/diff_expr.r
 #
 # loading the table, converting it to an expression matrix
 #
 df = read.table(
    "GBM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt",
    header=TRUE, sep="\t", stringsAsFactors=FALSE)
 expr = data.matrix(df[2:nrow(df), df[1,]=="raw_count"])
 rownames(expr) = df[2:nrow(df), 1]

 #
 # filtering the row- and column names to what we want
 #
 colnames(expr) = substr(colnames(expr), 15, 15)
 rownames(expr) = sub("\\|.*$", "", rownames(expr))
 expr = expr[rownames(expr) != "?",]

 #
 # transform the raw read counts using voom
 #  -- with this, we can use standard linear modeling tools
 #
 # if you start with a .bam file instead of the TCGA text file,
 # there are more specific tools to get differential expression
 #  -- see DESeq2, edgeR packages
 #
 library(limma)
 vexpr = voom(expr)$E
 axpr = avereps(vexpr) # average if multiple measurements on same gene

 #
 # use lmFit to compute differential expression
 #
 design = model.matrix(~ 0 + as.factor(colnames(axpr)))
 colnames(design) = c("primary", "recurrent")
 fit.1 = lmFit(axpr, design)
 contrast = makeContrasts("primary-recurrent", levels=design)
 fit.2 = contrasts.fit(fit.1, contrast)
 fit.3 = eBayes(fit.2)

 #
 # make a summary table ordered by p-value
 #
 result = data.frame(
    gene = rownames(fit.3),
    p.value = fit.3$p.value[,1],
    fold_change = fit.3$coefficients[,1],
    stringsAsFactors = FALSE
 )
 result$adj.p = p.adjust(result$p.value, method="fdr")
 result = result[order(result$p.value),]
 cat(result$gene[result$adj.p < 0.01], sep="\n") # print all genes w/ FDR<1%
	#
	# loading the table, converting it to an expression matrix
	#
	df = read.table(
	"GBM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt",
	header=TRUE, sep="\t", stringsAsFactors=FALSE)
	expr = data.matrix(df[2:nrow(df), df[1,]=="raw_count"])
	rownames(expr) = df[2:nrow(df), 1]

	#
	# filtering the row- and column names to what we want
	#
	colnames(expr) = substr(colnames(expr), 15, 15)
	rownames(expr) = sub("\\\|.*$", "", rownames(expr))
	expr = expr[rownames(expr) != "?",]

	#
	# transform the raw read counts using voom
	# -- with this, we can use standard linear modeling tools
	#
	# if you start with a .bam file instead of the TCGA text file,
	# there are more specific tools to get differential expression
	# -- see DESeq2, edgeR packages
	#
	library(limma)
	vexpr = voom(expr)$E
	axpr = avereps(vexpr) # average if multiple measurements on same gene

	#
	# use lmFit to compute differential expression
	#
	design = model.matrix(~ 0 + as.factor(colnames(axpr)))
	colnames(design) = c("primary", "recurrent")
	fit.1 = lmFit(axpr, design)
	contrast = makeContrasts("primary-recurrent", levels=design)
	fit.2 = contrasts.fit(fit.1, contrast)
	fit.3 = eBayes(fit.2)

	#
	# make a summary table ordered by p-value
	#
	result = data.frame(
	gene = rownames(fit.3),
	p.value = fit.3$p.value[,1],
	fold_change = fit.3$coefficients[,1],
	stringsAsFactors = FALSE
	)
	result$adj.p = p.adjust(result$p.value, method="fdr")
	result = result[order(result$p.value),]
	cat(result$gene[result$adj.p < 0.01], sep="\n") # print all genes w/ FDR<1%
No results found