bjkeller · May 19, 2015 22:10
diff --git a/geoexample.R b/geoexample.R
 # R example for working with GEO data using GEOquery and limma for
 # Seattle SIGKDD data club.
 # (riffing on auto generated R script for GEO2R service on GEO site)

 ## libraries for everything
 library(Biobase)
 library(GEOquery)
 library(limma)

 ###
 # Series info:
 # http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE6275
 #
 # Note: we are getting the already normalized data. Different procedure
 # to start with raw data
 ###

 ## get data
 # 1. series data
 gset <- getGEO("GSE6275", GSEMatrix =TRUE)
 gset <- gset[[1]]

 # 2. NCBI platform annotation
 gpl <- annotation(gset)
 platf <- getGEO(gpl, AnnotGPL=TRUE)

 # reset column names
 fvarLabels(gset) <- make.names(fvarLabels(gset))

 ##
 # Data set has 36 samples corresponding to different brain regions of three
 # strains of mice. Samples from each strain are pooled into three groups.

 # group names for all samples
 sml <- c("G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G1","G1","G2","G0","G1","G1","G2","G0","G1","G1","G2","G0","G1","G1","G2");

 # strain names corresponding to group names
 labels <- c("stargazer","tottering","lethargic")

 ###
 # plot the samples as is

 oex <- exprs(gset)[,order(sml)]
 osml <- sml[order(sml)]
 ofl <- as.factor(sml)

 palette(c("#dfeaf4","#f4dfdf","#f2cb98", "#AABBCC"))
 dev.new(width=4+dim(gset)[[2]]/5, height=6)
 par(mar=c(2+round(max(nchar(sampleNames(gset)))/2),4,2,1))
 title <- paste ("GSE6275", '/', annotation(gset), " selected samples", sep ='')
 boxplot(oex, boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=ofl)
 legend("topleft", labels, fill=palette(), bty="n")

 ###
 # limma setup

 # 1. log2 transform
 ex <- exprs(gset)
 ex[which(ex <= 0)] <- NaN
 exprs(gset) <- log2(ex)

 # 2. setup model for individual expression
 fl <- as.factor(sml)
 gset$description <- fl
 design <- model.matrix(~ description + 0, gset)
 colnames(design) <- levels(fl)
 fit <- lmFit(gset, design)

 # 3. model for comparisons across three groups
 cont.matrix <- makeContrasts(G2-G0, G1-G0, G2-G1, levels=design)
 fit2 <- contrasts.fit(fit, cont.matrix)
 fit2 <- eBayes(fit2, 0.01)

 # 4. grab table of best 250 results
 tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)

 ###
 # use NCBI annotation (which genes)
 # note: uses platform data from beginning

 ncbifd <- data.frame(attr(dataTable(platf), "table"))

 # replace original platform annotation
 tT <- tT[setdiff(colnames(tT), setdiff(fvarLabels(gset), "ID"))]
 tT <- merge(tT, ncbifd, by="ID")
 tT <- tT[order(tT$P.Value), ]  # restore correct order

 tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","F","Gene.symbol","Gene.title"))

 ##dump the file 
 # but change the directory to yours first!
 #setwd("~/Documents/projects/dataclub/demo")
 #write.csv(tT,"toptable.csv",row.names=FALSE)
	# R example for working with GEO data using GEOquery and limma for
	# Seattle SIGKDD data club.
	# (riffing on auto generated R script for GEO2R service on GEO site)

	## libraries for everything
	library(Biobase)
	library(GEOquery)
	library(limma)

	###
	# Series info:
	# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE6275
	#
	# Note: we are getting the already normalized data. Different procedure
	# to start with raw data
	###

	## get data
	# 1. series data
	gset <- getGEO("GSE6275", GSEMatrix =TRUE)
	gset <- gset[[1]]

	# 2. NCBI platform annotation
	gpl <- annotation(gset)
	platf <- getGEO(gpl, AnnotGPL=TRUE)

	# reset column names
	fvarLabels(gset) <- make.names(fvarLabels(gset))

	##
	# Data set has 36 samples corresponding to different brain regions of three
	# strains of mice. Samples from each strain are pooled into three groups.

	# group names for all samples
	sml <- c("G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G1","G1","G2","G0","G1","G1","G2","G0","G1","G1","G2","G0","G1","G1","G2");

	# strain names corresponding to group names
	labels <- c("stargazer","tottering","lethargic")

	###
	# plot the samples as is

	oex <- exprs(gset)[,order(sml)]
	osml <- sml[order(sml)]
	ofl <- as.factor(sml)

	palette(c("#dfeaf4","#f4dfdf","#f2cb98", "#AABBCC"))
	dev.new(width=4+dim(gset)[[2]]/5, height=6)
	par(mar=c(2+round(max(nchar(sampleNames(gset)))/2),4,2,1))
	title <- paste ("GSE6275", '/', annotation(gset), " selected samples", sep ='')
	boxplot(oex, boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=ofl)
	legend("topleft", labels, fill=palette(), bty="n")

	###
	# limma setup

	# 1. log2 transform
	ex <- exprs(gset)
	ex[which(ex <= 0)] <- NaN
	exprs(gset) <- log2(ex)

	# 2. setup model for individual expression
	fl <- as.factor(sml)
	gset$description <- fl
	design <- model.matrix(~ description + 0, gset)
	colnames(design) <- levels(fl)
	fit <- lmFit(gset, design)

	# 3. model for comparisons across three groups
	cont.matrix <- makeContrasts(G2-G0, G1-G0, G2-G1, levels=design)
	fit2 <- contrasts.fit(fit, cont.matrix)
	fit2 <- eBayes(fit2, 0.01)

	# 4. grab table of best 250 results
	tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)

	###
	# use NCBI annotation (which genes)
	# note: uses platform data from beginning

	ncbifd <- data.frame(attr(dataTable(platf), "table"))

	# replace original platform annotation
	tT <- tT[setdiff(colnames(tT), setdiff(fvarLabels(gset), "ID"))]
	tT <- merge(tT, ncbifd, by="ID")
	tT <- tT[order(tT$P.Value), ] # restore correct order

	tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","F","Gene.symbol","Gene.title"))

	##dump the file
	# but change the directory to yours first!
	#setwd("~/Documents/projects/dataclub/demo")
	#write.csv(tT,"toptable.csv",row.names=FALSE)