Created
May 19, 2015 22:10
-
-
Save bjkeller/87d0b334463bec9f8fb5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R example for working with GEO data using GEOquery and limma for | |
# Seattle SIGKDD data club. | |
# (riffing on auto generated R script for GEO2R service on GEO site) | |
## libraries for everything | |
library(Biobase) | |
library(GEOquery) | |
library(limma) | |
### | |
# Series info: | |
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE6275 | |
# | |
# Note: we are getting the already normalized data. Different procedure | |
# to start with raw data | |
### | |
## get data | |
# 1. series data | |
gset <- getGEO("GSE6275", GSEMatrix =TRUE) | |
gset <- gset[[1]] | |
# 2. NCBI platform annotation | |
gpl <- annotation(gset) | |
platf <- getGEO(gpl, AnnotGPL=TRUE) | |
# reset column names | |
fvarLabels(gset) <- make.names(fvarLabels(gset)) | |
## | |
# Data set has 36 samples corresponding to different brain regions of three | |
# strains of mice. Samples from each strain are pooled into three groups. | |
# group names for all samples | |
sml <- c("G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G0","G1","G2","G2","G0","G1","G1","G2","G0","G1","G1","G2","G0","G1","G1","G2","G0","G1","G1","G2"); | |
# strain names corresponding to group names | |
labels <- c("stargazer","tottering","lethargic") | |
### | |
# plot the samples as is | |
oex <- exprs(gset)[,order(sml)] | |
osml <- sml[order(sml)] | |
ofl <- as.factor(sml) | |
palette(c("#dfeaf4","#f4dfdf","#f2cb98", "#AABBCC")) | |
dev.new(width=4+dim(gset)[[2]]/5, height=6) | |
par(mar=c(2+round(max(nchar(sampleNames(gset)))/2),4,2,1)) | |
title <- paste ("GSE6275", '/', annotation(gset), " selected samples", sep ='') | |
boxplot(oex, boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=ofl) | |
legend("topleft", labels, fill=palette(), bty="n") | |
### | |
# limma setup | |
# 1. log2 transform | |
ex <- exprs(gset) | |
ex[which(ex <= 0)] <- NaN | |
exprs(gset) <- log2(ex) | |
# 2. setup model for individual expression | |
fl <- as.factor(sml) | |
gset$description <- fl | |
design <- model.matrix(~ description + 0, gset) | |
colnames(design) <- levels(fl) | |
fit <- lmFit(gset, design) | |
# 3. model for comparisons across three groups | |
cont.matrix <- makeContrasts(G2-G0, G1-G0, G2-G1, levels=design) | |
fit2 <- contrasts.fit(fit, cont.matrix) | |
fit2 <- eBayes(fit2, 0.01) | |
# 4. grab table of best 250 results | |
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) | |
### | |
# use NCBI annotation (which genes) | |
# note: uses platform data from beginning | |
ncbifd <- data.frame(attr(dataTable(platf), "table")) | |
# replace original platform annotation | |
tT <- tT[setdiff(colnames(tT), setdiff(fvarLabels(gset), "ID"))] | |
tT <- merge(tT, ncbifd, by="ID") | |
tT <- tT[order(tT$P.Value), ] # restore correct order | |
tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","F","Gene.symbol","Gene.title")) | |
##dump the file | |
# but change the directory to yours first! | |
#setwd("~/Documents/projects/dataclub/demo") | |
#write.csv(tT,"toptable.csv",row.names=FALSE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment