Last active
September 28, 2018 09:29
-
-
Save dinovski/298ee98efcd44ca9c3aae8c473bdf91b to your computer and use it in GitHub Desktop.
reanalysis of GSE26682
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/Rscript | |
#source("http://bioconductor.org/biocLite.R") | |
#biocLite("curatedCRCData") | |
#biocLite("inSilicoMerging") | |
library(Biobase) | |
library(GEOquery) | |
library(rgl) | |
library(inSilicoMerging) | |
library(gdata) | |
library(scatterplot3d) | |
## ABOUT | |
## 176 (eset1) and 155 samples (eset2) MSI and MSS tumors | |
## http://www.ncbi.nlm.nih.gov/geo/geo2r/?acc=GSE26682&platform=GPL96 | |
## Probes of the U133A array present in the U133A Plus 2.0 were selected and quantile normalized to mimic the distribution of the U133A array. | |
## For all 331 samples, MAS 5.0-calculated signal intensities were normalized using the quantile normalization procedure implemented in | |
## robust multiarray analysis and the normalized data were log 2 transformed. | |
## Sample specific median centering and scaling by the standard deviation were additionally applied. | |
## Probe sets that were not expressed or that exhibited low variability across samples were excluded. | |
gds<-getGEO("GSE26682", AnnotGPL=TRUE, getGPL=TRUE) #eset | |
## save locally | |
# saveRDS(gds, "gse26682.rds") | |
# saveRDS(exprs(gds[[1]]), "gse26682_exp1.rds") | |
# saveRDS(exprs(gds[[2]]), "gse26682_exp2.rds") | |
# exp1<-readRDS("~/Dropbox/brandeis/gse26682_exp1.rds") | |
# exp2<-readRDS("~/Dropbox/brandeis/gse26682_exp2.rds") | |
# exp.list<-list(exp1, exp2) | |
# exp.sets<-merge(exp1, expmethod="NONE") | |
#### | |
dim(exprs(gds[[1]])) | |
dim(exprs(gds[[2]])) | |
## merge esets | |
esets<-merge(gds, method="NONE") | |
## gene names | |
gene.info<-fData(esets) | |
gene.id<-gene.info[,1] #ID | |
gene.name<-gene.info[,3] #gene | |
gene.go<-gene.info[,16:17] #GO function | |
gene.summary<-cbind(as.data.frame(gene.id), as.data.frame(gene.name), as.data.frame(gene.go)) | |
gene.names<-cbind(as.data.frame(gene.id), as.data.frame(gene.name)) | |
## phenotypic data | |
pdata<-pData(esets) | |
pdata<-data.frame(pdata$geo_accession, pdata$characteristics_ch1, pdata$characteristics_ch1.1, pdata$characteristics_ch1.2) | |
names(pdata)<-c("geo_accession", "age", "gender", "status") | |
## write table and reformat on command line | |
#write.table(pdata, "~/geo_pdata.txt", row.names=F, quote=F, sep=",") | |
#### | |
## pdata | |
pdata<-read.csv("~/geo_pdata_reformat.txt", header=T) | |
rownames(pdata)<-pdata$geo_accession | |
pdata$geo_accession<-NULL | |
## expression data | |
#dim(exprs(esets)) | |
exp<-exprs(esets) | |
ge<-as.data.frame(t(exp)) | |
rownames(ge)==rownames(pdata) | |
## status = stable, low, high, unknown | |
e.m<-pdata$status | |
##################### | |
## initial analysis: MSS versus MSI high and low | |
## remove samples with unknown status (16) | |
sample.sel <- e.m != "Unknown" | |
ge<-ge[sample.sel,] | |
pdata<-pdata[sample.sel,] | |
e.m<-e.m[sample.sel] | |
rownames(ge)==rownames(pdata) | |
####2 factor class | |
## add column for stable (MSS) or unstable (MSI) | |
MSS<-as.factor("0") | |
MSI<-as.factor("1") | |
pdata$class<-apply(pdata, 1, function(x) if ( x[3] == "Stable" ) { return(MSS) } else { return(MSI) } ) | |
e.m<-pdata[,4] | |
## expression values = (ge) and pdata (or e.m) = with microsatellite stability status | |
## and gene.names df with name corresponding to each probe ID = 300 samples total: 218 stable | |
############ | |
## check expression levels across samples (that data were indeed quantile normalized and log2 transformed) | |
boxplot((t(ge)[,1:100]),las=3,cex.axis=0.8, col="lightgreen") | |
boxplot((t(ge)[,101:200]),las=3,cex.axis=0.8, col="lightgreen") | |
boxplot((t(ge)[,201:300]),las=3,cex.axis=0.8, col="lightgreen") | |
############ | |
## PCA | |
pca<-prcomp(ge, scale=T) | |
class<-pdata$class | |
plot(pca$x[,1], pca$x[,2], col=class, pch=19) | |
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=seq_along(levels(class))) | |
plot(pca) | |
pca.rpm=prcomp(ge,scale=T) | |
sc3=scatterplot3d(pca.rpm$x[,1:3],pch=20,highlight.3d=TRUE,cex.symbol=2) | |
s3d.coords = sc3$xyz.convert(pca.rpm$x[,1:3]) | |
text(s3d.coords$x,s3d.coords$y,labels=pdata[,3],cex=0.8,pos=4) | |
## kmeans : top BH corrected genes: 202836_s_at 213738_s_at | |
fit.e=kmeans(ge.top,3) | |
## assign cluster ID to sample | |
df<-data.frame(CLASS=pdata[,3],CLUST.ID=fit.e$cluster) | |
plot(ge.top[,1:2], pch=20,col=ifelse(fit.e$cluster==1,"blue","green")) | |
points(fit.e$centers,pch='x',cex=2,col='red') | |
fit.loge=kmeans(log2(ge+1),2) | |
df<-data.frame(CLASS=pdata[,3],CLUST.ID=fit.loge$cluster) | |
## 'ge' rows/obervations=samples and cols/variables=genes | |
## one outlier sample (GSM656860) | |
#### | |
## t-test for association between gene expression and microsatellite stability (stable versus high/low) | |
tt.pvals<-apply(ge,2,function(x) t.test(x ~ e.m)$p.value) | |
sort(tt.pvals)[1:20] | |
## add actual gene name to probe id/p value | |
df.pvals<-data.frame(tt.pvals) | |
df.pvals$gene.id<-rownames(df.pvals) | |
df.pvals<-inner_join(df.pvals, gene.names) | |
df.pvals<-df.pvals[ order(df.pvals[,1]), ] | |
####### logistic regression model on top genes from t-test | |
## top genes from t test all 300 samples | |
sel<-order(tt.pvals, decreasing=F) | |
top<-ge[,sel][1:80] | |
M<-glm(e.m ~ ., data=top,family="binomial") | |
assess.prediction(e.m, as.numeric(predict(M,type="response")>0.5)) | |
# Total cases that are not NA: 300 | |
# Correct predictions (accuracy): 280(93.3%) | |
# TPR (sensitivity)=TP/P: 82.9% | |
# TNR (specificity)=TN/N: 97.2% | |
# PPV (precision)=TP/(TP+FP): 91.9% | |
# FDR (false discovery)=1-PPV: 8.11% | |
# FPR =FP/N=1-TNR: 2.75% | |
## see how samples cluster for top 100 most significant genes | |
df.pvals$gene.id == data.frame(rownames(t(ge))) | |
df.pvals.100<-df.pvals.s[1:100,] | |
top.sel<-rownames(t(ge)) %in% df.pvals.100$gene.id | |
## select only those genes from ge | |
top.exp<-t(ge)[top.sel,] | |
## PCA | |
## columns = variables (genes) | |
pca<-prcomp(t(top.exp), scale=T) | |
class<-pdata$class | |
plot(pca$x[,1], pca$x[,2], col=class, pch=19) | |
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=seq_along(levels(class))) | |
## weak separation separation between MSS (0) and MSI (1) | |
## hierarchical clustering of top 100 genes from the t-test | |
d = dist(t(top.exp)) | |
d.log = dist(log2(t(top.exp)+1)) | |
labels<-rownames(top.exp) | |
plot(hclust(d,method="average"),label=pdata[,3], main="Average, count") | |
plot(hclust(d,method="ward.D2"),label=pdata[,3], main="Ward,count") | |
plot(hclust(d.log,method="average"),label=pdata[,3], main="Average, log-count") | |
plot(hclust(d.log,method="ward.D2"),label=pdata[,3], main="Ward, log-count") | |
## MSI high do form a separate cluster when using average linkage but the branch is not very high | |
pca.rpm=prcomp(t(top.exp),scale=T) | |
sc3=scatterplot3d(pca.rpm$x[,1:3],pch=20,highlight.3d=TRUE,cex.symbol=2) | |
s3d.coords = sc3$xyz.convert(pca.rpm$x[,1:3]) | |
text(s3d.coords$x,s3d.coords$y,labels=pdata[,3],cex=0.8,pos=4) | |
## can see a separate cluster for samples with high MSI, similar to the hierarchical clustering and PCA | |
## kmeans | |
fit.e=kmeans(ge,3) | |
## assign cluster ID to sample | |
df<-data.frame(CLASS=pdata[,3],CLUST.ID=fit.e$cluster) | |
## color by cluster | |
plot(ge[,1:2],pch=19,col=ifelse(fit.e$cluster==1,"green","lightblue"), main="kmeans, color by cluster") | |
points(fit.e$centers,pch='x',cex=2,col='red') | |
cr<-c("blue","green") | |
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=cr) | |
## Using k=2, can see more or less 2 clusters | |
fit.loge=kmeans(log2(ge+1),2) | |
data.frame(CLASS=pdata[,3],CLUST.ID=fit.loge$cluster) | |
############# | |
## following analysis is with 3 separate classes defined: (stable=0, low=1, high=2) | |
MSS<-as.factor("0") | |
MSI.low<-as.factor("1") | |
MSI.high<-as.factor("2") | |
pdata$class<-apply(pdata, 1, function(x) if ( x[3] == "Stable" ) { return(MSS) } else if (x[3] == "Low" ) { return(MSI.low) } else { return(MSI.high) } ) | |
## add classes to e.m variable | |
e.m<-pdata[,4] | |
## anova, using 3 distinct classes | |
MSS<-as.factor("0") | |
MSI.low<-as.factor("1") | |
MSI.high<-as.factor("2") | |
pdata$class<-apply(pdata, 1, function(x) if ( x[3] == "Stable" ) { return(MSS) } else if (x[3] == "Low" ) { return(MSI.low) } else { return(MSI.high) } ) | |
## add classes to e.m variable | |
e.m<-pdata[,4] | |
## anova for reg1a | |
gene<-ge[,c("209752_at")] | |
anova(lm(gene ~ pdata$class)) | |
# pval=5.891e-09 | |
## lowest variance | |
gene<-ge[,c("221131_at")] | |
anova(lm(gene ~ pdata$class)) | |
# pval = 0.7202 | |
## MLH1 | |
gene<-ge[,c("202520_s_at")] | |
anova(lm(gene ~ pdata$class)) | |
# pval = 2.2e-16 | |
## expression, 3 classes | |
with(pdata, boxplot( ge[,c("209752_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="REG1A expression" )) | |
with(pdata, boxplot(ge[,c("205886_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="REG1B expression" )) | |
with(pdata, boxplot(ge[,c("204673_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="MUC2 expression" )) | |
with(pdata, boxplot( ge[,c("221131_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="A4GNT expression" )) | |
## MLH1 | |
with(pdata, boxplot(ge[,c("202520_s_at")] ~ reorder(class), col=c("lightblue", "lightgreen", "orange"), ylim=c(6,16), main="MLH1 expression" )) | |
## variables=samples (columns) | |
pca<-prcomp(t(top.exp), scale=T) | |
class<-pdata$class | |
plot(pca$x[,1], pca$x[,2], col=class, pch=19, xlab="PC1", ylab="PC2") | |
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=seq_along(levels(class))) | |
## stable samples mostly overlap the low instablity samples | |
## kmeans top 2 genes (from t-test) | |
top2<-ge[,c("202836_s_at", "213738_s_at")] | |
fit.e=kmeans(ge,3) | |
## assign cluster ID to sample | |
df<-data.frame(CLASS=pdata[,3],CLUST.ID=fit.e$cluster) | |
attach(df) ; plot(top2, col=c("black","blue","green")[2]); detach(df) | |
c("black","orange","green")[df$CLUST.ID] | |
## color by cluster and add actual gene names | |
gene.names[gene.id == "202836_s_at",] | |
gene.names[gene.id == "213738_s_at",] | |
plot(top2,pch=19,col=c("black","orange","green")[df$CLUST.ID], main="kmeans, color by cluster", | |
xlab="TXNL4A", ylab="ATP5A1") | |
points(fit.e$centers,pch='x',cex=2,col='red') | |
legend("topright", legend=levels(as.factor(df$CLUST.ID)), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=c("black","orange","green") ) | |
## color by microsatellite class | |
df$CLASS.n<-factor(df$CLASS) # need to factor to remove unused level | |
df$CLASS<-NULL | |
attach(df) ; plot(top2, col=c("black","blue","green")[1]); detach(df) | |
col.class<-c("black","blue","green")[df$CLASS.n] | |
plot(top2, pch=19, col=col.class, main="color by class", | |
xlab="TXNL4A", ylab="ATP5A1") | |
legend("topright", legend=levels(df$CLASS.n), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=c("black","blue","green")) | |
#### | |
## try fit on top hit from t-test (stable versus unstable) | |
M<-glm(e.m ~ ge[,c("201464_x_at")],family="binomial") | |
predict(M, type="response")[1:5] | |
## convert fitted probabilites to category predictions | |
as.numeric(predict(M, type="response")[1:5]>0.5) | |
## compare predictions to known values of the tumor class model was fit to | |
assess.prediction=function(truth,predicted) { | |
predicted = predicted[ ! is.na(truth) ] | |
truth = truth[ ! is.na(truth) ] | |
truth = truth[ ! is.na(predicted) ] | |
predicted = predicted[ ! is.na(predicted) ] | |
cat("Total cases that are not NA: ",length(truth),"\n",sep="") | |
cat("Correct predictions (accuracy): ",sum(truth==predicted), | |
"(",signif(sum(truth==predicted)*100/length(truth),3),"%)\n",sep="") | |
TP = sum(truth==1 & predicted==1) | |
TN = sum(truth==0 & predicted==0) | |
FP = sum(truth==0 & predicted==1) | |
FN = sum(truth==1 & predicted==0) | |
P = TP+FN # total number of positives in the truth data | |
N = FP+TN # total number of negatives | |
cat("TPR (sensitivity)=TP/P: ", signif(100*TP/P,3),"%\n",sep="") | |
cat("TNR (specificity)=TN/N: ", signif(100*TN/N,3),"%\n",sep="") | |
cat("PPV (precision)=TP/(TP+FP): ", signif(100*TP/(TP+FP),3),"%\n",sep="") | |
cat("FDR (false discovery)=1-PPV: ", signif(100*FP/(TP+FP),3),"%\n",sep="") | |
cat("FPR =FP/N=1-TNR: ", signif(100*FP/N,3),"%\n",sep="") | |
} | |
## check predictions | |
assess.prediction(e.m,as.numeric(predict(M,type="response")>0.5)) | |
################################## | |
## analysis without MSI:Low samples | |
## remove samples with unknown status and "Low" stability = 253 total samples | |
sample.sel <- e.m != "Unknown" & e.m != "Low" | |
## remove unknown and MSI:low samples from expression, phenotypic data, and 'outcome' | |
ge<-ge[sample.sel,] | |
pdata<-pdata[sample.sel,] | |
e.m<-e.m[sample.sel] | |
## remove unused levels | |
e.m<-factor(e.m) | |
rownames(ge)==rownames(pdata) | |
## add column for stable (MSS) or MSI | |
MSS<-as.factor("0") | |
MSI<-as.factor("1") | |
pdata$class<-apply(pdata, 1, function(x) if ( x[3] == "Stable" ) { return(MSS) } else { return(MSI) } ) | |
e.m<-pdata[,4] | |
## PCA | |
pca<-prcomp(ge, scale=T) | |
class<-pdata$class | |
plot(pca$x[,1], pca$x[,2], col=class, pch=19, xlab="PC1", ylab="PC2") | |
legend("topright", legend=levels(class), pch=19, pt.lwd=2, lty=NULL, cex=0.5, col=seq_along(levels(class))) | |
## t-test:SI high versus MSS for 253 samples | |
tt.pvals<-apply(ge,2,function(x) t.test(x ~ e.m.filtered)$p.value) | |
## add actual gene name to probe id/p value | |
df.pvals<-data.frame(tt.pvals) | |
## FDR corrected p values | |
df.pvals$BH<-c(p.adjust(df.pvals$tt.pvals, method = "BH")) | |
df.pvals$gene.id<-rownames(df.pvals) | |
df.pvals<-inner_join(df.pvals, gene.names) | |
## sort from lowest FDR corrected pval | |
df.pvals<-df.pvals[ order(df.pvals[,2]), ] | |
df.pvals[1:20,] | |
## LR top hit (pval) | |
which.min(tt.pvals) | |
M<-glm(e.m ~ ge[,c("215780_s_at")],family="binomial") | |
predict(M,type="response")[1:5] | |
## convert fitted probabilites to category predictions | |
as.numeric(predict(M,type="response")[1:5]>0.5) | |
## compare predictions to known values of the tumor class we were fitting the model to | |
assess.prediction(e.m, as.numeric(predict(M,type="response")>0.5)) | |
## multiple top hits | |
sel<-order(tt.pvals, decreasing=F) | |
top<-ge[,sel][1:100] | |
M<-glm(e.m ~ ., data=top,family="binomial") | |
assess.prediction(e.m, as.numeric(predict(M,type="response")>0.5)) | |
## t-test on 253 samples did not improve predictive accuracy or sensitivity | |
## LR top hit (fdr corrected) | |
bh<-c(p.adjust(tt.pvals, method="BH")) | |
which.min(bh) | |
gene.names[gene.id=="202589_at",] | |
M<-glm(e.m ~ ge[,c("202589_at")],family="binomial") | |
predict(M,type="response")[1:5] | |
## convert fitted probabilites to category predictions | |
as.numeric(predict(M,type="response")[1:5]>0.5) | |
## compare predictions to known values of the tumor class model was fit to | |
assess.prediction(e.m, as.numeric(predict(M,type="response")>0.5)) | |
#### | |
## cluster top genes by variance | |
## convenience function to calculate correlation-based distance | |
## note: cor() calculates the matrix of pairwise correlation coefficients between columns (genes) | |
corr.dist=function(x) { as.dist(1-cor(x)) } | |
## calculate variations and means across columns (probe/gene) | |
vars=apply(ge,2,var) | |
means=apply(ge,2,mean) | |
## add status to sample names | |
# pdata.mod<-pdata | |
# pdata.mod$sample<-rownames(pdata.mod) | |
# pdata.mod$labels <- do.call(paste, c(pdata.mod[c("sample", "status")], sep = ":")) | |
# labels<-pdata.mod$labels | |
select<-order(vars,decreasing=T)[1:300] | |
ge[,select] | |
labels<-pdata[,3] | |
## correlation, top 300 vars | |
plot(hclust(corr.dist(t(ge[,select])), method="ward.D2"), label=labels) | |
plot(hclust(dist(t(ge[,select])), method="average"),label=labels) | |
select<-order(means,decreasing=T)[1:300] | |
## correlation, top 300 means | |
plot(hclust(corr.dist(t(ge[,select])), method="ward.D2"), label=labels) | |
plot(hclust(dist(ge[,select]), method="average"), label=labels) | |
############## | |
## naive bayes, 'ge' ordered by FDR corrected t-test significance: 108 with q<0.01 | |
bh.pvals<-p.adjust(tt.pvals, method = "BH") | |
sel<-order(bh.pvals, decreasing=F) | |
sel<-bh.pvals<0.01 | |
ge.top<-ge[,sel] #300 samples, 108 genes | |
Mb1=naiveBayes(e.m ~ . , data=data.frame(G=ge.top[,1])) | |
assess.prediction(e.m,predict(Mb1,data.frame(G=ge.top[,1]))) | |
n<-100 | |
Mb=naiveBayes(e.m ~ . , data=ge.top[,1:n]) | |
assess.prediction(e.m,predict(Mb,ge.top[,1:n])) | |
## leave n (samples) out, top 300 variances | |
library(class) | |
df<-ge.top | |
n = 0 | |
n.out = 5 | |
Nrep = 1000 | |
for ( i in 1:Nrep ) { | |
## randomly choose 5 observations to withhold and keep as the test set: | |
leave.out = sample(nrow(df),size=n.out) | |
pred = knn(df[-leave.out,1,drop=F], | |
df[leave.out,1,drop=F],e.m[-leave.out],k=1) | |
n = n + sum( pred==e.m[leave.out] ) | |
} | |
n/(n.out*Nrep) | |
#~65% accuracy | |
## supervised: SVM | |
M=svm(e.m~.,ge.top[,1:20],kernel="polynomial",degree=3) | |
assess.prediction(e.m,predict(M,ge.top[,1:20])) | |
M=svm(e.m~.,ge.top[,1:20],kernel="radial") | |
assess.prediction(e.m,predict(M,ge.top[,1:20])) | |
M=svm(e.m~.,ge.top[,1:20,drop=F],kernel="radial",gamma=0.1) | |
assess.prediction(e.m,predict(M,ge.top[,1:20])) | |
## leave-n-out cross validation | |
df<-ge.top #300 samples, 108 genes (t-test FDR cutoff <0.01) | |
predictor.LR$train(e.m ~ . , df[1:20]) | |
assess.prediction(e.m, predictor.LR$predict(df[1:20])) | |
cv.LR.20=cross.validate(predictor.LR, e.m ~ . , df[1:20]) | |
assess.prediction(cv.LR.20$truth,cv.LR.20$prediction) | |
## similar to LR on top variance and top hits from t-test | |
#[1] 80.46 | |
#$TPR | |
#[1] 45.17804 | |
#$TNR | |
#[1] 93.48302 | |
#$PPV | |
#[1] 71.90083 | |
#$FDR | |
#[1] 28.09917 | |
#$FPR | |
#[1] 6.516977 | |
####################### | |
#http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0071755 | |
#getGEO("GSE36335") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment