Last active
December 23, 2020 09:19
-
-
Save dinovski/e9aaa99880a3354b58dcbd50a59d5039 to your computer and use it in GitHub Desktop.
import cell sorting data and perform automated gating using unsupervised learning (k-means)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/Rscript | |
## Perform automated gating of FACS data using unsupervised clustering and regression analysis. | |
## Data is first filtered to exclude debris (low FSC and high SSC) and doublets (FSC-A v FSC-H) | |
## and the CSV is exported using FlowJo (flowjo.com). | |
## This analysis was designed to assess the effects of various microsatellite lengths on gene expression | |
## using a reporter assay. Briefly, varying 'eSTRs' (gBlock gene fragments) were cloned upstream of a CMV | |
## promoter driving GFP reporter gene expression and co-transfected with an RFP expression plasmid at a | |
## raio of 2:1. Reference samples of GFP:RFP were co-transfected at various ratios (1:1, 2:1, 3:1, 4:1) and | |
## prepared in triplicate for each experiment. | |
## WORKFLOW | |
## 1. Import CSV with fluorescent measures | |
## 2. Filter extreme values | |
## 3. Perform k-means clustering to gate cell populations | |
## 4. Fit linear model and compute MSE | |
## 5. Export slope of regression to compare signal across conditions | |
## LOAD ENVIRONMENT | |
library(mclust) | |
library(fpc) | |
library(tclust) | |
library(dplyr) | |
library(plotrix) | |
interactive() | |
## INPUT | |
inPath='/path/to/csvs/' | |
outPath='/path/to/export/' | |
sampMarker='FITC.A' | |
refMarker='E.Texas.Red.A' | |
## References | |
refs<-list.files(path=inPath, recursive=FALSE, pattern=".csv", full.names=TRUE) | |
x<-c(1, 1.5, 2, 2.5, 3, 4) ## Transfection ratio of sampMarker to refMarker | |
xlab<-"Tfxn ratio GFP:RFP" | |
main<-"Reference plate" | |
lrmain=Sys.date() | |
## Samples | |
samps<-list.files(path=inPath, recursive=FALSE, pattern=".csv", full.names=TRUE) | |
x<-c(0, 0, 0, 11, 11, 11, 16, 16, 16, 21, 21, 21) ## STR allele length | |
xlab<-"STR allele length" | |
main<-"500ng gblock 2:1 GFP:RFP" ## Experiment name | |
lrmain<-Sys.date() | |
##--------------- | |
## FUNCTIONS | |
## fit lm with automated gating | |
clust_analysis <- function (filenames) { | |
slopes<-numeric() | |
up<-numeric() | |
low<-numeric() | |
lr<-numeric() | |
for (i in 1:length(filenames)) { | |
print(filenames[i]) | |
f <- read.csv(filenames[i], header=TRUE) | |
## Filter extreme values | |
f<-f[f[,c(sampMarker)] < max(f[,c(sampMarker)]),] | |
#f<-f[f[,c(refMarker)] < max(f[,c(refMarker)]),] | |
## Select samp and ref marker columns | |
g<-f[,c(sampMarker)] | |
r<-f[,c(refMarker)] | |
## BEGIN LOG | |
#plot log scale | |
plot(xy.coords(r,g),xlim=c(1,max(r)),ylim=c(1,max(g)),log="xy",xlab="RFP",ylab="GFP") | |
#plot linear | |
#gr<-cbind(r,g) | |
#plot(gr[,1], gr[,2]) | |
grlinear<-cbind(r, g) | |
## gate on log scale | |
gr<-grlinear + 1-min(grlinear) | |
gr<-log10(gr) | |
## END LOG | |
#within SS | |
#wss <- (nrow(gr)-1)*sum(apply(gr,2,var)) | |
#for (i in 2:10) wss[i] <- sum(kmeans(gr, centers=i)$withinss) #SS | |
#plot(1:10, wss, type="b", xlab="num clusters", ylab="within group sum of squares") | |
#tclus<-tkmeans(gr, k=2, alpha = 0.01) | |
tclus<-tclust(gr, k=2, alpha=0.01) | |
plot(tclus, main="tclust", xlab=c(refMarker), ylab=c(sampMarker)) | |
#readline(prompt="enter") | |
#append cluster info to linear data | |
gr <- data.frame(grlinear, tclus$cluster) | |
gr1<-gr[gr[,3]==1,] | |
gr2<-gr[gr[,3]==2,] | |
#gr3<-gr[gr[,3]==3,] | |
#gr4<-gr[gr[,3]==4,] | |
## plot diff clusters | |
#plot(gr1[,1], gr1[,2], main="cluster 1") | |
#readline(prompt="enter to continue") | |
#plot(gr2[,1], gr2[,2], main="cluster 2") | |
#readline(prompt="enter to continue") | |
#plot(gr3[,1], gr3[,2], main="cluster 3") | |
#readline(prompt="enter to continue") | |
#plot(gr4[,1], gr4[,2], main="cluster 4") | |
#readline(prompt="enter to continue") | |
## linear regression sampMarker ~ refMarker | |
fit<-lm(gr2[,c(sampMarker)] ~ gr2[,c(refMarker)]) | |
## slope and intercept | |
b <- fit$coefficients[2] | |
a <- fit$coefficients[1] | |
## plot sampMarker v. refMarker cluster | |
plot(gr1[,1], gr1[,2], pch=20, bg='blue', col='blue') | |
abline(a,b, col = 'red') | |
slopes<-append(slopes, b) | |
## confidence intervals | |
ci <- confint(fit, level=0.95) # CI for slope | |
u <-ci[4] | |
l <- ci[2] | |
up<-append(up, u) | |
low<-append(low, l) | |
} | |
plotCI(x, slopes, up-slopes, slopes-low, main=main, xlab=xlab, ylab=paste("slope(", sampMarker, "~" ,refMarker, ")")) | |
print(u) | |
print(l) | |
} | |
## fit lm without automated gating | |
facs_analyze <- function (filenames) { | |
slopes <- numeric() | |
up<-numeric() | |
low<-numeric() | |
summaries<-NULL | |
for (i in 1:length(filenames)) { | |
print(filenames[i]) | |
f <- read.csv(filenames[i], header=TRUE) | |
## remove extreme values | |
f<-f[f[,(sampMarker)]<max(f[,c(sampMarker)]),] | |
f<-f[f[,c(refMarker)]<max(f[,c(refMarker)]),] | |
#c<-cor(f[,c(refMarker)], f[,c(sampMarker)]) | |
#c<-cor.test(f[,c(refMarker)], f[,c(sampMarker)], method="pearson") | |
#print(c) | |
## fit linear model | |
fit <- lm(f[,c(sampMarker)] ~ f[,c(refMarker)]) | |
## MSE | |
sm<-summary(fit) | |
fitsum<-mean(sm$residuals^2) | |
summaries<-append(summaries, fitsum) | |
## slope and intercept | |
b <- fit$coefficients[2] | |
a <- fit$coefficients[1] | |
## plot sampMarker v. refMarker | |
#lr<-plot(f[,c(refMarker)], f[,c(sampMarker)], pch=20, bg='blue', col='blue', main=lrmain) | |
#abline(a,b, col = 'red') | |
#read <- readline( paste(sampMarker, "~", refMarker, ": press [enter] to continue")) | |
## append slope | |
slopes<-append(slopes, b) | |
## confidence intervals | |
ci <- confint(fit, "f[,c(refMarker)]", level=0.95) # CI of slope | |
u <-ci[2] | |
l <- ci[1] | |
up<-append(up, u) | |
low<-append(low, l) | |
} | |
## print MSE info with condition IDs(x) | |
names(summaries)<-c(x) | |
print(summaries) | |
names(slopes)<-c(x) | |
print(slopes) | |
## plot transfection ratios and slopes of samp:ref regression with CIs | |
plotCI(x, slopes, up-slopes, slopes-low, main=main, xlab=xlab, ylab=paste("slope(", sampMarker, "~" ,refMarker, ")")) | |
} | |
## Execute functions | |
facs_analyze(refs) | |
facs_analyze(samps) | |
clust_analysis(refs) | |
clust_analysis(samps) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment