Last active
June 6, 2021 03:17
-
-
Save stephenturner/3413453c9ce67ecd0b4d to your computer and use it in GitHub Desktop.
are genes "expressed?"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(dplyr) | |
| library(readr) | |
| # Get data from http://dx.doi.org/10.6084/m9.figshare.1601975 | |
| browseURL("http://dx.doi.org/10.6084/m9.figshare.1601975") | |
| countdata <- read_csv("http://files.figshare.com/2439061/GSE37704_featurecounts.csv") | |
| head(countdata) | |
| metadata <- read_csv("http://files.figshare.com/2439060/GSE37704_metadata.csv") | |
| head(metadata) | |
| # Give it a matrix of counts, get back a | |
| counts2fpkm <- function(x, length, log=FALSE, prior.count=.25) { | |
| # sanity checks | |
| if (class(x)!="matrix") stop("x must be a matrix") | |
| if (nrow(x)!=length(length)) stop("dimensions of count matrix and gene lengths don't match") | |
| # library size is sum of reads in each sample | |
| lib.size <- colSums(x) | |
| if (log) { | |
| # If you're log scaling, you'll have to add something to the zeros, so | |
| # adjust the library sizes accordingly. prior.count is the average count to | |
| # be added to each observation to avoid taking log of zero. | |
| prior.count.scaled <- lib.size/mean(lib.size) * prior.count | |
| lib.size <- lib.size + 2 * prior.count.scaled | |
| } | |
| # Per million | |
| lib.size <- 1e-06 * lib.size | |
| if (log) { | |
| cpm <- log2(t((t(x) + prior.count.scaled)/lib.size)) | |
| } else { | |
| cpm <- t(t(x)/lib.size) | |
| } | |
| # per kilobase | |
| length.kb <- length/1000 | |
| if (log) { | |
| fpkm <- cpm-log2(length.kb) | |
| } else { | |
| fpkm <- cpm/length.kb | |
| } | |
| return(fpkm) | |
| } | |
| # Make a count matrix | |
| counts <- select(countdata, starts_with("SRR")) %>% as.matrix | |
| head(counts, 10) | |
| # get the fpkm and log fpkm | |
| fpkm <- counts2fpkm(counts, length=countdata$length, log=FALSE) | |
| head(fpkm, 10) | |
| # this is better than log(fpkm(...)) because deals with zeros better. | |
| log2fpkm <- counts2fpkm(counts, length=countdata$length, log=TRUE) | |
| head(log2fpkm, 10) | |
| # z score is (x-mu)/sigma | |
| zscores <- apply(log2fpkm, 2, function(x) (x-mean(x)/sd(x))) | |
| head(zscores) | |
| # Some scatterplots | |
| plot(counts, zscores) # not monotonic because of length | |
| plot(fpkm, zscores) # monotonic, but skewed | |
| plot(log2fpkm, zscores) #ahh, more like it | |
| # Where's a good cutoff? zero? | |
| hist(zscores, 500, col=1) | |
| abline(v=0, col=2, lwd=3, lty=2) | |
| # How many genes are "expressed" in each sample, using these criteria? | |
| apply(zscores, 2, function(x) table(x>0)) %>% t | |
| # Get an "expressed" matrix | |
| expressed <- zscores>0 | |
| head(expressed) | |
| # Get the indexes of the control and hoxa1 knockdown samples from the metadata | |
| metadata | |
| (indexes.control <- which(grepl("control_sirna", metadata$condition))) | |
| (indexes.knockdn <- which(grepl("hoxa1_kd", metadata$condition))) | |
| # Create data frame with whether it's expressed in ALL controls, all knockdowns, and the mean counts. | |
| # add back the ensembl gene, and rearrange things, then join back to original data. | |
| dfexpressed <- data_frame(control_expressed=apply(expressed[,indexes.control], 1, all), | |
| knockdn_expressed=apply(expressed[,indexes.knockdn], 1, all), | |
| control_meancounts=rowMeans(counts[, indexes.control]), | |
| knockdn_meancounts=rowMeans(counts[, indexes.knockdn])) %>% | |
| mutate(ensgene=countdata$ensgene) %>% | |
| select(ensgene, everything()) %>% | |
| inner_join(countdata, by="ensgene") | |
| head(dfexpressed, 10) | |
| library(ggplot2) | |
| # Plot mean counts on log scale, red if expressed in both | |
| ggplot(dfexpressed, aes(control_meancounts, knockdn_meancounts)) + | |
| geom_point(aes(col=(control_expressed & knockdn_expressed)), alpha=1/2) + | |
| scale_x_log10() + scale_y_log10() + | |
| scale_colour_manual("Expressed in both", values=c("black", "red")) | |
| # Plot mean counts on log scale, red if expressed in one but not other | |
| ggplot(dfexpressed, aes(control_meancounts, knockdn_meancounts)) + | |
| geom_point(aes(col=control_expressed!=knockdn_expressed), alpha=1/2) + | |
| scale_x_log10() + scale_y_log10() + | |
| scale_colour_manual("Different expression pattern", values=c("black", "red")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment