Skip to content

Instantly share code, notes, and snippets.

@stephenturner
Last active June 6, 2021 03:17
Show Gist options
  • Save stephenturner/3413453c9ce67ecd0b4d to your computer and use it in GitHub Desktop.
Save stephenturner/3413453c9ce67ecd0b4d to your computer and use it in GitHub Desktop.
are genes "expressed?"
library(dplyr)
library(readr)
# Get data from http://dx.doi.org/10.6084/m9.figshare.1601975
browseURL("http://dx.doi.org/10.6084/m9.figshare.1601975")
countdata <- read_csv("http://files.figshare.com/2439061/GSE37704_featurecounts.csv")
head(countdata)
metadata <- read_csv("http://files.figshare.com/2439060/GSE37704_metadata.csv")
head(metadata)
# Give it a matrix of counts, get back a
counts2fpkm <- function(x, length, log=FALSE, prior.count=.25) {
# sanity checks
if (class(x)!="matrix") stop("x must be a matrix")
if (nrow(x)!=length(length)) stop("dimensions of count matrix and gene lengths don't match")
# library size is sum of reads in each sample
lib.size <- colSums(x)
if (log) {
# If you're log scaling, you'll have to add something to the zeros, so
# adjust the library sizes accordingly. prior.count is the average count to
# be added to each observation to avoid taking log of zero.
prior.count.scaled <- lib.size/mean(lib.size) * prior.count
lib.size <- lib.size + 2 * prior.count.scaled
}
# Per million
lib.size <- 1e-06 * lib.size
if (log) {
cpm <- log2(t((t(x) + prior.count.scaled)/lib.size))
} else {
cpm <- t(t(x)/lib.size)
}
# per kilobase
length.kb <- length/1000
if (log) {
fpkm <- cpm-log2(length.kb)
} else {
fpkm <- cpm/length.kb
}
return(fpkm)
}
# Make a count matrix
counts <- select(countdata, starts_with("SRR")) %>% as.matrix
head(counts, 10)
# get the fpkm and log fpkm
fpkm <- counts2fpkm(counts, length=countdata$length, log=FALSE)
head(fpkm, 10)
# this is better than log(fpkm(...)) because deals with zeros better.
log2fpkm <- counts2fpkm(counts, length=countdata$length, log=TRUE)
head(log2fpkm, 10)
# z score is (x-mu)/sigma
zscores <- apply(log2fpkm, 2, function(x) (x-mean(x)/sd(x)))
head(zscores)
# Some scatterplots
plot(counts, zscores) # not monotonic because of length
plot(fpkm, zscores) # monotonic, but skewed
plot(log2fpkm, zscores) #ahh, more like it
# Where's a good cutoff? zero?
hist(zscores, 500, col=1)
abline(v=0, col=2, lwd=3, lty=2)
# How many genes are "expressed" in each sample, using these criteria?
apply(zscores, 2, function(x) table(x>0)) %>% t
# Get an "expressed" matrix
expressed <- zscores>0
head(expressed)
# Get the indexes of the control and hoxa1 knockdown samples from the metadata
metadata
(indexes.control <- which(grepl("control_sirna", metadata$condition)))
(indexes.knockdn <- which(grepl("hoxa1_kd", metadata$condition)))
# Create data frame with whether it's expressed in ALL controls, all knockdowns, and the mean counts.
# add back the ensembl gene, and rearrange things, then join back to original data.
dfexpressed <- data_frame(control_expressed=apply(expressed[,indexes.control], 1, all),
knockdn_expressed=apply(expressed[,indexes.knockdn], 1, all),
control_meancounts=rowMeans(counts[, indexes.control]),
knockdn_meancounts=rowMeans(counts[, indexes.knockdn])) %>%
mutate(ensgene=countdata$ensgene) %>%
select(ensgene, everything()) %>%
inner_join(countdata, by="ensgene")
head(dfexpressed, 10)
library(ggplot2)
# Plot mean counts on log scale, red if expressed in both
ggplot(dfexpressed, aes(control_meancounts, knockdn_meancounts)) +
geom_point(aes(col=(control_expressed & knockdn_expressed)), alpha=1/2) +
scale_x_log10() + scale_y_log10() +
scale_colour_manual("Expressed in both", values=c("black", "red"))
# Plot mean counts on log scale, red if expressed in one but not other
ggplot(dfexpressed, aes(control_meancounts, knockdn_meancounts)) +
geom_point(aes(col=control_expressed!=knockdn_expressed), alpha=1/2) +
scale_x_log10() + scale_y_log10() +
scale_colour_manual("Different expression pattern", values=c("black", "red"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment