stephenturner · June 6, 2021 03:17
diff --git a/expressed_or_not.R b/expressed_or_not.R
 library(dplyr)
 library(readr)

 # Get data from http://dx.doi.org/10.6084/m9.figshare.1601975
 browseURL("http://dx.doi.org/10.6084/m9.figshare.1601975")
 countdata <- read_csv("http://files.figshare.com/2439061/GSE37704_featurecounts.csv")
 head(countdata)
 metadata <- read_csv("http://files.figshare.com/2439060/GSE37704_metadata.csv")
 head(metadata)

 # Give it a matrix of counts, get back a
 counts2fpkm <- function(x, length, log=FALSE, prior.count=.25) {
  # sanity checks
  if (class(x)!="matrix") stop("x must be a matrix")
  if (nrow(x)!=length(length)) stop("dimensions of count matrix and gene lengths don't match")
  # library size is sum of reads in each sample
  lib.size <- colSums(x)
  if (log) {
    # If you're log scaling, you'll have to add something to the zeros, so
    # adjust the library sizes accordingly. prior.count is the average count to
    # be added to each observation to avoid taking log of zero.
    prior.count.scaled <- lib.size/mean(lib.size) * prior.count
    lib.size <- lib.size + 2 * prior.count.scaled
  }
  # Per million
  lib.size <- 1e-06 * lib.size
  if (log) {
    cpm <- log2(t((t(x) + prior.count.scaled)/lib.size))
  } else {
    cpm <- t(t(x)/lib.size)
  }
  # per kilobase
  length.kb <- length/1000
  if (log) {
    fpkm <- cpm-log2(length.kb)
  } else {
    fpkm <- cpm/length.kb
  }
  return(fpkm)
 }

 # Make a count matrix
 counts <- select(countdata, starts_with("SRR")) %>% as.matrix
 head(counts, 10)

 # get the fpkm and log fpkm
 fpkm     <- counts2fpkm(counts, length=countdata$length, log=FALSE)
 head(fpkm, 10)

 # this is better than log(fpkm(...)) because deals with zeros better.
 log2fpkm <- counts2fpkm(counts, length=countdata$length, log=TRUE)
 head(log2fpkm, 10)

 # z score is (x-mu)/sigma
 zscores <- apply(log2fpkm, 2, function(x) (x-mean(x)/sd(x)))
 head(zscores)

 # Some scatterplots
 plot(counts, zscores) # not monotonic because of length
 plot(fpkm, zscores) # monotonic, but skewed
 plot(log2fpkm, zscores) #ahh, more like it

 # Where's a good cutoff? zero?
 hist(zscores, 500, col=1)
 abline(v=0, col=2, lwd=3, lty=2)

 # How many genes are "expressed" in each sample, using these criteria?
 apply(zscores, 2, function(x) table(x>0)) %>% t

 # Get an "expressed" matrix
 expressed <- zscores>0
 head(expressed)

 # Get the indexes of the control and hoxa1 knockdown samples from the metadata
 metadata
 (indexes.control <- which(grepl("control_sirna", metadata$condition)))
 (indexes.knockdn <- which(grepl("hoxa1_kd", metadata$condition)))

 # Create data frame with whether it's expressed in ALL controls, all knockdowns, and the mean counts.
 # add back the ensembl gene, and rearrange things, then join back to original data.
 dfexpressed <- data_frame(control_expressed=apply(expressed[,indexes.control], 1, all),
                          knockdn_expressed=apply(expressed[,indexes.knockdn], 1, all),
                          control_meancounts=rowMeans(counts[, indexes.control]),
                          knockdn_meancounts=rowMeans(counts[, indexes.knockdn])) %>%
  mutate(ensgene=countdata$ensgene) %>%
  select(ensgene, everything()) %>%
  inner_join(countdata, by="ensgene")

 head(dfexpressed, 10)


 library(ggplot2)

 # Plot mean counts on log scale, red if expressed in both
 ggplot(dfexpressed, aes(control_meancounts, knockdn_meancounts)) +
  geom_point(aes(col=(control_expressed & knockdn_expressed)), alpha=1/2) +
  scale_x_log10() + scale_y_log10() +
  scale_colour_manual("Expressed in both", values=c("black", "red"))

 # Plot mean counts on log scale, red if expressed in one but not other
 ggplot(dfexpressed, aes(control_meancounts, knockdn_meancounts)) +
  geom_point(aes(col=control_expressed!=knockdn_expressed), alpha=1/2) +
  scale_x_log10() + scale_y_log10() +
  scale_colour_manual("Different expression pattern", values=c("black", "red"))
	library(dplyr)
	library(readr)

	# Get data from http://dx.doi.org/10.6084/m9.figshare.1601975
	browseURL("http://dx.doi.org/10.6084/m9.figshare.1601975")
	countdata <- read_csv("http://files.figshare.com/2439061/GSE37704_featurecounts.csv")
	head(countdata)
	metadata <- read_csv("http://files.figshare.com/2439060/GSE37704_metadata.csv")
	head(metadata)

	# Give it a matrix of counts, get back a
	counts2fpkm <- function(x, length, log=FALSE, prior.count=.25) {
	# sanity checks
	if (class(x)!="matrix") stop("x must be a matrix")
	if (nrow(x)!=length(length)) stop("dimensions of count matrix and gene lengths don't match")
	# library size is sum of reads in each sample
	lib.size <- colSums(x)
	if (log) {
	# If you're log scaling, you'll have to add something to the zeros, so
	# adjust the library sizes accordingly. prior.count is the average count to
	# be added to each observation to avoid taking log of zero.
	prior.count.scaled <- lib.size/mean(lib.size) * prior.count
	lib.size <- lib.size + 2 * prior.count.scaled
	}
	# Per million
	lib.size <- 1e-06 * lib.size
	if (log) {
	cpm <- log2(t((t(x) + prior.count.scaled)/lib.size))
	} else {
	cpm <- t(t(x)/lib.size)
	}
	# per kilobase
	length.kb <- length/1000
	if (log) {
	fpkm <- cpm-log2(length.kb)
	} else {
	fpkm <- cpm/length.kb
	}
	return(fpkm)
	}

	# Make a count matrix
	counts <- select(countdata, starts_with("SRR")) %>% as.matrix
	head(counts, 10)

	# get the fpkm and log fpkm
	fpkm <- counts2fpkm(counts, length=countdata$length, log=FALSE)
	head(fpkm, 10)

	# this is better than log(fpkm(...)) because deals with zeros better.
	log2fpkm <- counts2fpkm(counts, length=countdata$length, log=TRUE)
	head(log2fpkm, 10)

	# z score is (x-mu)/sigma
	zscores <- apply(log2fpkm, 2, function(x) (x-mean(x)/sd(x)))
	head(zscores)

	# Some scatterplots
	plot(counts, zscores) # not monotonic because of length
	plot(fpkm, zscores) # monotonic, but skewed
	plot(log2fpkm, zscores) #ahh, more like it

	# Where's a good cutoff? zero?
	hist(zscores, 500, col=1)
	abline(v=0, col=2, lwd=3, lty=2)

	# How many genes are "expressed" in each sample, using these criteria?
	apply(zscores, 2, function(x) table(x>0)) %>% t

	# Get an "expressed" matrix
	expressed <- zscores>0
	head(expressed)

	# Get the indexes of the control and hoxa1 knockdown samples from the metadata
	metadata
	(indexes.control <- which(grepl("control_sirna", metadata$condition)))
	(indexes.knockdn <- which(grepl("hoxa1_kd", metadata$condition)))

	# Create data frame with whether it's expressed in ALL controls, all knockdowns, and the mean counts.
	# add back the ensembl gene, and rearrange things, then join back to original data.
	dfexpressed <- data_frame(control_expressed=apply(expressed[,indexes.control], 1, all),
	knockdn_expressed=apply(expressed[,indexes.knockdn], 1, all),
	control_meancounts=rowMeans(counts[, indexes.control]),
	knockdn_meancounts=rowMeans(counts[, indexes.knockdn])) %>%
	mutate(ensgene=countdata$ensgene) %>%
	select(ensgene, everything()) %>%
	inner_join(countdata, by="ensgene")

	head(dfexpressed, 10)


	library(ggplot2)

	# Plot mean counts on log scale, red if expressed in both
	ggplot(dfexpressed, aes(control_meancounts, knockdn_meancounts)) +
	geom_point(aes(col=(control_expressed & knockdn_expressed)), alpha=1/2) +
	scale_x_log10() + scale_y_log10() +
	scale_colour_manual("Expressed in both", values=c("black", "red"))

	# Plot mean counts on log scale, red if expressed in one but not other
	ggplot(dfexpressed, aes(control_meancounts, knockdn_meancounts)) +
	geom_point(aes(col=control_expressed!=knockdn_expressed), alpha=1/2) +
	scale_x_log10() + scale_y_log10() +
	scale_colour_manual("Different expression pattern", values=c("black", "red"))