pmagwene · March 31, 2017 05:03
diff --git a/filter_zero_var_genes.R b/filter_zero_var_genes.R
 library(tidyr)
 library(dplyr)
 library(magrittr)
 library(ggplot2)

 # Load the Causton heat expression data set from CSV.
 # (Background: see the data/focal papers for groups 4 and 9.)
 causton <- read.csv(file = "causton-2001-heat-expression.csv")

 # Preview the first rows. Genes are in rows; the time points and the
 # gene names are in columns.
 head(causton)

 # Reshape from wide to long: every column except ORF and Gene is stacked
 # into a (time, expression) key/value pair, then rows are ordered by ORF.
 # NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0;
 # kept here so the script still runs on older tidyr versions.
 causton.long <- causton %>%
   gather(key = time, value = expression, -ORF, -Gene) %>%
   arrange(ORF)

 # Preview the long-format table.
 head(causton.long)

 # Variance of expression for each ORF; missing expression values are
 # ignored when computing the variance (na.rm = TRUE).
 causton.var <- causton.long %>%
   group_by(ORF) %>%
   summarise(var = var(expression, na.rm = TRUE))

 # Vector of ORFs whose variance is non-zero. filter() keeps only rows
 # where the condition is TRUE, so ORFs with an NA variance are dropped too.
 # with() exposes the ORF column of the filtered table (what %$% does).
 non.zero.var.ORFs <- with(filter(causton.var, var != 0), ORF)

 # Keep only the long-format rows belonging to those ORFs.
 causton.trim <- filter(causton.long, ORF %in% non.zero.var.ORFs)

 # Back to a "wide" layout suited to correlation: one column per ORF and
 # one row per time point. The Gene name column is removed before spreading.
 # NOTE(review): spread() is superseded by pivot_wider() in tidyr >= 1.0.
 causton.wide <- causton.trim %>%
   select(-Gene) %>%
   spread(key = ORF, value = expression)

 # Gene-by-gene correlation matrix. The time column is removed so only the
 # per-ORF expression columns enter cor(); each pairwise correlation uses
 # the observations that are complete for that pair of columns.
 causton.cor <- cor(select(causton.wide, -time), use = "pairwise.complete.obs")
	library(tidyr)
	library(dplyr)
	library(magrittr)
	library(ggplot2)

	# Load the Causton heat expression data set from CSV.
	# (Background: see the data/focal papers for groups 4 and 9.)
	causton <- read.csv(file = "causton-2001-heat-expression.csv")

	# Preview the first rows. Genes are in rows; the time points and the
	# gene names are in columns.
	head(causton)

	# Reshape from wide to long: every column except ORF and Gene is stacked
	# into a (time, expression) key/value pair, then rows are ordered by ORF.
	# NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0;
	# kept here so the script still runs on older tidyr versions.
	causton.long <- causton %>%
	  gather(key = time, value = expression, -ORF, -Gene) %>%
	  arrange(ORF)

	# Preview the long-format table.
	head(causton.long)

	# Variance of expression for each ORF; missing expression values are
	# ignored when computing the variance (na.rm = TRUE).
	causton.var <- causton.long %>%
	  group_by(ORF) %>%
	  summarise(var = var(expression, na.rm = TRUE))

	# Vector of ORFs whose variance is non-zero. filter() keeps only rows
	# where the condition is TRUE, so ORFs with an NA variance are dropped too.
	# with() exposes the ORF column of the filtered table (what %$% does).
	non.zero.var.ORFs <- with(filter(causton.var, var != 0), ORF)

	# Keep only the long-format rows belonging to those ORFs.
	causton.trim <- filter(causton.long, ORF %in% non.zero.var.ORFs)

	# Back to a "wide" layout suited to correlation: one column per ORF and
	# one row per time point. The Gene name column is removed before spreading.
	# NOTE(review): spread() is superseded by pivot_wider() in tidyr >= 1.0.
	causton.wide <- causton.trim %>%
	  select(-Gene) %>%
	  spread(key = ORF, value = expression)

	# Gene-by-gene correlation matrix. The time column is removed so only the
	# per-ORF expression columns enter cor(); each pairwise correlation uses
	# the observations that are complete for that pair of columns.
	causton.cor <- cor(select(causton.wide, -time), use = "pairwise.complete.obs")