pmagwene · March 29, 2017 21:32
diff --git a/filter_genes_by_NA.R b/filter_genes_by_NA.R
 library(tidyr)
 library(dplyr)
 library(magrittr)
 library(ggplot2)

 # Filter the Spellman expression matrix down to genes with few missing
 # values, then compare the dimensions before and after filtering.

 # load data in "wide" format (one column per gene, plus `expt` and `time`)
 spellman <- read.csv("spellman-reformated.csv")

 # restructure in "long" format: one row per (expt, time, gene) measurement
 spellman.long <- gather(spellman, gene, expression, -expt, -time)

 # group by gene and calculate the number of missing (NA) values per gene
 spellman.na <-
   spellman.long %>%
   group_by(gene) %>%
   summarize(na.count = sum(is.na(expression)))

 # A gene is kept only when it has strictly fewer than `na.cutoff` missing
 # values (i.e. at most na.cutoff - 1).
 # NOTE(review): this comment previously read "no more than 5 values are
 # missing", which would be `na.count <= 5`. The strict comparison is kept so
 # the filtered result does not change -- confirm which rule is intended.
 na.cutoff <- 5

 # names of the genes that pass the missing-value filter (%$% exposes the
 # `gene` column of the filtered table)
 good.genes <-
   spellman.na %>%
   filter(na.count < na.cutoff) %$% gene

 # select corresponding columns; `expt` and `time` are not in good.genes, so
 # the result holds gene columns only
 spellman.filtered <-
   spellman %>%
   select(one_of(good.genes))

 dim(spellman)  # dimensions of original data
 dim(spellman.filtered) # dimensions of filtered data
	library(tidyr)
	library(dplyr)
	library(magrittr)
	library(ggplot2)

	# Filter the Spellman expression matrix down to genes with few missing
	# values, then compare the dimensions before and after filtering.

	# load data in "wide" format (one column per gene, plus `expt` and `time`)
	spellman <- read.csv("spellman-reformated.csv")

	# restructure in "long" format: one row per (expt, time, gene) measurement
	spellman.long <- gather(spellman, gene, expression, -expt, -time)

	# group by gene and calculate the number of missing (NA) values per gene
	spellman.na <-
	  spellman.long %>%
	  group_by(gene) %>%
	  summarize(na.count = sum(is.na(expression)))

	# A gene is kept only when it has strictly fewer than `na.cutoff` missing
	# values (i.e. at most na.cutoff - 1).
	# NOTE(review): this comment previously read "no more than 5 values are
	# missing", which would be `na.count <= 5`. The strict comparison is kept so
	# the filtered result does not change -- confirm which rule is intended.
	na.cutoff <- 5

	# names of the genes that pass the missing-value filter (%$% exposes the
	# `gene` column of the filtered table)
	good.genes <-
	  spellman.na %>%
	  filter(na.count < na.cutoff) %$% gene

	# select corresponding columns; `expt` and `time` are not in good.genes, so
	# the result holds gene columns only
	spellman.filtered <-
	  spellman %>%
	  select(one_of(good.genes))

	dim(spellman) # dimensions of original data
	dim(spellman.filtered) # dimensions of filtered data