genomewalker · January 20, 2020 06:58
diff --git a/hmp_sample_selection b/hmp_sample_selection
 library(tidyverse)

 # Functions for pretty histograms
 nclass.all <- function(x, fun = median)
 {
  fun(c(
    nclass.Sturges(x),
    nclass.scott(x),
    nclass.FD(x)
  ))
 }

 calc_bin_width <- function(x, ...)
 {
  rangex <- range(x, na.rm = TRUE)
  (rangex[2] - rangex[1]) / nclass.all(x, ...)
 }

 library(ggplot2)

 StatPercentileX <- ggproto("StatPercentileX", Stat,
                           compute_group = function(data, scales, probs) {
                             percentiles <- quantile(data$x, probs=probs)
                             data.frame(xintercept=percentiles)
                           },
                           required_aes = c("x")
 )

 stat_percentile_x <- function(mapping = NULL, data = NULL, geom = "vline",
                              position = "identity", na.rm = FALSE,
                              show.legend = NA, inherit.aes = TRUE, ...) {
  layer(
    stat = StatPercentileX, data = data, mapping = mapping, geom = geom, 
    position = position, show.legend = show.legend, inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, ...)
  )
 }

 StatPercentileXLabels <- ggproto("StatPercentileXLabels", Stat,
                                 compute_group = function(data, scales, probs) {
                                   percentiles <- quantile(data$x, probs=probs)
                                   data.frame(x=percentiles, y=Inf,
                                              label=paste0("p", probs*100, ": ",
                                                           scales::comma(round(10^percentiles, digits=1))))
                                 },
                                 required_aes = c("x")
 )

 stat_percentile_xlab <- function(mapping = NULL, data = NULL, geom = "text",
                                 position = "identity", na.rm = FALSE,
                                 show.legend = NA, inherit.aes = TRUE, ...) {
  layer(
    stat = StatPercentileXLabels, data = data, mapping = mapping, geom = geom, 
    position = position, show.legend = show.legend, inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, ...)
  )
 }


 # Get assemblies
 HMP1_I_assm <- read_csv("~/Downloads/HMASM.csv") %>%
  select(1:2) %>%
  setNames(c("label", "body_site"))

 # Read data
 data <- read_tsv("~/Downloads/marine_hmp_smpl_norfs.tsv.gz", col_names = FALSE) %>%
  setNames(c("label", "norfs")) %>%
  mutate(study = case_when(grepl("^TARA", label) ~ "TARA",
                           grepl("^MP", label) ~ "MALASPINA",
                           grepl("^OSD", label) ~ "OSD",
                           grepl("^SRS", label) ~ "HMP",
                           TRUE ~ "GOS"))

 # Read HMP QC samples
 HMP_qc <- read_tsv("~/Downloads/HMP_qc_passed.txt", col_names = FALSE) %>%
  setNames("label")
 # Get all cdata
 # HMP1-I date = 2011
 # HMP1-II date = 2014
 HMP_cdata <- read_csv("~/Downloads/HMP_phase2017_cdata.txt", col_names = FALSE) %>%
  mutate(phase = case_when(grepl("2011", X8) ~ "HMP1-I",
                           TRUE ~ "HMP1-II")) %>%
  select(X1, phase) %>% rename(label = X1)

 # Get bad HMP1-I samples
 # 745 samples
 HMP1_I <- HMP_cdata %>%
  inner_join(HMP1_I_assm)

 # 690 HQ
 HMP_bad <- HMP1_I %>%
  filter(phase == "HMP1-I", !(label %in% HMP_qc$label))

 # 
 data_orig <- data %>%
  filter(!(label %in% HMP_bad$label))

 data %>%
  group_by(study) %>%
  count()

 data_orig %>%
  group_by(study) %>%
  count()

 nsamples <- data_orig %>%
  group_by(study) %>%
  count()

 data_orig_summary <- summary(data_orig$norfs)

 ggthemr::ggthemr(palette = "fresh", layout = "scientific")
 data_orig %>%
  ggplot(aes(norfs)) +
  geom_histogram(binwidth = calc_bin_width(log10(data_orig$norfs)), color = "black", alpha = 0.7) +
  stat_percentile_x(probs=c(0.25, 0.5, 0.75), linetype=2, color = "#B7144B") +
  stat_percentile_xlab(probs=c(0.25, 0.5, 0.75), hjust=1, vjust=1.5, angle=90) +
  scale_x_log10(labels = scales::comma) +
  xlab("Number of ORFs") +
  ylab("Counts") 

 p25 <- quantile(data_orig$norfs, probs=0.25)

 orfXsample <- data_orig %>%
  filter(norfs >= p25) %>%
  group_by(study) %>%
  count(sort = TRUE)

 # Get final set of samples
 data_final <- data_orig %>%
  filter(norfs >= p25)

 data_final %>%
  ggplot(aes(norfs)) +
  geom_histogram(binwidth = calc_bin_width(log10(data_orig$norfs)), color = "black", alpha = 0.7) +
  stat_percentile_x(probs=c(0.25, 0.5, 0.75), linetype=2, color = "#B7144B") +
  stat_percentile_xlab(probs=c(0.25, 0.5, 0.75), hjust=1, vjust=1.5, angle=90) +
  scale_x_log10(labels = scales::comma) +
  xlab("Number of ORFs") +
  ylab("Counts")
	library(tidyverse)

	# Functions for pretty histograms
	nclass.all <- function(x, fun = median)
	{
	fun(c(
	nclass.Sturges(x),
	nclass.scott(x),
	nclass.FD(x)
	))
	}

	calc_bin_width <- function(x, ...)
	{
	rangex <- range(x, na.rm = TRUE)
	(rangex[2] - rangex[1]) / nclass.all(x, ...)
	}

	library(ggplot2)

	StatPercentileX <- ggproto("StatPercentileX", Stat,
	compute_group = function(data, scales, probs) {
	percentiles <- quantile(data$x, probs=probs)
	data.frame(xintercept=percentiles)
	},
	required_aes = c("x")
	)

	stat_percentile_x <- function(mapping = NULL, data = NULL, geom = "vline",
	position = "identity", na.rm = FALSE,
	show.legend = NA, inherit.aes = TRUE, ...) {
	layer(
	stat = StatPercentileX, data = data, mapping = mapping, geom = geom,
	position = position, show.legend = show.legend, inherit.aes = inherit.aes,
	params = list(na.rm = na.rm, ...)
	)
	}

	StatPercentileXLabels <- ggproto("StatPercentileXLabels", Stat,
	compute_group = function(data, scales, probs) {
	percentiles <- quantile(data$x, probs=probs)
	data.frame(x=percentiles, y=Inf,
	label=paste0("p", probs*100, ": ",
	scales::comma(round(10^percentiles, digits=1))))
	},
	required_aes = c("x")
	)

	stat_percentile_xlab <- function(mapping = NULL, data = NULL, geom = "text",
	position = "identity", na.rm = FALSE,
	show.legend = NA, inherit.aes = TRUE, ...) {
	layer(
	stat = StatPercentileXLabels, data = data, mapping = mapping, geom = geom,
	position = position, show.legend = show.legend, inherit.aes = inherit.aes,
	params = list(na.rm = na.rm, ...)
	)
	}


	# Get assemblies
	HMP1_I_assm <- read_csv("~/Downloads/HMASM.csv") %>%
	select(1:2) %>%
	setNames(c("label", "body_site"))

	# Read data
	data <- read_tsv("~/Downloads/marine_hmp_smpl_norfs.tsv.gz", col_names = FALSE) %>%
	setNames(c("label", "norfs")) %>%
	mutate(study = case_when(grepl("^TARA", label) ~ "TARA",
	grepl("^MP", label) ~ "MALASPINA",
	grepl("^OSD", label) ~ "OSD",
	grepl("^SRS", label) ~ "HMP",
	TRUE ~ "GOS"))

	# Read HMP QC samples
	HMP_qc <- read_tsv("~/Downloads/HMP_qc_passed.txt", col_names = FALSE) %>%
	setNames("label")
	# Get all cdata
	# HMP1-I date = 2011
	# HMP1-II date = 2014
	HMP_cdata <- read_csv("~/Downloads/HMP_phase2017_cdata.txt", col_names = FALSE) %>%
	mutate(phase = case_when(grepl("2011", X8) ~ "HMP1-I",
	TRUE ~ "HMP1-II")) %>%
	select(X1, phase) %>% rename(label = X1)

	# Get bad HMP1-I samples
	# 745 samples
	HMP1_I <- HMP_cdata %>%
	inner_join(HMP1_I_assm)

	# 690 HQ
	HMP_bad <- HMP1_I %>%
	filter(phase == "HMP1-I", !(label %in% HMP_qc$label))

	#
	data_orig <- data %>%
	filter(!(label %in% HMP_bad$label))

	data %>%
	group_by(study) %>%
	count()

	data_orig %>%
	group_by(study) %>%
	count()

	nsamples <- data_orig %>%
	group_by(study) %>%
	count()

	data_orig_summary <- summary(data_orig$norfs)

	ggthemr::ggthemr(palette = "fresh", layout = "scientific")
	data_orig %>%
	ggplot(aes(norfs)) +
	geom_histogram(binwidth = calc_bin_width(log10(data_orig$norfs)), color = "black", alpha = 0.7) +
	stat_percentile_x(probs=c(0.25, 0.5, 0.75), linetype=2, color = "#B7144B") +
	stat_percentile_xlab(probs=c(0.25, 0.5, 0.75), hjust=1, vjust=1.5, angle=90) +
	scale_x_log10(labels = scales::comma) +
	xlab("Number of ORFs") +
	ylab("Counts")

	p25 <- quantile(data_orig$norfs, probs=0.25)

	orfXsample <- data_orig %>%
	filter(norfs >= p25) %>%
	group_by(study) %>%
	count(sort = TRUE)

	# Get final set of samples
	data_final <- data_orig %>%
	filter(norfs >= p25)

	data_final %>%
	ggplot(aes(norfs)) +
	geom_histogram(binwidth = calc_bin_width(log10(data_orig$norfs)), color = "black", alpha = 0.7) +
	stat_percentile_x(probs=c(0.25, 0.5, 0.75), linetype=2, color = "#B7144B") +
	stat_percentile_xlab(probs=c(0.25, 0.5, 0.75), hjust=1, vjust=1.5, angle=90) +
	scale_x_log10(labels = scales::comma) +
	xlab("Number of ORFs") +
	ylab("Counts")