Skip to content

Instantly share code, notes, and snippets.

@genomewalker
Created January 20, 2020 06:58
Show Gist options
  • Save genomewalker/ba45984aac64fb8e2278606d8231f37a to your computer and use it in GitHub Desktop.
Save genomewalker/ba45984aac64fb8e2278606d8231f37a to your computer and use it in GitHub Desktop.
library(tidyverse)
# Functions for pretty histograms
nclass.all <- function(x, fun = median)
{
fun(c(
nclass.Sturges(x),
nclass.scott(x),
nclass.FD(x)
))
}
calc_bin_width <- function(x, ...)
{
rangex <- range(x, na.rm = TRUE)
(rangex[2] - rangex[1]) / nclass.all(x, ...)
}
library(ggplot2)
StatPercentileX <- ggproto("StatPercentileX", Stat,
compute_group = function(data, scales, probs) {
percentiles <- quantile(data$x, probs=probs)
data.frame(xintercept=percentiles)
},
required_aes = c("x")
)
stat_percentile_x <- function(mapping = NULL, data = NULL, geom = "vline",
position = "identity", na.rm = FALSE,
show.legend = NA, inherit.aes = TRUE, ...) {
layer(
stat = StatPercentileX, data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, ...)
)
}
StatPercentileXLabels <- ggproto("StatPercentileXLabels", Stat,
compute_group = function(data, scales, probs) {
percentiles <- quantile(data$x, probs=probs)
data.frame(x=percentiles, y=Inf,
label=paste0("p", probs*100, ": ",
scales::comma(round(10^percentiles, digits=1))))
},
required_aes = c("x")
)
stat_percentile_xlab <- function(mapping = NULL, data = NULL, geom = "text",
position = "identity", na.rm = FALSE,
show.legend = NA, inherit.aes = TRUE, ...) {
layer(
stat = StatPercentileXLabels, data = data, mapping = mapping, geom = geom,
position = position, show.legend = show.legend, inherit.aes = inherit.aes,
params = list(na.rm = na.rm, ...)
)
}
# Get assemblies
HMP1_I_assm <- read_csv("~/Downloads/HMASM.csv") %>%
select(1:2) %>%
setNames(c("label", "body_site"))
# Read data
data <- read_tsv("~/Downloads/marine_hmp_smpl_norfs.tsv.gz", col_names = FALSE) %>%
setNames(c("label", "norfs")) %>%
mutate(study = case_when(grepl("^TARA", label) ~ "TARA",
grepl("^MP", label) ~ "MALASPINA",
grepl("^OSD", label) ~ "OSD",
grepl("^SRS", label) ~ "HMP",
TRUE ~ "GOS"))
# Read HMP QC samples
HMP_qc <- read_tsv("~/Downloads/HMP_qc_passed.txt", col_names = FALSE) %>%
setNames("label")
# Get all cdata
# HMP1-I date = 2011
# HMP1-II date = 2014
HMP_cdata <- read_csv("~/Downloads/HMP_phase2017_cdata.txt", col_names = FALSE) %>%
mutate(phase = case_when(grepl("2011", X8) ~ "HMP1-I",
TRUE ~ "HMP1-II")) %>%
select(X1, phase) %>% rename(label = X1)
# Get bad HMP1-I samples
# 745 samples
HMP1_I <- HMP_cdata %>%
inner_join(HMP1_I_assm)
# 690 HQ
HMP_bad <- HMP1_I %>%
filter(phase == "HMP1-I", !(label %in% HMP_qc$label))
#
data_orig <- data %>%
filter(!(label %in% HMP_bad$label))
data %>%
group_by(study) %>%
count()
data_orig %>%
group_by(study) %>%
count()
nsamples <- data_orig %>%
group_by(study) %>%
count()
data_orig_summary <- summary(data_orig$norfs)
ggthemr::ggthemr(palette = "fresh", layout = "scientific")
data_orig %>%
ggplot(aes(norfs)) +
geom_histogram(binwidth = calc_bin_width(log10(data_orig$norfs)), color = "black", alpha = 0.7) +
stat_percentile_x(probs=c(0.25, 0.5, 0.75), linetype=2, color = "#B7144B") +
stat_percentile_xlab(probs=c(0.25, 0.5, 0.75), hjust=1, vjust=1.5, angle=90) +
scale_x_log10(labels = scales::comma) +
xlab("Number of ORFs") +
ylab("Counts")
p25 <- quantile(data_orig$norfs, probs=0.25)
orfXsample <- data_orig %>%
filter(norfs >= p25) %>%
group_by(study) %>%
count(sort = TRUE)
# Get final set of samples
data_final <- data_orig %>%
filter(norfs >= p25)
data_final %>%
ggplot(aes(norfs)) +
geom_histogram(binwidth = calc_bin_width(log10(data_orig$norfs)), color = "black", alpha = 0.7) +
stat_percentile_x(probs=c(0.25, 0.5, 0.75), linetype=2, color = "#B7144B") +
stat_percentile_xlab(probs=c(0.25, 0.5, 0.75), hjust=1, vjust=1.5, angle=90) +
scale_x_log10(labels = scales::comma) +
xlab("Number of ORFs") +
ylab("Counts")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment