Last active
December 10, 2018 09:18
-
-
Save genomewalker/1cc4bf89b35fb1b51bd062ca6394971d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
args = commandArgs(trailingOnly=TRUE) | |
# test if there is at least one argument: if not, return an error | |
if (length(args) == 0) { | |
stop("At least one argument must be supplied (input file).n", call = FALSE) | |
} else if (length(args) == 1) { | |
# default output file | |
args[2] = "alluvial.tsv" | |
} | |
needed = c("tidyverse", "magrittr", "parallel") | |
is.installed <- function(pkg){ | |
is.element(pkg, installed.packages()[,1]) | |
} | |
if (!is.installed("crayon")){suppressMessages(install.packages("crayon"))} | |
suppressMessages(library(crayon)) | |
missing_package <- FALSE | |
cat("\nChecking if all packages are installed...\n\n") | |
# For loop to run through each of the packages | |
for (p in 1:length(needed)){ | |
if(is.installed(needed[p])){ | |
cat(sprintf("%-10s: %s", needed[p], green("Installed\n"))) | |
}else{ | |
cat(sprintf("%-10s: %s", needed[p], red("Not installed\n"))) | |
missing_package <- TRUE | |
} | |
} | |
quit_not_installed <- function(){ | |
cat("\nMissing packages, please install them.\n") | |
quit(save = "no", status = 1) | |
} | |
if (missing_package) { | |
quit_not_installed() | |
}else{ | |
cat("\nAll packages installed.\n")} | |
suppressMessages(library(tidyverse)) | |
suppressMessages(library(magrittr)) | |
suppressMessages(library(parallel)) | |
ncores <- parallel::detectCores()/2 | |
cat(paste0("\nReading file ", args[1], "...")) | |
suppressMessages(cl_tax_orfs <- read_tsv(args[1], col_names = TRUE) %>% | |
mutate(cl_name = as.character(cl_name))) | |
cat(green(" done\n\n")) | |
cat(paste("File", args[1], "has", nrow(cl_tax_orfs), "rows and", ncol(cl_tax_orfs), "columns\n\n")) | |
majority_vote <- function (x, seed = 12345) { | |
set.seed(seed) | |
whichMax <- function(x) { | |
m <- seq_along(x)[x == max(x, na.rm = TRUE)] | |
if (length(m) > 1) | |
sample(m, size = 1) | |
else m | |
} | |
x <- as.vector(x) | |
tab <- table(x) | |
m <- whichMax(tab) | |
out <- list(table = tab, ind = m, majority = names(tab)[m]) | |
return(out) | |
} | |
# Analyse annotations ----------------------------------------------------- | |
# cl_tax_orfs %>% | |
# group_by(cl_name, category) %>% | |
# count() %>% | |
# arrange(desc(n)) %>% | |
# group_by(category) %>% | |
# skimr::skim() | |
propagate_annotation <-function(X, data = data){ | |
cls <- data %>% | |
dplyr::filter(cl_name == X) | |
consensus_superkingdom <- cls %>% | |
dplyr::filter(!is.na(superkingdom)) %>% | |
summarise(consensus_superkingdom = ifelse(n() < 1, NA, majority_vote(superkingdom)$majority)) %>% .$consensus_superkingdom | |
consensus_phylum <- cls %>% | |
dplyr::filter(superkingdom == consensus_superkingdom, | |
!is.na(phylum)) %>% | |
summarise(consensus_phylum = ifelse(n() < 1, paste(consensus_superkingdom, "NA", sep = "_"), majority_vote(phylum)$majority)) %>% .$consensus_phylum | |
consensus_class <- cls %>% | |
dplyr::filter(superkingdom == consensus_superkingdom, | |
phylum == consensus_phylum, | |
!is.na(class)) %>% | |
summarise(consensus_class = ifelse(n() < 1, paste(consensus_phylum, "NA", sep = "_"), majority_vote(class)$majority)) %>% .$consensus_class | |
consensus_order <- cls %>% | |
dplyr::filter(superkingdom == consensus_superkingdom, | |
phylum == consensus_phylum, | |
class == consensus_class, | |
!is.na(order)) %>% | |
summarise(consensus_order = ifelse(n() < 1, paste(consensus_class, "NA", sep = "_"), majority_vote(order)$majority)) %>% .$consensus_order | |
consensus_family <- cls %>% | |
dplyr::filter(superkingdom == consensus_superkingdom, | |
phylum == consensus_phylum, | |
class == consensus_class, | |
order == consensus_order, | |
!is.na(family)) %>% | |
summarise(consensus_family = ifelse(n() < 1, paste(consensus_order, "NA", sep = "_"), majority_vote(family)$majority)) %>% .$consensus_family | |
consensus_genus <- cls %>% | |
dplyr::filter(superkingdom == consensus_superkingdom, | |
phylum == consensus_phylum, | |
class == consensus_class, | |
order == consensus_order, | |
family == consensus_family, | |
!is.na(genus)) %>% | |
summarise(consensus_genus = ifelse(n() < 1, paste(consensus_family, "NA", sep = "_"), majority_vote(genus)$majority)) %>% .$consensus_genus | |
tibble(cl_name = X, consensus_superkingdom, consensus_phylum, consensus_class, | |
consensus_order, consensus_family, consensus_genus) | |
} | |
cat(paste("Propagating taxonomic annotations at cluster level using", cyan(ncores), "cores... ")) | |
cl_tax_consensus <- mclapply(cl_tax_orfs$cl_name %>% unique(), | |
propagate_annotation, data = cl_tax_orfs, mc.cores = ncores) %>% | |
bind_rows() | |
tax_ranks <- c("consensus_superkingdom", "consensus_phylum", "consensus_class", "consensus_order", "consensus_family", "consensus_genus") | |
cat(green("done\n")) | |
# Quick look to the consensus annotations --------------------------------- | |
#map(tax_ranks, function(X){cl_tax_consensus %>% group_by_(X) %>% count(sort = TRUE) %>% ungroup()}) | |
#cl_tax_consensus %>% filter(is.na(consensus_phylum)) | |
# Write results ----------------------------------------------------------- | |
# cl_tax_orfs %>% | |
# select(supercluster, cl_name) %>% | |
# inner_join(cl_tax_consensus %>% select(cl_name, consensus_class, consensus_phylum, consensus_superkingdom)) %>% | |
# write_tsv("~/Downloads/pr2alluvial.tsv") | |
# Annotate at the ORF level ----------------------------------------------- | |
# Uses the data generated above | |
propagate_annotation_na <-function(X, data = data){ | |
cls <- data[X,] %>% | |
select(genus, family, order, class, phylum, superkingdom, orf, cl_name, supercluster) | |
consensus_superkingdom <- cls %>% | |
summarise(consensus_superkingdom = ifelse(is.na(superkingdom), NA, superkingdom))%>% .$consensus_superkingdom | |
consensus_phylum <- cls %>% | |
summarise(consensus_phylum = ifelse(is.na(phylum), paste(consensus_superkingdom, "NA", sep = "_"), phylum)) %>% .$consensus_phylum | |
consensus_class <- cls %>% | |
summarise(consensus_class = ifelse(is.na(class), paste(consensus_phylum, "NA", sep = "_"), class)) %>% .$consensus_class | |
consensus_order <- cls %>% | |
summarise(consensus_order = ifelse(is.na(order), paste(consensus_class, "NA", sep = "_"), order)) %>% .$consensus_order | |
consensus_family <- cls %>% | |
summarise(consensus_family = ifelse(is.na(family), paste(consensus_order, "NA", sep = "_"), family)) %>% .$consensus_family | |
consensus_genus <- cls %>% | |
dplyr::filter(superkingdom == consensus_superkingdom, | |
phylum == consensus_phylum, | |
class == consensus_class, | |
order == consensus_order, | |
family == consensus_family, | |
!is.na(genus)) %>% | |
summarise(consensus_genus = ifelse(n() < 1, paste(consensus_family, "NA", sep = "_"), majority_vote(genus)$majority)) %>% .$consensus_genus | |
tibble(supercluster = cls$supercluster, orf = cls$orf, cl_name = cls$cl_name, consensus_superkingdom, consensus_phylum, consensus_class, | |
consensus_order, consensus_family, consensus_genus) | |
} | |
cat("Collecting consensus annotations... ") | |
pr_clusters_consensus <- cl_tax_orfs %>% | |
select(supercluster, cl_name) %>% | |
inner_join(cl_tax_consensus %>% select(cl_name, consensus_superkingdom, consensus_phylum, consensus_class, consensus_order, consensus_family, consensus_genus), by = "cl_name") | |
cat(green("done"),"\nCollecting ORFs with taxonomic annotations... ") | |
pr_clusters_no_na <- cl_tax_orfs %>% | |
filter(!(is.na(superkingdom) | is.na(phylum))) | |
cat(green("done"), "\nCollecting ORFs without taxonomic annotations... ") | |
cl_tax_consensus_na <- cl_tax_orfs %>% | |
filter(is.na(superkingdom) | is.na(phylum)) %>% | |
select(supercluster, cl_name, orf) %>% | |
unique() %>% | |
inner_join(pr_clusters_consensus %>% select(-supercluster), by = "cl_name") %>% unique() | |
cat(green("done"),"\nPropagating taxonomic annotations at the ORF level using", cyan(ncores), "cores... ") | |
cl_tax_consensus_no_na <- mclapply(1:nrow(pr_clusters_no_na), | |
propagate_annotation_na, data = pr_clusters_no_na, mc.cores = ncores) %>% | |
bind_rows() | |
cat(paste0(green("done"), "\nExporting data for alluvial plot drawing to file ", silver(args[2]), "... ")) | |
bind_rows(cl_tax_consensus_no_na, | |
cl_tax_consensus_na) %>% | |
write_tsv(args[2]) | |
cat(green("done\n")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment