Skip to content

Instantly share code, notes, and snippets.

@genomewalker
Last active March 29, 2019 06:51
Show Gist options
  • Save genomewalker/1666d6565e6c2c848fbca6230ef37f00 to your computer and use it in GitHub Desktop.
Save genomewalker/1666d6565e6c2c848fbca6230ef37f00 to your computer and use it in GitHub Desktop.
#!/usr/bin/env Rscript
# Check if basic packages are installed -----------------------------------
is.installed <- function(pkg){
is.element(pkg, installed.packages()[,1])
}
if (!is.installed("crayon") || !is.installed("optparse")){
cat("We will try to install the packages crayon and optparse... (this will be only be done once)\n")
Sys.sleep(5)
if (!is.installed("crayon")){
suppressMessages(install.packages("crayon", repos = "http://cran.us.r-project.org"))
}
suppressMessages(library(crayon))
if (!is.installed("optparse")){
suppressMessages(install.packages("optparse", repos = "http://cran.us.r-project.org"))
}
}
suppressMessages(library(crayon))
suppressMessages(library(optparse))
# Check if packaged are installed -----------------------------------------
cat("\nChecking if all packages are installed...\n\n")
needed = c("ape", "tidyverse", "pbmcapply", "phangorn", "maditr", "data.table")
missing_package <- FALSE
# For loop to run through each of the packages
for (p in 1:length(needed)){
if(is.installed(needed[p])){
cat(sprintf("%-10s: %s", needed[p], green("Installed\n")))
}else{
cat(sprintf("%-10s: %s", needed[p], red("Not installed\n")))
missing_package <- TRUE
}
}
quit_not_installed <- function(){
cat("\nMissing packages, please install them.\n")
quit(save = "no", status = 1)
}
if (missing_package) {
quit_not_installed()
}else{
cat("\nAll packages installed.\n")
}
Sys.sleep(2)
system("clear")
# Script command line options ---------------------------------------------
option_list = list(
make_option(c("-t", "--tree"), type="character", default=NULL,
help="Tree file name", metavar="character"),
make_option(c("-d", "--data"), type="character", default=NULL,
help="Contextual data filename", metavar="character"),
make_option(c("-p", "--threads"), type="integer", default=1,
help="Number of threads [default= %default]", metavar="integer"),
make_option(c("-o", "--out"), type="character", default="annotation.tsv",
help="Iutput file name [default= %default]", metavar="character")
)
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$tree) || is.null(opt$data)){
print_help(opt_parser)
stop("At least two arguments must be supplied (tree file and contextual data files).n", call.=FALSE)
}
suppressMessages(library(ape))
suppressMessages(library(tidyverse))
suppressMessages(library(pbmcapply))
suppressMessages(library(maditr))
suppressMessages(library(data.table))
# Do stuff ----------------------------------------------------------------
ncores <- opt$threads
cat(paste0("\nReading tree ", opt$tree, "..."))
pr_tree <- read.tree(file = opt$tree)
cat(green(" done\n"))
cat(paste(" Tree in file", opt$tree, "has", yellow(scales::comma(Ntip(pr_tree))), "tips and", yellow(scales::comma(Nedge(pr_tree))), "edges\n\n"))
cat(paste0("\nReading contextual data ", opt$data, "..."))
suppressMessages(pr_cdata <- read_tsv(opt$data, col_names = FALSE) %>%
rename(tip.label = X1, supercluster = X2))
cat(paste("done\n File", opt$data, "has", yellow(scales::comma(nrow(pr_cdata))), "rows and", yellow(scales::comma(ncol(pr_cdata))), "columns\n"))
cat("\nExtracting inserted sequences from tree...")
pr_inserted <- tibble(tip.label = pr_tree$tip.label) %>%
filter(grepl("seq", tip.label))
cat(paste0("done\n ", yellow(scales::comma(nrow(pr_inserted)))," sequencs extracted\n\n"))
pairs <- (Ntip(pr_tree) * (Ntip(pr_tree) - 1))/2
cat(paste0("Calculating cophenetic distance (", yellow(scales::comma(pairs))," pairs)... "))
coph <- cophenetic.phylo(pr_tree) %>%
as.dist() %>%
broom::tidy()
cat(green("done\n"))
majority_vote <- function (x, seed = 12345) {
set.seed(seed)
whichMax <- function(x) {
m <- seq_along(x)[x == max(x, na.rm = TRUE)]
if (length(m) > 1)
sample(m, size = 1)
else m
}
x <- as.vector(x)
tab <- table(x)
m <- whichMax(tab)
out <- list(table = tab, ind = m, majority = names(tab)[m])
return(out)
}
assign_orfs <- function(X, phy = phy, data = tips, coph = coph, n_mrca = n_mrca){
top <- coph %>%
dt_filter(item1 == X | item2 == X) %>%
dt_mutate(item1 = as.character(item1), item2 = as.character(item2)) %>%
as_tibble() %>%
rowwise() %>%
mutate(item2 = ifelse(item1 != X, item1, item2), item1 = X) %>%
inner_join(data, by = c( "item2" = "tip.label")) %>%
arrange((distance)) %>%
head(n_mrca)
# top <- coph %>%
# dt_filter(item1 == X | item2 == X)
# f_q <- quantile(top$distance)[[2]]
# top <- top %>%
# dt_filter(distance <= f_q) %>%
# dt_mutate(item1 = as.character(item1), item2 = as.character(item2))
# top <- top[, c("item1", "item2") := list(X, ifelse(item1 != X, item1, item2)), by = .I] %>%
# dt_inner_join(data, by = c( "item2" = "tip.label")) %>%
# arrange((distance)) %>%
# as_tibble() %>%
# head(n_mrca)
bestmrca <- getMRCA(phy, unique(c(top$item1, top$item2)))
mrcatips <- phy$tip.label[unlist(phangorn::Descendants(phy, bestmrca, type = "tips"))]
if(length(data %>% filter(tip.label %in% mrcatips) %>% .$supercluster %>% unique()) != 1){
annot <- majority_vote(data %>% filter(tip.label %in% mrcatips) %>% .$supercluster)$majority
majority <- TRUE
}else{
annot <- data %>% filter(tip.label %in% mrcatips) %>% .$supercluster %>% unique()
majority <- FALSE
}
tibble(tip.label = X, supercluster = annot, majority = majority)
}
cat(paste0("Annotating ORFs with ", yellow(ncores) ," cores...\n"))
orf_annotation <- pbmcapply::pbmclapply(pr_inserted$tip.label,
assign_orfs,
phy = pr_tree,
data = pr_cdata,
coph = coph %>% as.data.table(),
n_mrca = 1,
mc.cores = ncores,
ignore.interactive = T,
max.vector.size = 1e7) %>%
bind_rows()
cat(paste0("Exporting annotations to file ", silver(opt$out), "... "))
write_tsv(orf_annotation, path = opt$out, col_names = FALSE)
cat(green("done\n\n"))
#!/usr/bin/env Rscript
# Check if basic packages are installed -----------------------------------
is.installed <- function(pkg){
is.element(pkg, installed.packages()[,1])
}
if (!is.installed("crayon") || !is.installed("optparse")){
cat("We will try to install the packages crayon and optparse... (this will be only be done once)\n")
Sys.sleep(5)
if (!is.installed("crayon")){
suppressMessages(install.packages("crayon", repos = "http://cran.us.r-project.org"))
}
suppressMessages(library(crayon))
if (!is.installed("optparse")){
suppressMessages(install.packages("optparse", repos = "http://cran.us.r-project.org"))
}
}
suppressMessages(library(crayon))
suppressMessages(library(optparse))
# Check if packaged are installed -----------------------------------------
cat("\nChecking if all packages are installed...\n\n")
needed = c("magrittr", "tidyverse", "pbmcapply", "maditr", "data.table")
missing_package <- FALSE
# For loop to run through each of the packages
for (p in 1:length(needed)){
if(is.installed(needed[p])){
cat(sprintf("%-10s: %s", needed[p], green("Installed\n")))
}else{
cat(sprintf("%-10s: %s", needed[p], red("Not installed\n")))
missing_package <- TRUE
}
}
quit_not_installed <- function(){
cat("\nMissing packages, please install them.\n")
quit(save = "no", status = 1)
}
if (missing_package) {
quit_not_installed()
}else{
cat("\nAll packages installed.\n")
}
Sys.sleep(2)
system("clear")
# Script command line options ---------------------------------------------
option_list = list(
make_option(c("-d", "--data"), type="character", default=NULL,
help="ORF data filename", metavar="character"),
make_option(c("-p", "--threads"), type="integer", default=1,
help="Number of threads [default= %default]", metavar="integer"),
make_option(c("-c", "--components"), type="character", default=NULL,
help="Iutput file name [default= %default]", metavar="character"),
make_option(c("-o", "--out"), type="character", default="alluvial.tsv",
help="Output file name [default= %default]", metavar="character")
)
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$data)){
print_help(opt_parser)
stop("At least one arguments must be supplied (tree file and contextual data files).n", call.=FALSE)
}
suppressMessages(library(magrittr))
suppressMessages(library(tidyverse))
suppressMessages(library(pbmcapply))
suppressMessages(library(maditr))
suppressMessages(library(data.table))
ncores <- opt$threads
cat(paste0("\nReading file ", opt$data, "..."))
suppressMessages(cl_tax_orfs <- read_tsv(opt$data, col_names = TRUE) %>%
mutate(cl_name = as.character(cl_name)))
cat(green(" done\n"))
cat(paste(" File", opt$data, "has", nrow(cl_tax_orfs), "rows and", ncol(cl_tax_orfs), "columns\n\n"))
cat(paste0("\nReading file ", opt$components, "..."))
suppressMessages(cl_components <- read_tsv(opt$components, col_names = TRUE) %>%
mutate(cl_name = as.character(cl_name)))
cat(green(" done\n"))
cat(paste(" File", opt$components, "has", nrow(cl_components), "rows and", ncol(cl_components), "columns\n\n"))
majority_vote <- function (x, seed = 12345) {
set.seed(seed)
whichMax <- function(x) {
m <- seq_along(x)[x == max(x, na.rm = TRUE)]
if (length(m) > 1)
sample(m, size = 1)
else m
}
x <- as.vector(x)
tab <- table(x)
m <- whichMax(tab)
out <- list(table = tab, ind = m, majority = names(tab)[m])
return(out)
}
# Analyse annotations -----------------------------------------------------
# cl_tax_orfs %>%
# group_by(cl_name, category) %>%
# count() %>%
# arrange(desc(n)) %>%
# group_by(category) %>%
# skimr::skim()
propagate_annotation <-function(X, data = data){
cls <- data %>%
dplyr::filter(cl_name == X)
consensus_superkingdom <- cls %>%
dplyr::filter(!is.na(superkingdom)) %>%
summarise(consensus_superkingdom = ifelse(n() < 1, NA, majority_vote(superkingdom)$majority)) %>% .$consensus_superkingdom
consensus_phylum <- cls %>%
dplyr::filter(superkingdom == consensus_superkingdom,
!is.na(phylum)) %>%
summarise(consensus_phylum = ifelse(n() < 1, paste(consensus_superkingdom, "NA", sep = "_"), majority_vote(phylum)$majority)) %>% .$consensus_phylum
consensus_class <- cls %>%
dplyr::filter(superkingdom == consensus_superkingdom,
phylum == consensus_phylum,
!is.na(class)) %>%
summarise(consensus_class = ifelse(n() < 1, paste(consensus_phylum, "NA", sep = "_"), majority_vote(class)$majority)) %>% .$consensus_class
consensus_order <- cls %>%
dplyr::filter(superkingdom == consensus_superkingdom,
phylum == consensus_phylum,
class == consensus_class,
!is.na(order)) %>%
summarise(consensus_order = ifelse(n() < 1, paste(consensus_class, "NA", sep = "_"), majority_vote(order)$majority)) %>% .$consensus_order
consensus_family <- cls %>%
dplyr::filter(superkingdom == consensus_superkingdom,
phylum == consensus_phylum,
class == consensus_class,
order == consensus_order,
!is.na(family)) %>%
summarise(consensus_family = ifelse(n() < 1, paste(consensus_order, "NA", sep = "_"), majority_vote(family)$majority)) %>% .$consensus_family
consensus_genus <- cls %>%
dplyr::filter(superkingdom == consensus_superkingdom,
phylum == consensus_phylum,
class == consensus_class,
order == consensus_order,
family == consensus_family,
!is.na(genus)) %>%
summarise(consensus_genus = ifelse(n() < 1, paste(consensus_family, "NA", sep = "_"), majority_vote(genus)$majority)) %>% .$consensus_genus
tibble(cl_name = X, consensus_superkingdom, consensus_phylum, consensus_class,
consensus_order, consensus_family, consensus_genus)
}
cat(paste("Propagating taxonomic annotations at cluster level using", cyan(ncores), "cores...\n"))
cl_tax_consensus <- pbmcapply::pbmclapply(cl_tax_orfs$cl_name %>% unique(),
propagate_annotation, data = cl_tax_orfs,
mc.cores = ncores,
ignore.interactive = T,
max.vector.size = 1e7) %>%
bind_rows()
tax_ranks <- c("consensus_superkingdom", "consensus_phylum", "consensus_class", "consensus_order", "consensus_family", "consensus_genus")
cat(paste0("Propagation at taxonomic annotations at cluster level... ", green("done\n\n")))
# Quick look to the consensus annotations ---------------------------------
#map(tax_ranks, function(X){cl_tax_consensus %>% group_by_(X) %>% count(sort = TRUE) %>% ungroup()})
#cl_tax_consensus %>% filter(is.na(consensus_phylum))
# Write results -----------------------------------------------------------
# cl_tax_orfs %>%
# select(supercluster, cl_name) %>%
# inner_join(cl_tax_consensus %>% select(cl_name, consensus_class, consensus_phylum, consensus_superkingdom)) %>%
# write_tsv("~/Downloads/pr2alluvial.tsv")
# Annotate at the ORF level -----------------------------------------------
# Uses the data generated above
propagate_annotation_na <-function(X, data = data){
cls <- data[X,] %>%
select(genus, family, order, class, phylum, superkingdom, orf, cl_name, supercluster)
consensus_superkingdom <- cls %>%
summarise(consensus_superkingdom = ifelse(is.na(superkingdom), NA, superkingdom))%>% .$consensus_superkingdom
consensus_phylum <- cls %>%
summarise(consensus_phylum = ifelse(is.na(phylum), paste(consensus_superkingdom, "NA", sep = "_"), phylum)) %>% .$consensus_phylum
consensus_class <- cls %>%
summarise(consensus_class = ifelse(is.na(class), paste(consensus_phylum, "NA", sep = "_"), class)) %>% .$consensus_class
consensus_order <- cls %>%
summarise(consensus_order = ifelse(is.na(order), paste(consensus_class, "NA", sep = "_"), order)) %>% .$consensus_order
consensus_family <- cls %>%
summarise(consensus_family = ifelse(is.na(family), paste(consensus_order, "NA", sep = "_"), family)) %>% .$consensus_family
consensus_genus <- cls %>%
dplyr::filter(superkingdom == consensus_superkingdom,
phylum == consensus_phylum,
class == consensus_class,
order == consensus_order,
family == consensus_family,
!is.na(genus)) %>%
summarise(consensus_genus = ifelse(n() < 1, paste(consensus_family, "NA", sep = "_"), majority_vote(genus)$majority)) %>% .$consensus_genus
tibble(supercluster = cls$supercluster, orf = cls$orf, cl_name = cls$cl_name, consensus_superkingdom, consensus_phylum, consensus_class,
consensus_order, consensus_family, consensus_genus)
}
cat("Collecting consensus annotations... ")
pr_clusters_consensus <- cl_tax_orfs %>%
select(supercluster, cl_name) %>%
inner_join(cl_tax_consensus %>% select(cl_name, consensus_superkingdom, consensus_phylum, consensus_class, consensus_order, consensus_family, consensus_genus), by = "cl_name")
cat(green("done"),"\nCollecting ORFs with taxonomic annotations... ")
pr_clusters_no_na <- cl_tax_orfs %>%
filter(!(is.na(superkingdom) | is.na(phylum)))
cat(green("done"), "\nCollecting ORFs without taxonomic annotations... ")
cl_tax_consensus_na <- cl_tax_orfs %>%
filter(is.na(superkingdom) | is.na(phylum)) %>%
select(supercluster, cl_name, orf) %>%
unique() %>%
as.data.table() %>%
dt_inner_join(pr_clusters_consensus %>% select(-supercluster), by = "cl_name") %>% unique() %>% as_tibble()
cat(green("done"),"\nPropagating taxonomic annotations at the ORF level using", cyan(ncores), "cores... ")
cl_tax_consensus_no_na <- pbmclapply(1:nrow(pr_clusters_no_na),
propagate_annotation_na,
data = pr_clusters_no_na,
mc.cores = ncores,
ignore.interactive = T,
max.vector.size = 1e7) %>%
bind_rows()
cat(paste0("Propagation of taxonomic annotations at the ORF level... ", green("done"), "\n\nExporting data for alluvial plot drawing to file ", silver(opt$out), "... "))
bind_rows(cl_tax_consensus_no_na,
cl_tax_consensus_na) %>%
left_join(cl_components, by = "cl_name") %>%
write_tsv(opt$out)
cat(green("done\n"))
#!/usr/bin/env python
import os
import argparse
from Bio import AlignIO
import sys
version = "0.1.1 (28.11.17)"
name = os.path.basename(sys.argv[0]) #get scriptname from actual script filename that was called
parser=argparse.ArgumentParser(description="Converts alignments in FastaFormat to (strict & interleaved; relaxed & interleaved if '-r' is set) phylip format. Will raise error if alignments contain dots (\".\"), so replace those with dashes (\"-\") beforehand (e.g. using sed)")
parser.add_argument('-i','--input', action = "store", dest = "input", required = True, help = "(aligned) input fasta")
parser.add_argument('-o','--output', action = "store", dest = "output", help = "Output filename (default = <Input-file>.phylip")
parser.add_argument('-r','--relaxed', action = "store_true", dest = "relaxed", default = False, help = "output in \"relaxed\" phylip format. (default = False --> Output is strict phylip")
args=parser.parse_args()
if not args.output:
args.output = args.input + ".phylip" #if no outputname was specified set outputname based on inputname
def main():
infile = open(args.input, "r")
outfile = open(args.output, "w")
alignments = AlignIO.parse(infile, "fasta")
if args.relaxed:
AlignIO.write(alignments, outfile, "phylip-relaxed")
else:
AlignIO.write(alignments, outfile, "phylip")
infile.close()
outfile.close()
sys.stderr.write("\nfinished\n")
main()
#!/bin/bash
set -e
function error_handler() {
printf "${RED}ERROR:${NC} Error occurred in script at line: ${RED}${1}${NC}\n"
printf "${RED}ERROR:${NC} Line exited with status: ${RED}${2}${NC}\n"
exit
}
trap 'error_handler ${LINENO} $?' ERR
declare -r GREEN='\033[1;32m'
declare -r RED='\033[1;31m'
declare -r YELLOW='\033[1;33m'
declare -r CYAN='\033[1;36m'
declare -r PURPLE='\033[1;35m'
# declare -r BLUE='\033[1;34m'
declare -r GREY='\033[1;30m'
declare -r NC='\033[0m'
declare -r SEQKITBIN=seqkit
declare -r TAXACONBIN=/scratch/antonio/PRs/WF/EPA-NG/taxnameconvert-2.4/taxnameconvert.pl
declare -r MODELTBIN=/scratch/antonio/PRs/WF/EPA-NG/modeltest-ng/bin/modeltest-ng
declare -r RAXMLBIN=~/opt/standard-RAxML/raxmlHPC-PTHREADS-AVX2
declare -r F2PBIN=~/opt/scripts/fasta2phylip.py
declare -r PAPABIN=~/opt/papara_nt/papara
declare -r EPABIN=/scratch/antonio/PRs/WF/epa-ng/bin/epa-ng
declare -r ODSEQBIN=~/opt/OD-Seq_mg/OD-seq
declare -r TAXITBIN=taxit
declare -r PPLACERBIN=pplacer
declare -r GAPPABIN=/scratch/antonio/PRs/WF/gappa/bin/gappa
declare -r CDHITBIN=cdhit
declare -r ORFANNOT=~/opt/scripts/annotate_query_sequences.R
declare -r THREADS=64
declare -r GUPPYBIN=guppy
declare -r ALLUVIALBIN=~/opt/scripts/create_alluvial.R
#declare -r COMPFILE=k_components.tsv
# Files used during the analysis
#
# Basic files
model_files=(MT.ckp MT.log MT.out)
raxml_files=(RAxML_binaryModelParameters.OPT RAxML_info.OPT RAxML_log.OPT RAxML_result.OPT)
papara_files=(MICrhoDE_prot_aligned.fixed.phy query.0.fasta query.0.core.fasta query.0.outlier.fasta query.fasta papara_alignment.ALN papara_alignment.ALN0 papara_log.ALN papara_log.ALN0 papara_quality.ALN papara_quality.ALN0)
epang_files=(EPA_PR/epa_info.log EPA_PR/epa_result.jplace)
pplacer_files=(query.PR.jplace)
#gappa_files_e=(epa-ng_labelled_tree epa-ng_per_pquery_assign epa-ng_profile.csv MICrhoDE_prot_aligned.query-SC.epa-ng.tsv)
#gappa_files_p=(pplacer_labelled_tree pplacer_per_pquery_assign pplacer_profile.csv MICrhoDE_prot_aligned.query-SC.pplacer.tsv)
basic_files_download=(k_tax_stat.tsv.gz kwp_tax_stat.tsv.gz gu_tax_stat.tsv.gz k_cl_orfs.tsv.gz kwp_cl_orfs.tsv.gz gu_cl_orfs.tsv.gz)
basic_files=(k_tax_stat.tsv.gz kwp_tax_stat.tsv.gz gu_tax_stat.tsv.gz k_cl_orfs.tsv.gz kwp_cl_orfs.tsv.gz gu_cl_orfs.tsv.gz k_pr_tax.tsv kwp_pr_tax.tsv gu_pr_tax.tsv k_pr_orfs.tsv kwp_pr_orfs.tsv gu_pr_orfs.tsv)
graft_files_e=(grafted_tree.epa-ng.tre)
graft_files_p=(grafted_tree.pplacer.tre)
annot_files_e=(epa-ng_per_pquery_assign.tsv)
annot_files_p=(pplacer_per_pquery_assign.tsv)
dedup_files=(all_sequences_names_to_extract.dedup all_sequences_names_to_extract.dedup.clstr)
itol_files=(MICrhoDE_itol_strip.txt MICrhoDE_SC_colors.txt MICrhoDE_query-SC_colors.txt MICrhoDE_itol_strip_query-SC.epa-ng.txt MICrhoDE_itol_strip_query-SC.pplacer.txt)
alluvial_files=(PR_tax_orf.pplacer.tsv PR_tax_orf.pplacer.tsv PR_tax_orf.epa-ng.tsv PR_tax_orf.epa-ng.tsv alluvial_data.pplacer.tsv alluvial_data.epa-ng.tsv)
function create_alluvial (){
cat <(printf "short_name\torf\tcl_name\ttaxid\tfunc\tsize\tspecies\tgenus\tfamily\torder\tclass\tphylum\tsuperkingdom\tcategory\tsupercluster\n") > PR_tax_orf."${1}".tsv
join -t $'\t' -1 13 -2 1 <(sort -t $'\t' -k13,13 <(cat <(awk '{print $0"\tk"}' k_pr_tax.tsv) <(awk '{print $0"\tkwp"}' kwp_pr_tax.tsv) <(awk '{print $0"\tgu"}' gu_pr_tax.tsv))) <(sort -t $'\t' -k1,1 <(cat "${1}"_per_pquery_assign.tsv | tr ';' $'\t' | cut -f1,2)) >> PR_tax_orf."${1}".tsv
cat <(printf "orf\tcl_name\tshort_name\tcategory\n") <(awk '{print $0"\tk"}' k_pr_orfs.tsv) <(awk '{print $0"\tkwp"}' kwp_pr_orfs.tsv) <(awk '{print $0"\tgu"}' gu_pr_orfs.tsv) > PR_cl_orf."${1}".tsv
"${ALLUVIALBIN}" -d PR_tax_orf."${1}".tsv -p 64 -c "${COMPFILE}" -o alluvial_data."${1}".tsv
}
function get_time() {
TIME=$(date +%T)
printf "[${GREY}${TIME}${NC}]"
}
function gappa_assign () {
"${GAPPABIN}" analyze assign --jplace-path "${1}" --taxon-file MICrhoDE_prot_aligned.SC.tsv
sed ':a;N;/\nseq/!s/\n/\t/;ta;P;D' per_pquery_assign | awk -vFS="\t" '{if(NF > 2){print $1"\t"$6}else{print $1"\tNO_ASSIGN"}}' > MICrhoDE_prot_aligned.query-SC."${2}".tsv
mv labelled_tree "${2}"_labelled_tree
mv profile.csv "${2}"_profile.csv
mv per_pquery_assign "${2}"_per_pquery_assign
}
function graft_tree () {
"${GUPPYBIN}" tog -o grafted_tree."${2}".tre "${1}"
}
function orf_assign (){
"${ORFANNOT}" -t "${1}" -d "${2}" -p "${THREADS}" -o "${3}"_per_pquery_assign.tsv
}
function check_files_missing(){
files=("$@")
ALL=0
for file in "${files[@]}"; do
if [ ! -f "${file}" ]; then
ALL=1
fi
done
echo "${ALL}"
}
function create_basic_data (){
# Generate files for alluvial plots
# Prepare results from HMMER
# Get consensus taxonomy for each orf
files=("$@")
printf "$(get_time) Downloading cluster data from ${YELLOW}metagenomics.eu${NC}..."
for file in "${files[@]}"; do
URL=http://files.metagenomics.eu/unknowns/
wget -qN "${URL}"/"${file}"
done
printf " ${GREEN}done${NC}\n"
printf "$(get_time) Filtering taxonomic information ${RED}[get a coffee]${NC}..."
join -t $'\t' -1 2 -2 1 <(zcat k_tax_stat.tsv.gz | sort --parallel="${THREADS}" -S20% -k2,2) <(sort -k1,1 "${NAM}".name_mapping.tsv) > k_pr_tax.tsv
join -t $'\t' -1 2 -2 1 <(zcat kwp_tax_stat.tsv.gz | sort --parallel="${THREADS}" -S20% -k2,2) <(sort -k1,1 "${NAM}".name_mapping.tsv) > kwp_pr_tax.tsv
join -t $'\t' -1 2 -2 1 <(zcat gu_tax_stat.tsv.gz | sort --parallel="${THREADS}" -S20% -k2,2) <(sort -k1,1 "${NAM}".name_mapping.tsv) > gu_pr_tax.tsv
printf " ${GREEN}done${NC}\n"
printf "$(get_time) Adding cluster information to query sequences ${RED}[get a coffee]${NC}..."
join -t $'\t' -1 2 -2 1 <(zcat k_cl_orfs.tsv.gz | tr ' ' $'\t' | sort --parallel="${THREADS}" -S20% -k2,2) <(sort -k1,1 "${NAM}".name_mapping.tsv) > k_pr_orfs.tsv
join -t $'\t' -1 2 -2 1 <(zcat kwp_cl_orfs.tsv.gz | tr ' ' $'\t' | sort --parallel="${THREADS}" -S20% -k2,2) <(sort -k1,1 "${NAM}".name_mapping.tsv) > kwp_pr_orfs.tsv
join -t $'\t' -1 2 -2 1 <(zcat gu_cl_orfs.tsv.gz | tr ' ' $'\t' | sort --parallel="${THREADS}" -S20% -k2,2) <(sort -k1,1 "${NAM}".name_mapping.tsv) > gu_pr_orfs.tsv
printf " ${GREEN}done${NC}\n"
}
function create_itol_data () {
paste <(cut -f2 MICrhoDE_prot_aligned.SC.0.tsv | sort -u) <(printf "#72d9d3\n#e7abd3\n#bfd89b\n#9fbeed\n#e6bb8f\n#b2d6c8\n#dac4c8") > MICrhoDE_SC_colors.txt
cat <(printf "DATASET_COLORSTRIP\nSEPARATOR TAB\nDATASET_LABEL\tSuperCluster\nCOLOR\t#ff0000\nCOLOR_BRANCHES\t0\nDATA\n\n") <(join -t $'\t' -1 2 -2 1 <(sort -t $'\t' -k2,2 MICrhoDE_prot_aligned.SC.0.tsv) <(sort -k1,1 MICrhoDE_SC_colors.txt) | awk -vFS="\t" '{print $2"\t"$5"\t"$1}') > MICrhoDE_itol_strip.txt
# Create data file for iTOL using the grappa assignments
paste <(cut -f2 MICrhoDE_prot_aligned.SC.0.tsv | sort -u) <(printf "#72d9d3\n#e7abd3\n#bfd89b\n#9fbeed\n#e6bb8f\n#b2d6c8\n#dac4c8") > MICrhoDE_qurey-SC_colors.txt
cat MICrhoDE_SC_colors.txt <(printf "NO_ASSIGN\t#000000") >> MICrhoDE_query-SC_colors.txt
cat MICrhoDE_itol_strip.txt <(join -t $'\t' -1 2 -2 1 <(sort -t $'\t' -k2,2 <(cat pplacer_per_pquery_assign.tsv | tr ';' $'\t' | cut -f1,2)) <(sort -k1,1 MICrhoDE_query-SC_colors.txt) | awk -vFS="\t" '{print $2"\t"$3"\t"$1}') > MICrhoDE_itol_strip_query-SC.pplacer.txt
cat MICrhoDE_itol_strip.txt <(join -t $'\t' -1 2 -2 1 <(sort -t $'\t' -k2,2 <(cat epa-ng_per_pquery_assign.tsv | tr ';' $'\t' | cut -f1,2)) <(sort -k1,1 MICrhoDE_query-SC_colors.txt) | awk -vFS="\t" '{print $2"\t"$3"\t"$1}') > MICrhoDE_itol_strip_query-SC.epa-ng.txt
}
if [ $# -eq 0 ]; then
printf "$(get_time) ${RED}ERROR:${NC} No arguments supplied. Provide a ${CYAN}FASTA${NC} file with sequences to be placed\n"
exit 1
fi
QUERY=${1}
COMPFILE=${2}
if [ ${QUERY: -6} != ".fasta" ]; then
printf "$(get_time) ${RED}ERROR:${NC} Provide a ${CYAN}FASTA${NC} file with sequences to be placed with ${CYAN}.fasta${NC} extension\n"
exit 1
fi
NAM=$(basename "${QUERY}" .fasta)
# Remove everything after STOP (*) codon on MICrhoDE_prot_aligned.fasta
printf "$(get_time) Getting data from ${YELLOW}MICrhoDE${NC}..."
wget -N -q http://application.sb-roscoff.fr/micrhode/MICrhoDE_prot_aligned.fasta
wget -N -q http://application.sb-roscoff.fr/micrhode/MicRhoDE_051214.txt
wget -N -q http://application.sb-roscoff.fr/micrhode/tree_MicRhoDE_complete.nwk
printf " ${GREEN}done${NC}\n"
printf "$(get_time) Fixing ${YELLOW}MICrhoDE${NC} alignments..."
sed -i -e 's/*p/--/' MICrhoDE_prot_aligned.fasta
# Convert lowercase sequences to uppercase
"${SEQKITBIN}" replace -p .+ -r "ref-{nr}" MICrhoDE_prot_aligned.fasta | seqkit seq -u > MICrhoDE_prot_aligned.fixed.fasta
paste <(grep '>' MICrhoDE_prot_aligned.fasta | tr -d '>') <(grep '>' MICrhoDE_prot_aligned.fixed.fasta | tr -d '>') > MICrhoDE_prot_aligned.name_mapping.tsv
printf " ${GREEN}done${NC}\n"
printf "$(get_time) Fixing ${YELLOW}MICrhoDE${NC} tree..."
# Rename tree (http://www.cibiv.at/software/taxnameconvert/)
"${TAXACONBIN}" -f 1 -t 2 MICrhoDE_prot_aligned.name_mapping.tsv tree_MicRhoDE_complete.nwk tree_MicRhoDE_complete.renamed.nwk > /dev/null 2>&1
printf " ${GREEN}done${NC}\n"
printf "$(get_time) Dereplicating query sequences...\n"
PRES=$(check_files_missing "${dedup_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Dereplicating query sequences... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${dedup_files[@]}"
"${CDHITBIN}" -i "${QUERY}" -c 1 -d 0 -o "${NAM}".dedup
printf "$(get_time) Dereplicating query sequences... ${GREEN}done${NC}\n"
fi
printf "$(get_time) Fixing query sequences..."
# Remove sequences with an X from our PRs
# Convert lowercase sequences to uppercase
"${SEQKITBIN}" replace -p .+ -r "seq-{nr}" "${NAM}".dedup | seqkit seq -u > "${NAM}".fixed.fasta
paste <(grep '>' "${NAM}".dedup | tr -d '>') <(grep '>' "${NAM}".fixed.fasta | tr -d '>') > "${NAM}".name_mapping.tsv
"${SEQKITBIN}" grep -v -s -p X -r "${NAM}".fixed.fasta > "${NAM}".noX.fasta
printf " ${GREEN}done${NC}\n"
printf "$(get_time) Filtering sequences smaller than 100aa..."
# Keep sequences longer than 100aa
"${SEQKITBIN}" seq --quiet -m 100 "${NAM}".noX.fasta > "${NAM}".noX.100.fasta 2> /dev/null
printf " ${GREEN}done${NC}\n"
printf "$(get_time) Identifying best substitution model...\n"
PRES=$(check_files_missing "${model_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Identifying best substitution model... ${RED}skipped${NC}: File exists.\n"
BESTM=$(grep Model: MT.out | awk '{print $2}' | sort | uniq -c | sort -k1,1nr | awk NR==1'{print $2}')
BESTMR=$(grep raxmlHPC MT.out | awk '{print $6}' | uniq -c | sort -k1,1nr | awk NR==1'{print $2}')
printf "$(get_time) Best model: ${GREEN}${BESTM}${NC}\n"
else
rm -f "${model_files[@]}"
# Use modeltest to find the best model
"${MODELTBIN}" -i MICrhoDE_prot_aligned.fixed.fasta -o MT -p "${THREADS}" -d aa -T raxml --force -t user -u tree_MicRhoDE_complete.renamed.nwk
BESTM=$(grep Model: MT.out | awk '{print $2}' | sort | uniq -c | sort -k1,1nr | awk NR==1'{print $2}')
BESTMR=$(grep raxmlHPC MT.out | awk '{print $6}' | uniq -c | sort -k1,1nr | awk NR==1'{print $2}')
printf "$(get_time) Identifying best substitution model... ${GREEN}done${NC}\n"
printf "$(get_time) Best model: ${GREEN}${BESTM}${NC}\n"
fi
printf "$(get_time) Optimizing initial tree parameters and branch lengths...\n"
PRES=$(check_files_missing "${raxml_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Optimizing initial tree parameters and branch lengths... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${raxml_files[@]}"
# Optimize tree parameters and branch lengths
[ -f RAxML_log.OPT ] && rm -rf *.OPT
"${RAXMLBIN}" -s MICrhoDE_prot_aligned.fixed.fasta -T "${THREADS}" -f e -n OPT -t tree_MicRhoDE_complete.renamed.nwk -m "${BESTMR}"
printf "$(get_time) Optimizing initial tree parameters and branch lengths... ${GREEN}done${NC}\n"
fi
# Add our PRs seqs to the REF alignment with PaPaRA (https://github.com/sim82/papara_nt)
# First we need to patch the PaPaRA pvec.h. Get patch from: https://gist.github.com/genomewalker/a3f03846cf7e5ad63be63a9c8c153301
# Patch the file
# git clone --recursive https://github.com/sim82/papara_nt
# cd papara_nt || exit "Cannot create folder"
# wget https://gist.githubusercontent.com/genomewalker/a3f03846cf7e5ad63be63a9c8c153301/raw/4cee4913ea576806f504d47f6e5702e228f9307c/pvec.h.patch
# patch < pvec.h.patch
printf "$(get_time) ${PURPLE}[Iter 1]:${NC} Aligning query sequences to reference alignment with PaPaRA...\n"
PRES=$(check_files_missing "${papara_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Aligning query sequences to reference alignment with PaPaRA... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${papara_files[@]}"
# Convert FASTA ref alignment to relaxed phylip
"${F2PBIN}" -i MICrhoDE_prot_aligned.fixed.fasta -o MICrhoDE_prot_aligned.fixed.phy -r > /dev/null 2>&1
# Align query sequences to reference alignment
[ -f papara_log.ALN0 ] && rm -rf *ALN0
"${PAPABIN}" -t RAxML_result.OPT -s MICrhoDE_prot_aligned.fixed.phy -q "${NAM}".noX.100.fasta -a -j "${THREADS}" -n ALN0 -r
# We will use epa-ng to place the query sequences into the reference tree
# First we split the alignment in reference and query
"${EPABIN}" --split MICrhoDE_prot_aligned.fixed.phy papara_alignment.ALN0
printf "$(get_time) ${PURPLE}[Iter 1]:${NC} Aligning query sequences to reference alignment with PaPaRA... ${GREEN} done${NC}\n"
printf "$(get_time) Identifying outliers (OD-Seq) in query alignment..."
[ -f query.fasta ] && mv query.fasta query.0.fasta
"${ODSEQBIN}" -t 24 -i query.0.fasta -c query.0.core.fasta -o query.0.outlier.fasta
printf "$(get_time) Identifying outliers (OD-Seq) in query alignment... ${GREEN}done${NC}\n"
printf "$(get_time) ${PURPLE}[Iter 2]:${NC} Aligning core query sequences to reference alignment with PaPaRA...\n"
# Align query sequences to reference alignment
[ -f papara_alignment.ALN ] && rm -rf *ALN
"${PAPABIN}" -t RAxML_result.OPT -s MICrhoDE_prot_aligned.fixed.phy -q query.0.core.fasta -a -j "${THREADS}" -n ALN -r
# We will use epa-ng to place the query sequences into the reference tree
# First we split the alignment in reference and query
[ -f query.fasta ] && rm -rf query.fasta
"${EPABIN}" --split MICrhoDE_prot_aligned.fixed.phy papara_alignment.ALN
printf "$(get_time) ${PURPLE}[Iter 2]:${NC} Aligning core query sequences to reference alignment with PaPaRA... ${GREEN}done${NC}\n"
fi
printf "$(get_time) Placing query sequences to reference tree using ${CYAN}epa-ng${NC}...\n"
PRES=$(check_files_missing "${epang_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Placing query sequences to reference tree using ${CYAN}epa-ng${NC}... ${RED}skipped${NC}: File exists.\n"
else
# And place the sequences
rm -f "${epang_files[@]}"
mkdir -p EPA_PR
"${EPABIN}" --tree RAxML_result.OPT --ref-msa MICrhoDE_prot_aligned.fixed.fasta --query query.fasta --outdir EPA_PR --model "${BESTM}" -T "${THREADS}"
printf "$(get_time) Placing query sequences to reference tree using ${CYAN}epa-ng${NC}... ${GREEN}done${NC}\n"
fi
printf "$(get_time) Placing query sequences to reference tree using ${CYAN}pplacer${NC}...\n"
PRES=$(check_files_missing "${pplacer_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Placing query sequences to reference tree using ${CYAN}epa-ng${NC}... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${pplacer_files[@]}"
# Place sequences with pplacer
"${TAXITBIN}" create -l PR -P PR.refpkg --aln-fasta MICrhoDE_prot_aligned.fixed.fasta --tree-stats RAxML_info.OPT --tree-file RAxML_result.OPT
"${PPLACERBIN}" -o query.PR.jplace -p --keep-at-most 20 -c PR.refpkg query.fasta -j "${THREADS}"
printf "$(get_time) Placing query sequences to reference tree using ${CYAN}pplacer${NC}... ${GREEN}done${NC}\n"
fi
# 2. Assign taxonomy
# PRES=$(check_files_missing "${gappa_files_e[@]}")
# if [ "${PRES}" -eq 0 ]; then
# printf "$(get_time) Assigning SuperCluster affiliation with ${CYAN}gappa${NC} using the ${CYAN}epa-ng${NC} results... ${RED}skipped${NC}: File exists.\n"
# else
# rm -f "${gappa_files_e[@]}"
# gappa_assign EPA_PR/epa_result.jplace epa-ng
# printf "$(get_time) Assigning SuperCluster affiliation with ${CYAN}gappa${NC} using the ${CYAN}epa-ng${NC} results... ${GREEN}done${NC}\n"
# fi
#
# printf "$(get_time) Assigning SuperCluster affiliation with ${CYAN}gappa${NC} using the ${CYAN}pplacer${NC} results...\n"
# PRES=$(check_files_missing "${gappa_files_p[@]}")
# if [ "${PRES}" -eq 0 ]; then
# printf "$(get_time) Assigning SuperCluster affiliation with ${CYAN}gappa${NC} using the ${CYAN}pplacer${NC} results... ${RED}skipped${NC}: File exists.\n"
# else
# rm -f "${gappa_files_p[@]}"
# gappa_assign query.PR.jplace pplacer
# printf "$(get_time) Assigning SuperCluster affiliation with ${CYAN}gappa${NC} using the ${CYAN}pplacer${NC} results... ${GREEN}done${NC}\n"
# fi
printf "$(get_time) Grafting ${CYAN}epa-ng${NC} tree with ${CYAN}guppy${NC}...\n"
PRES=$(check_files_missing "${graft_files_e[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Grafting ${CYAN}epa-ng${NC} tree with ${CYAN}guppy${NC}... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${graft_files_e[@]}"
graft_tree EPA_PR/epa_result.jplace epa-ng
printf "$(get_time) Grafting ${CYAN}epa-ng${NC} tree with ${CYAN}guppy${NC}... ${GREEN}done${NC}\n"
fi
printf "$(get_time) Grafting ${CYAN}pplacer${NC} tree with ${CYAN}guppy${NC}...\n"
PRES=$(check_files_missing "${graft_files_p[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Grafting ${CYAN}pplacer${NC} tree with ${CYAN}guppy${NC}... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${graft_files_p[@]}"
graft_tree query.PR.jplace pplacer
printf "$(get_time) Grafting ${CYAN}pplacer${NC} tree with ${CYAN}guppy${NC}... ${GREEN}done${NC}\n"
fi
join -t $'\t' -1 1 -2 1 <(tail -n+2 MicRhoDE_051214.txt | cut -f3,8,9,10 | sort -k1,1) <(sort -k1,1 MICrhoDE_prot_aligned.name_mapping.tsv) | awk -vFS="\t" '{print $5"\t"$2"\t"$3"\t"$4}' > MICrhoDE_prot_aligned.SC.0.tsv
paste <(cut -f1 MICrhoDE_prot_aligned.SC.0.tsv) <(cut -f2- MICrhoDE_prot_aligned.SC.0.tsv | tr $'\t' ';') > MICrhoDE_prot_aligned.SC.tsv
printf "$(get_time) Assigning SuperCluster affiliation with using the ${CYAN}epa-ng${NC} results...\n"
PRES=$(check_files_missing "${annot_files_e[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Assigning SuperCluster affiliation with using the ${CYAN}epa-ng${NC} results... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${annot_files_e[@]}"
orf_assign grafted_tree.epa-ng.tre MICrhoDE_prot_aligned.SC.tsv epa-ng
printf "$(get_time) Assigning SuperCluster affiliation with using the ${CYAN}epa-ng${NC} results... ${GREEN}done${NC}\n"
fi
printf "$(get_time) Assigning SuperCluster affiliation with using the ${CYAN}pplacer${NC} results...\n"
PRES=$(check_files_missing "${annot_files_p[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Assigning SuperCluster affiliation with using the ${CYAN}pplacer${NC} results... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${annot_files_p[@]}"
orf_assign grafted_tree.pplacer.tre MICrhoDE_prot_aligned.SC.tsv pplacer
printf "$(get_time) Assigning SuperCluster affiliation with using the ${CYAN}pplacer${NC} results... ${GREEN}done${NC}\n"
fi
# Create data file for iTOL/alluvial using the grafted tree assignments
printf "$(get_time) Getting basic data for ${CYAN}iTOL${NC} and ${CYAN}alluvial${NC} plots...\n"
PRES=$(check_files_missing "${basic_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Getting basic data for ${CYAN}iTOL${NC} and ${CYAN}alluvial${NC} plots... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${basic_files[@]}"
create_basic_data "${basic_files_download[@]}"
printf "$(get_time) Getting basic data for ${CYAN}iTOL${NC} and ${CYAN}alluvial${NC} plots... ${GREEN}done${NC}\n"
fi
# Generate files for iTOL
# https://itol.embl.de/help/dataset_color_strip_template.txt
# 1. Assign colors to clusters and create strip file
printf "$(get_time) Creating data for ${CYAN}iTOL${NC}...\n"
PRES=$(check_files_missing "${itol_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Creating data for ${CYAN}iTOL${NC}... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${itol_files[@]}"
create_itol_data
printf "$(get_time) Creating data for ${CYAN}iTOL${NC}... ${GREEN}done${NC}\n"
fi
printf "$(get_time) Creating data for ${CYAN}alluvial${NC} plots...\n"
PRES=$(check_files_missing "${alluvial_files[@]}")
if [ "${PRES}" -eq 0 ]; then
printf "$(get_time) Creating data for ${CYAN}alluvial${NC} plots... ${RED}skipped${NC}: File exists.\n"
else
rm -f "${itol_files[@]}"
create_alluvial pplacer
create_alluvial epa-ng
printf "$(get_time) Creating data for ${CYAN}alluvial${NC} plots... ${GREEN}done${NC}\n"
fi
printf "\n\n$(get_time) ${RED}ANALYSIS FINISHED${NC}\n"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment