@genomewalker
Last active September 27, 2017 13:13
Tiny scripts to create amino acid sequences with different types of mutations
# Example for N. maritimus
# First we generate a set of mutants
python generate_mutants.py -g GCF_000018465.1_ASM1846v1_protein.faa -n 3 -o nmarit
# Label our seeds (references) by appending "_ref" to the first field of each FASTA header
awk '{if ($0 ~ /^>/) {print $1"_ref"}else{print $0}}' ../GCF_000018465.1_ASM1846v1_protein.faa > nmarit_aa_ref.fasta
# Concatenate mutants and references
cat nmar*sub.fasta nmar*ins.fasta nmar*del.fasta nmar*all.fasta nmarit_aa_ref.fasta > nmarit_all_aa.fasta
# Cluster them with the same approach we used for our clusters
# (unknown_clust.sh is adapted to run on macOS with Linuxbrew/Homebrew coreutils: gdate, gawk, gsed, gtar)
bash unknown_clust.sh nmarit_all_aa.fasta nmarit_aa_all_cls
# Parse clusters (the aaclust30 release date in the path is set by unknown_clust.sh at run time)
cp nmarit_aa_all_cls/2017_07/tmp/aaclust30_2017_07.tsv nmarit_aaclust30_2017_06.tsv
python parse_clusters.py -t nmarit_aaclust30_2017_06.tsv -o nmarit_aaclust30_2017_06_all -f nmarit_all_aa.fasta
# Plot results
./plot_mutants.r -a nmarit_aaclust30_2017_06_all_alns.tsv -s nmarit_aaclust30_2017_06_all.tsv -n nmarit
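# For reference: msbar is the EMBOSS mutation tool that generate_mutants.py
# (below) wraps. A minimal sketch of a single invocation, with placeholder
# file names: introduce 5 point substitutions (-point 4) per run, disable
# block mutations, run non-interactively and write the result to stdout
msbar -sequence seqs.fasta -count 5 -point 4 -block 0 -auto -stdout > seqs_mut.fasta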
# generate_mutants.py
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import argparse
import re
import subprocess
from tqdm import trange
def get_args():
    '''Parse and return the arguments passed to the script'''
    parser = argparse.ArgumentParser(
        description='Script uses EMBOSS msbar '
                    'to generate mutated AA sequences')
    parser.add_argument(
        '-g', '--genome',
        type=str,
        help='Genome file with amino acid sequences.',
        required=True)
    parser.add_argument(
        '-n', '--nmut',
        type=int,
        help='Number of mutated sequences per sequence.',
        required=True)
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Prefix for the output FASTA files.',
        required=True)
    args = parser.parse_args()
    return args.genome, args.nmut, args.output
def run_msbar(iden, mutrate, record, iter, muttype):
    """Run EMBOSS msbar on a single record and return the mutated record."""
    rec = SeqRecord(record.seq,
                    id=record.id,
                    name=record.name,
                    description="")
    seq_len = len(rec.seq)
    # number of mutations to introduce, as a fraction of the sequence length
    mut = int(mutrate * seq_len)
    ident = re.sub(r"\|", "-", rec.id) + "_" + str(iden) + "_" + str(iter)
    rec.id = ident
    rec.description = ""
    SeqIO.write(rec, "tmp.fasta", "fasta")
    command = ['msbar',
               '-sequence', 'tmp.fasta',
               '-count', str(mut),
               '-point', str(muttype),
               '-auto', '-stdout',
               '-block', '0']
    seqmut = subprocess.Popen(command, stdout=subprocess.PIPE,
                              universal_newlines=True)
    r = SeqIO.read(seqmut.stdout, "fasta")
    if seqmut.wait() != 0:
        raise subprocess.CalledProcessError(seqmut.returncode, command)
    return r
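# Naming scheme: a seed record with id "WP_014962345.1" (hypothetical accession)
# mutated with iden="90-sub" on iteration 2 is renamed "WP_014962345.1_90-sub_2";
# parse_clusters.py relies on this "<orf>_<level>-<type>_<iter>" pattern.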
# msbar -point codes: 2 = insertions, 3 = deletions, 4 = substitutions;
# "2,3,4" lets msbar pick among all three for every mutation
MUT_CODES = {'sub': 4, 'ins': 2, 'del': 3, 'all': '2,3,4'}

# identity-level label -> fraction of residues to mutate
MUT_RATES = (('90', 0.1), ('50', 0.5), ('30', 0.7))


def run_msbar_set(record, mtype, file90, file50, file30, i):
    """Write one mutant of `record` per identity level for one mutation type."""
    handles = {'90': file90, '50': file50, '30': file30}
    for label, rate in MUT_RATES:
        mutant = run_msbar(iden=label + '-' + mtype,
                           mutrate=rate,
                           record=record,
                           muttype=MUT_CODES[mtype],
                           iter=i)
        SeqIO.write(mutant, handles[label], 'fasta')
if __name__ == "__main__":
    infile, n_mut, prefix = get_args()
    # one output file per identity level and mutation type,
    # named <prefix>_aa<level>_<type>.fasta (e.g. nmarit_aa90_sub.fasta)
    handles = {}
    for label, _ in MUT_RATES:
        for mtype in MUT_CODES:
            handles[(label, mtype)] = open(
                "{}_aa{}_{}.fasta".format(prefix, label, mtype), "w")
    for record in SeqIO.parse(infile, "fasta"):
        t = trange(n_mut, desc=record.id, leave=False)
        for i in t:
            for mtype in MUT_CODES:
                run_msbar_set(record, mtype,
                              handles[('90', mtype)],
                              handles[('50', mtype)],
                              handles[('30', mtype)],
                              i)
    for handle in handles.values():
        handle.close()
# parse_clusters.py
import argparse
from collections import defaultdict, OrderedDict, Counter
import csv
import re
from Bio import SeqIO
import subprocess
import os
import errno
import glob
import shutil
from statsmodels import robust
import pandas as pd
from tqdm import tqdm
def get_args():
    '''Parse and return the arguments passed to the script'''
# Assign description to the help doc
parser = argparse.ArgumentParser(
description='Script to parse mmseqs tsv files')
# Add arguments
parser.add_argument(
'-t', '--tsvfile',
type=str,
help='MMSEQS tsv file',
required=True)
parser.add_argument(
'-o', '--output',
type=str,
help='Output name',
required=True)
parser.add_argument(
'-f', '--fastafile',
type=str,
help='Fasta file with ORFs used for clustering',
required=True)
# Array for all arguments passed to script
args = parser.parse_args()
# Assign args to variables
infile = args.tsvfile
outfile = args.output
fasta_file = args.fastafile
# Return all variable values
return infile, outfile, fasta_file
class MyCounter(Counter):
def __str__(self):
return ",".join('{}:{}'.format(k, v) for k, v in self.items())
def create_db(f_file, outfile):
command = ['mmseqs', 'createdb', f_file, outfile + '_seqDB']
try:
subprocess.check_output(command)
except subprocess.CalledProcessError as e:
print(e.output)
print(e.returncode)
def search_db(outfile):
tmp = outfile + '_tmp'
os.mkdir(tmp)
command = ['mmseqs',
'search',
outfile + '_seqDB',
outfile + '_seqDB',
outfile + '_seqDB.aln.db',
tmp, '-a',
'--comp-bias-corr', '0',
'-e', '10',
'--threads', '4'
]
try:
subprocess.check_output(command)
except subprocess.CalledProcessError as e:
print(e.output)
print(e.returncode)
def get_alns(outfile):
command = ['mmseqs',
'convertalis',
outfile + '_seqDB',
outfile + '_seqDB',
outfile + '_seqDB.aln.db',
outfile + '_seqDB.aln.m8'
]
try:
subprocess.check_output(command)
except subprocess.CalledProcessError as e:
print(e.output)
print(e.returncode)
def parse_alns(n_cls, values, outfile):
    # BLAST-m8 table from convertalis: column 2 (0-based) is the pairwise identity
    df = pd.read_table(outfile + '_seqDB.aln.m8', header=None)
    mean = df.iloc[:, 2].mean()
    median = df.iloc[:, 2].median()
    mad = robust.mad(df.iloc[:, 2], c=1)
    # tag every alignment row with its cluster id (13th, unnamed column)
    df.insert(12, '', 'cls_' + str(n_cls))
    return df, mean, median, mad
def clean_db(filename):
try:
for fl in glob.glob(filename):
os.remove(fl)
except OSError as e: # this would be "except OSError, e:" before Python 2.6
if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
raise # re-raise exception if a different error occurred
def clean_tmp(outfile):
try:
shutil.rmtree(outfile + '_tmp')
except OSError as e: # this would be "except OSError, e:" before Python 2.6
if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
raise # re-raise exception if a different error occurred
def mmseqs_seq_net(values, seq_dic, n_cls, outfile):
# write elements to file
tf_name = outfile + "_cls_tmp.fasta"
tmp_fasta = open(tf_name, "w")
for k in values:
SeqIO.write(seq_dic[k], tmp_fasta, "fasta")
tmp_fasta.close()
clean_tmp(outfile)
clean_db(outfile + "_seqDB*")
create_db(tf_name, outfile)
search_db(outfile)
get_alns(outfile)
df, mean, median, mad = parse_alns(n_cls, values, outfile)
return df, mean, median, mad
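# For reference, mmseqs_seq_net() above runs, per cluster with >1 member:
#   mmseqs createdb    -> build a sequence DB from the cluster members
#   mmseqs search      -> all-vs-all search of the DB against itself
#   mmseqs convertalis -> export the hits as a BLAST-m8 table
# and then summarises the pairwise identities with parse_alns().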
def create_seq_dic(fasta_file):
    # lazy, on-disk index of the FASTA file with dict-like access by record id
    return SeqIO.index(fasta_file, "fasta")
if __name__ == "__main__":
infile, outfile, fasta_file = get_args()
d1 = defaultdict(list)
    sdic = create_seq_dic(fasta_file)
n_cls = 0
pd_all = []
    with open(infile) as f:
        for line in f:
            k = line.strip().split('\t')
            d1[k[0]].append(k[1])
    # process the largest clusters first
    d1 = OrderedDict(sorted(d1.items(),
                            key=lambda kv: len(kv[1]),
                            reverse=True))
rref = re.compile(r'.*_ref')
rsub = re.compile(r'.*-sub.*')
rins = re.compile(r'.*-ins.*')
rdel = re.compile(r'.*-del.*')
rall = re.compile(r'.*-all.*')
rsub90 = re.compile(r'.*90-sub.*')
rins90 = re.compile(r'.*90-ins.*')
rdel90 = re.compile(r'.*90-del.*')
rall90 = re.compile(r'.*90-all.*')
rsub50 = re.compile(r'.*50-sub.*')
rins50 = re.compile(r'.*50-ins.*')
rdel50 = re.compile(r'.*50-del.*')
rall50 = re.compile(r'.*50-all.*')
rsub30 = re.compile(r'.*30-sub.*')
rins30 = re.compile(r'.*30-ins.*')
rdel30 = re.compile(r'.*30-del.*')
rall30 = re.compile(r'.*30-all.*')
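    # Member IDs follow "<orf>_<level>-<type>_<iter>" for mutants (e.g. the
    # hypothetical "WP_0001_90-sub_2") and "<orf>_ref" for seeds; the patterns
    # above classify cluster members by identity level and mutation type.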
    with open(outfile + ".tsv", 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter="\t")
header = ["cls_num", "num_mem", "is_rep_ref", "reference", "num_ref",
"num_sub", "num_ins", "num_del", "num_all", "num_diff", "diff_orf",
"mean_iden", "median_iden", "mad_iden",
"num_sub90", "num_ins90", "num_del90", "num_all90",
"num_sub50", "num_ins50", "num_del50", "num_all50",
"num_sub30", "num_ins30", "num_del30", "num_all30",
"members"]
writer.writerow(header)
# t = trange(len(d1), desc='cls_' + n_cls, leave=False)
for key, value in tqdm(d1.items(), desc='Processing', leave=False):
n_cls = n_cls + 1
            nref = sum(1 for m in value if rref.match(m))
            nsub = sum(1 for m in value if rsub.match(m))
            nins = sum(1 for m in value if rins.match(m))
            ndel = sum(1 for m in value if rdel.match(m))
            nall = sum(1 for m in value if rall.match(m))
            nsub90 = sum(1 for m in value if rsub90.match(m))
            nins90 = sum(1 for m in value if rins90.match(m))
            ndel90 = sum(1 for m in value if rdel90.match(m))
            nall90 = sum(1 for m in value if rall90.match(m))
            nsub50 = sum(1 for m in value if rsub50.match(m))
            nins50 = sum(1 for m in value if rins50.match(m))
            ndel50 = sum(1 for m in value if rdel50.match(m))
            nall50 = sum(1 for m in value if rall50.match(m))
            nsub30 = sum(1 for m in value if rsub30.match(m))
            nins30 = sum(1 for m in value if rins30.match(m))
            ndel30 = sum(1 for m in value if rdel30.match(m))
            nall30 = sum(1 for m in value if rall30.match(m))
            repres = bool(rref.match(key))
            # strip the mutant/seed suffix to recover the parent ORF id
            counts = Counter([re.sub(r"_\d\d-\w\w\w_..*|_ref", "", m) for m in value])
if len(value) > 1:
df, mean, median, mad = mmseqs_seq_net(value, sdic, n_cls, outfile)
pd_all.append(df)
else:
df = mean = median = mad = 'NA'
writer.writerow(["cls_" + str(n_cls), len(value), repres, key, nref,
nsub, nins, ndel, nall, len(counts),
MyCounter(counts), mean, median, mad,
nsub90, nins90, ndel90, nall90,
nsub50, nins50, ndel50, nall50,
nsub30, nins30, ndel30, nall30,
','.join(value)])
df = pd.concat(pd_all, ignore_index=True)
df.to_csv(outfile + "_alns.tsv", sep='\t')
# Reduced variant of parse_clusters.py (same pipeline, fewer output columns;
# the workflow above uses the full version)
import argparse
from collections import defaultdict, OrderedDict, Counter
import csv
import re
from Bio import SeqIO
import subprocess
import os
import errno
import glob
import shutil
from statsmodels import robust
import pandas as pd
from tqdm import tqdm
def get_args():
    '''Parse and return the arguments passed to the script'''
# Assign description to the help doc
parser = argparse.ArgumentParser(
description='Script to parse mmseqs tsv files')
# Add arguments
parser.add_argument(
'-t', '--tsvfile',
type=str,
help='MMSEQS tsv file',
required=True)
parser.add_argument(
'-o', '--output',
type=str,
help='Output name',
required=True)
parser.add_argument(
'-f', '--fastafile',
type=str,
help='Fasta file with ORFs used for clustering',
required=True)
# Array for all arguments passed to script
args = parser.parse_args()
# Assign args to variables
infile = args.tsvfile
outfile = args.output
fasta_file = args.fastafile
# Return all variable values
return infile, outfile, fasta_file
class MyCounter(Counter):
def __str__(self):
return ",".join('{}:{}'.format(k, v) for k, v in self.items())
def create_db(f_file, outfile):
command = ['mmseqs', 'createdb', f_file, outfile + '_seqDB']
try:
subprocess.check_output(command)
except subprocess.CalledProcessError as e:
print(e.output)
print(e.returncode)
def search_db(outfile):
tmp = outfile + '_tmp'
os.mkdir(tmp)
command = ['mmseqs',
'search',
outfile + '_seqDB',
outfile + '_seqDB',
outfile + '_seqDB.aln.db',
tmp, '-a',
'--comp-bias-corr', '0',
'-e', '10',
'--threads', '4'
]
try:
subprocess.check_output(command)
except subprocess.CalledProcessError as e:
print(e.output)
print(e.returncode)
def get_alns(outfile):
command = ['mmseqs',
'convertalis',
outfile + '_seqDB',
outfile + '_seqDB',
outfile + '_seqDB.aln.db',
outfile + '_seqDB.aln.m8'
]
try:
subprocess.check_output(command)
except subprocess.CalledProcessError as e:
print(e.output)
print(e.returncode)
def parse_alns(n_cls, values, outfile):
df = pd.read_table(outfile + '_seqDB.aln.m8', header=None)
mean = df.iloc[:, 2].mean()
median = df.iloc[:, 2].median()
mad = robust.mad(df.iloc[:, 2], c=1)
df.insert(12, '', 'cls_' + str(n_cls))
return df, mean, median, mad
def clean_db(filename):
try:
for fl in glob.glob(filename):
os.remove(fl)
except OSError as e: # this would be "except OSError, e:" before Python 2.6
if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
raise # re-raise exception if a different error occurred
def clean_tmp(outfile):
try:
shutil.rmtree(outfile + '_tmp')
except OSError as e: # this would be "except OSError, e:" before Python 2.6
if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
raise # re-raise exception if a different error occurred
def mmseqs_seq_net(values, seq_dic, n_cls, outfile):
# write elements to file
tf_name = outfile + "_cls_tmp.fasta"
tmp_fasta = open(tf_name, "w")
for k in values:
SeqIO.write(seq_dic[k], tmp_fasta, "fasta")
tmp_fasta.close()
clean_tmp(outfile)
clean_db(outfile + "_seqDB*")
create_db(tf_name, outfile)
search_db(outfile)
get_alns(outfile)
df, mean, median, mad = parse_alns(n_cls, values, outfile)
return df, mean, median, mad
def create_seq_dic(fasta_file):
    # lazy, on-disk index of the FASTA file with dict-like access by record id
    return SeqIO.index(fasta_file, "fasta")
if __name__ == "__main__":
infile, outfile, fasta_file = get_args()
d1 = defaultdict(list)
    sdic = create_seq_dic(fasta_file)
n_cls = 0
pd_all = []
with open(infile) as f:
for line in f:
            k = line.strip().split('\t')
            d1[k[0]].append(k[1])
    # process the largest clusters first
    d1 = OrderedDict(sorted(d1.items(),
                            key=lambda kv: len(kv[1]),
                            reverse=True))
    with open(outfile + ".tsv", 'w', newline='') as csv_file:
writer = csv.writer(csv_file, delimiter="\t")
header = ["cls_num", "num_mem", "reference",
"num_diff", "diff_orf",
"mean_iden", "median_iden", "mad_iden",
"members"]
writer.writerow(header)
# t = trange(len(d1), desc='cls_' + n_cls, leave=False)
for key, value in tqdm(d1.items(), desc='Processing', leave=False):
n_cls = n_cls + 1
            counts = Counter([re.sub(r"_\d\d-\w\w\w_..*|_ref", "", m) for m in value])
if len(value) > 1:
df, mean, median, mad = mmseqs_seq_net(value, sdic, n_cls, outfile)
pd_all.append(df)
else:
df = mean = median = mad = 'NA'
writer.writerow(["cls_" + str(n_cls), len(value), key, len(counts),
MyCounter(counts), mean, median, mad,
','.join(value)])
df = pd.concat(pd_all, ignore_index=True)
df.to_csv(outfile + "_alns.tsv", sep='\t')
#!/usr/bin/env Rscript
# plot_mutants.r
suppressMessages(library("optparse", quietly = TRUE))
suppressMessages(library(tidyverse, quietly = TRUE))
suppressMessages(library(igraph, quietly = TRUE))
suppressMessages(library(pbmcapply, quietly = TRUE))
suppressMessages(library(cowplot, quietly = TRUE))
options(warn=-1)
option_list = list(
make_option(c("-a", "--aln"), type="character", default=NULL,
help="cluster alignment file", metavar="character"),
make_option(c("-s", "--stats"), type="character", default=NULL,
help="cluster statistic file", metavar="character"),
make_option(c("-n", "--namefile"), type="character", default=NULL,
help="Filename to save", metavar="character")
)
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$aln) || is.null(opt$stats) || is.null(opt$namefile)){
  print_help(opt_parser)
  stop("All three arguments (--aln, --stats, --namefile) must be supplied\n", call.=TRUE)
}
# Binary search over the sorted unique edge weights w: find the highest
# threshold whose removal of weaker edges still leaves the graph connected
binary_search_filter_1 <- function(g = NULL, low = NULL, high = NULL, w = NULL) {
  x <- g
  if (high < low) {
    return(NULL)
  }else {
    mid <- floor((low + high) / 2)
    x <- delete.edges(g, which(E(g)$weight <= w[mid]))
    if ((mid - low) == 0){
      x <- delete.edges(g, which(E(g)$weight < w[mid - 1]))
      return(list(weight = mid - 1, graph = x))
    }
    if ((high - low) == 0){
      x <- delete.edges(g, which(E(g)$weight < w[mid + 1]))
      return(list(weight = mid, graph = x))
    }
    if (!(is.connected(x))){
      binary_search_filter_1(g, low, mid - 1, w)
    }else{
      binary_search_filter_1(g, mid + 1, high, w)
    }
  }
}
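# Usage sketch (mirroring the call in get_refs() below): prune a weighted
# igraph object g down to its strongest still-connected backbone
#   ws <- unique(sort(E(g)$weight, decreasing = FALSE))
#   g_filt <- binary_search_filter_1(g = g, low = 1, high = length(ws), w = ws)$graph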
get_refs <- function(X = X, aln = aln, data = data){
if ((data %>% filter(cls_num == X) %>% .$num_mem >= 2) & (data %>% filter(cls_num == X) %>% .$num_ref > 0)) {
ref <- data %>% filter(cls_num == X) %>% .$reference
l <- aln %>%
filter(cls == X) %>%
graph_from_data_frame(directed = FALSE) %>%
simplify(remove.multiple = TRUE, remove.loops = TRUE, edge.attr.comb = "max")
weights <- unique(sort(E(l)$weight, decreasing = FALSE))
if (length(weights) > 0){
l <- binary_search_filter_1(g = l, low = 1, high = length(weights), w = weights)$graph
s <- strength(l) %>% sort %>% tail(1) %>% names()
b <- betweenness(l, weights = 1/E(l)$weight, directed = FALSE, normalized = TRUE) %>% sort %>% tail(1) %>% names()
p <- page_rank(l, weights = E(l)$weight, directed = FALSE)$vector %>% sort %>% tail(1) %>% names()
e <- eigen_centrality(l, weights = E(l)$weight, directed = FALSE)$vector %>% sort %>% tail(1) %>% names()
c <- closeness(l, weights = 1/E(l)$weight, normalized = TRUE) %>% sort %>% tail(1) %>% names()
d <- data.frame(cls = X, cl_ref = ref, s_ref = s, b_ref = b, pr_ref = p, e_ref = e, c_ref = c, d = graph.density(l))
return(d)
}else{
d <- data.frame(cls = X, cl_ref = ref, s_ref = NA, b_ref = NA, pr_ref = NA, e_ref = NA, c_ref = NA, d = NA)
return(d)
}
}
# }else{
# d <- data.frame(cls = X, cl_ref = ref, s_ref = NA, b_ref = NA, pr_ref = NA, e_ref = NA, d = NA)
# }
}
suppressMessages(df <- read_tsv(opt$stats, col_names = T))
suppressMessages(alns <- read_tsv(opt$aln, col_names = F))
alns <- alns %>%
select(X2,X3,X4, X14) %>%
rename(node1 = X2, node2 = X3, weight = X4, cls = X14) %>%
filter(!is.na(cls))
# Size of clusters and mix
cat("Number of clusters:", df %>% .$cls_num %>% length,
"\nNumber of singletons:", df %>% filter(num_mem == 1) %>% .$cls_num %>% length,
"\nNumber of clusters > 1 members:", df %>% filter(num_mem > 1) %>% .$cls_num %>% length, "\n")
df_sing <- df %>%
filter(num_mem == 1) %>%
select(starts_with('num')) %>%
select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>%
gather() %>%
mutate(key = gsub('num_',"", key)) %>%
  mutate(Identity = paste(gsub('ins|sub|del|all',"", key), "%", sep = ""), type = gsub('90|50|30',"", key)) %>%
select(-key) %>%
group_by(Identity, type) %>%
summarise(N=sum(value)) %>%
mutate(class = "Singletons")
df_no_ref <- df %>%
filter(num_mem > 1, num_ref < 1) %>%
select(starts_with('num')) %>%
select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>%
gather() %>%
mutate(key = gsub('num_',"", key)) %>%
mutate(Identity = paste(gsub('ins|sub|del|all',"", key), "%", sep = ""), type = gsub('90|50|30',"", key)) %>%
select(-key) %>%
group_by(Identity, type) %>%
summarise(N=sum(value)) %>%
mutate(class = "Without seeds")
df_ref <- df %>%
filter(num_mem > 1, num_ref > 0) %>%
select(starts_with('num')) %>%
select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>%
gather() %>%
mutate(key = gsub('num_',"", key)) %>%
  mutate(Identity = paste(gsub('ins|sub|del|all',"", key), "%", sep = ""), type = gsub('90|50|30',"", key)) %>%
select(-key) %>%
group_by(Identity, type) %>%
summarise(N=sum(value)) %>%
mutate(class = "With seeds")
df_mut <- bind_rows(df_sing, df_no_ref, df_ref)
df_mut$Identity <- plyr::mapvalues(df_mut$Identity, from = c("90%", "50%", "30%"), to = c("10%", "50%", "70%"))
df_mut$Identity <- factor(df_mut$Identity, levels = c("10%", "50%", "70%"))
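# The 90/50/30 labels are nominal identity levels; mapvalues() above relabels
# them to the fraction of mutated residues used by generate_mutants.py
# (90 -> 10%, 50 -> 50%, 30 -> 70%).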
p1 <- ggplot(df_mut %>% filter(class == "Singletons"), aes(type, N, fill = Identity)) +
geom_bar(stat = "identity") +
theme_light() +
ylab("Number of ORFs") +
xlab("Type of mutation") +
facet_wrap(~class) +
scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00")) +
guides(fill=guide_legend(title="Modified\nresidues"))
p2 <- ggplot(df_mut %>% filter(class == "Without seeds"), aes(type, N, fill = Identity)) +
geom_bar(stat = "identity") +
theme_light() +
ylab("Number of ORFs") +
xlab("Type of mutation") +
facet_wrap(~class) +
scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00"))
p3 <- ggplot(df_mut %>% filter(class == "With seeds"), aes(type, N, fill = Identity)) +
geom_bar(stat = "identity") +
theme_light() +
ylab("Number of ORFs") +
xlab("Type of mutation") +
facet_wrap(~class) +
scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00"))
prow <- plot_grid( p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
p3 + theme(legend.position="none"),
align = 'vh',
labels = c("A", "B", "C"),
hjust = -1,
nrow = 1
)
legend <- get_legend(p1)
p <- plot_grid( prow, legend, rel_widths = c(3, .3))
fname <- paste(opt$namefile, "_seed.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, width = 11, height = 3, units = "in", device = "pdf")
p1 <- bind_rows(df %>%
filter(num_mem > 1, num_ref < 1) %>% mutate(class = "Without seeds"),
df %>%
filter(num_mem > 1, num_ref >= 1) %>% mutate(class = "With seeds")) %>%
ggplot(aes(num_diff, fill = class)) +
geom_bar(alpha = 0.8) +
theme_light() +
xlab("Different ORFs per cluster") +
ylab("Number of clusters") +
theme(legend.title = element_blank()) +
scale_fill_manual(values = c("#C84359", "#4A4A4A"))
p2 <- bind_rows(df %>%
filter(num_mem > 1, num_ref < 1) %>% mutate(class = "Without seeds"),
df %>%
filter(num_mem > 1, num_ref >= 1) %>% mutate(class = "With seeds")) %>%
ggplot(aes(median_iden, fill = class)) +
geom_density(alpha = 0.8) +
theme_light() +
ylab("Density") +
xlab("Median similarity") +
theme(legend.title = element_blank()) +
scale_fill_manual(values = c("#C84359", "#4A4A4A"))
prow <- plot_grid( p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh',
labels = c("A", "B"),
hjust = -1,
nrow = 1
)
legend <- get_legend(p1)
p <- plot_grid(prow, legend, rel_widths = c(3, .5))
fname <- paste(opt$namefile, "_medsim.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, width = 11, height = 3, units = "in", device = "pdf")
k <- pbmclapply(X = df$cls_num, get_refs, aln = alns, data = df, mc.cores = 4, ignore.interactive = TRUE)
k <- bind_rows(k) %>% tbl_df()
d <- data_frame(
MMSEQS = k %>% filter(grepl("ref", cl_ref)) %>% .$cls %>% length(),
Strength = k %>% filter(grepl("ref", s_ref)) %>% .$cls %>% length(),
  Betweenness = k %>% filter(grepl("ref", b_ref)) %>% .$cls %>% length(),
  Pagerank = k %>% filter(grepl("ref", pr_ref)) %>% .$cls %>% length(),
  Eigenvector = k %>% filter(grepl("ref", e_ref)) %>% .$cls %>% length(),
Closeness = k %>% filter(grepl("ref", c_ref)) %>% .$cls %>% length()
) %>% gather %>% mutate(Proportion = value/dim(k)[1])
d$key <- factor(d$key, levels = d %>% arrange(desc(value)) %>% .$key)
p <- ggplot(d, aes(key, Proportion)) +
geom_bar(stat = "identity", fill = "#4A4A4A") +
theme_light() +
xlab("Representative method") +
geom_text(aes(label=value), hjust=0.5, vjust=-.3, size = 3) +
scale_y_continuous(labels = scales::percent) +
ggtitle("Seed sequences as cluster representative") +
theme(plot.title = element_text(size = 10, face = "bold"))
fname <- paste(opt$namefile, "_repmet.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, device = "pdf", width = 8, height = 4, units = "in")
#k %>% mutate(same_ref = ifelse(cl_ref == e_ref, TRUE, FALSE)) %>% group_by(same_ref) %>% count
# k %>%
# ggplot(aes(d)) +
# geom_density(fill = "#4A4A4A") +
# theme_light() +
# ylab("Density") +
# xlab("Graph density")
#!/bin/bash
# unknown_clust.sh
set -e
set -x
#[ -z "$MMDIR" ] && echo "Please set the environment variable \$MMDIR to your MMSEQS installation directory." && exit 1;
[ "$#" -lt 2 ] && echo "Please provide <sequenceDB> <outDir>" && exit 1;
[ ! -f "$1" ] && echo "Sequence database $1 not found!" && exit 1;
[ -d "$2" ] && echo "Output directory $2 exists already!" && exit 1;
function abspath() {
if [ -d "$1" ]; then
(cd "$1"; pwd)
elif [ -f "$1" ]; then
if [[ $1 == */* ]]; then
echo "$(cd "${1%/*}"; pwd)/${1##*/}"
else
echo "$(pwd)/$1"
fi
fi
}
export OMP_NUM_THREADS=4
MMSEQSEXE="mmseqs"
RELEASE="${3:-$(gdate "+%Y_%m")}"
SHORTRELEASE="${4:-$(gdate "+%y%m")}"
INPUT=$1
OUTDIR=$2/$RELEASE
TMPDIR=$OUTDIR/tmp
mkdir -p $TMPDIR
OUTDIR=$(abspath $OUTDIR)
TMPDIR=$(abspath $TMPDIR)
PREFILTER_COMMON="$COMMON --diag-score 1 --min-ungapped-score 15 --spaced-kmer-mode 1 --max-seq-len 32768 --split-mode 1"
PREFILTER0_PAR="--max-seqs 20 --comp-bias-corr 0 --k-score 145 ${PREFILTER_COMMON}"
PREFILTER1_PAR="--max-seqs 100 --comp-bias-corr 1 --k-score 135 ${PREFILTER_COMMON}"
PREFILTER2_PAR="--max-seqs 300 --comp-bias-corr 1 --k-score 90 -k 6 ${PREFILTER_COMMON}"
ALIGNMENT_COMMON="$COMMON -e 0.001 --max-seq-len 32768 --max-rejected 2147483647"
ALIGNMENT0_PAR="--max-seqs 20 -c 0.9 --alignment-mode 2 --min-seq-id 0.9 --comp-bias-corr 0 ${ALIGNMENT_COMMON}"
ALIGNMENT1_PAR="--max-seqs 100 -c 0.9 --alignment-mode 2 --min-seq-id 0.9 --comp-bias-corr 1 ${ALIGNMENT_COMMON}"
ALIGNMENT2_PAR="--max-seqs 300 -c 0.8 --alignment-mode 3 --min-seq-id 0.3 --comp-bias-corr 1 ${ALIGNMENT_COMMON}"
CLUSTER0_PAR="--cluster-mode 2"
CLUSTER1_PAR="--cluster-mode 0"
CLUSTER2_PAR="--cluster-mode 0"
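# Note: in MMseqs2, --cluster-mode 0 selects the greedy set-cover algorithm and
# --cluster-mode 2 the CD-HIT-like greedy incremental clustering by length.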
SEARCH_PAR="$COMMON --profile --k-score 100"
CSTRANSLATE_PAR="-x 0.3 -c 4 -A $HHLIB/data/cs219.lib -D $HHLIB/data/context_data.lib -I ca3m -f -b"
SEQUENCE_DB="$OUTDIR/aaprot_db"
printf "###############################\n"
printf "#### Creating DB\n"
printf "###############################\n"
#export OMP_PROC_BIND=true
export OMP_NUM_THREADS=4
# we split all sequences that are above 14k in N/14k parts
mmseqs createdb "$INPUT" "${SEQUENCE_DB}" --max-seq-len 32768
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Filtering redundancy\n"
printf "###############################\n"
# filter redundancy
INPUT="${SEQUENCE_DB}"
mmseqs clusthash $INPUT "$TMPDIR/aln_redundancy" --min-seq-id 0.9 --threads 4
gdate --rfc-3339=seconds
mmseqs clust $INPUT "$TMPDIR/aln_redundancy" "$TMPDIR/clu_redundancy" ${CLUSTER1_PAR} --threads 4
gdate --rfc-3339=seconds
gawk '{ print $1 }' "$TMPDIR/clu_redundancy.index" > "$TMPDIR/order_redundancy"
mmseqs createsubdb "$TMPDIR/order_redundancy" $INPUT "$TMPDIR/input_step0"
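# createsubdb keeps only the cluster representatives; they become the input of
# the next, more sensitive cascade step.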
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Cluster at 90%% \n"
printf "###############################\n"
#export OMP_NUM_THREADS=16
# go down to 90% and merge fragments (accept fragment if dbcov >= 0.95 && seqId >= 0.9)
STEP=0
INPUT="$TMPDIR/input_step0"
mmseqs prefilter "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" ${PREFILTER0_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT0_PAR} --threads 4
gdate --rfc-3339=seconds
export OMP_NUM_THREADS=4
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_step$STEP" ${CLUSTER0_PAR} --threads 4
gdate --rfc-3339=seconds
gawk '{ print $1 }' "$TMPDIR/clu_step$STEP.index" > "$TMPDIR/order_step$STEP"
mmseqs createsubdb "$TMPDIR/order_step$STEP" $INPUT "$TMPDIR/input_step1"
gdate --rfc-3339=seconds
#export OMP_NUM_THREADS=16
# go down to 90% (this step is needed to create big clusters)
STEP=1
INPUT="$TMPDIR/input_step1"
mmseqs prefilter "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" ${PREFILTER1_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT1_PAR} --threads 4
gdate --rfc-3339=seconds
#export OMP_NUM_THREADS=64
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_step$STEP" ${CLUSTER1_PAR} --threads 4
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Merge redundancy to 90%% clustering\n"
printf "###############################\n"
# create database aaclust 90% (we need to merge redundancy, step_0 and step_1)
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust90_$RELEASE \
"$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1
gdate --rfc-3339=seconds
gawk '{ print $1 }' "$TMPDIR/clu_step$STEP.index" > "$TMPDIR/order_step$STEP"
mmseqs createsubdb "$TMPDIR/order_step$STEP" $INPUT "$TMPDIR/input_step2"
#export OMP_NUM_THREADS=16
# now we cluster down to 30% sequence id to produce a 30% and 50% clustering
STEP=2
INPUT=$TMPDIR/input_step2
gdate --rfc-3339=seconds
mmseqs prefilter $INPUT $INPUT "$TMPDIR/pref_step$STEP" ${PREFILTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align $INPUT $INPUT "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT2_PAR} --threads 4
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Cluster at 50%%\n"
printf "###############################\n"
#export OMP_NUM_THREADS=64
# cluster down to 50%
mmseqs filterdb "$TMPDIR/aln_step$STEP" "$TMPDIR/aln_aaclust50" \
--filter-column 3 --filter-regex '(0\.[5-9][0-9]{2}|1\.000)' --threads 4
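# The regex above keeps only alignment rows whose sequence identity (column 3
# of the alignment result) is >= 0.500, so the following clust run yields the
# 50% clustering.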
gdate --rfc-3339=seconds
mmseqs clust $INPUT "$TMPDIR/aln_aaclust50" "$TMPDIR/clu_aaclust50" ${CLUSTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust50_$RELEASE \
"$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1 $TMPDIR/clu_aaclust50
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Cluster at 30%%\n"
printf "###############################\n"
# cluster down to 30%
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_aaclust30" ${CLUSTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust30_$RELEASE \
"$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1 $TMPDIR/clu_aaclust30
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Prepare output\n"
printf "###############################\n"
#export OMP_NUM_THREADS=16
# generate the final aaclust output: the _seed, _consensus and .tsv files;
# also generate the profiles needed for the aaboost
for i in 30 50 90; do
mmseqs result2profile "${SEQUENCE_DB}" "${SEQUENCE_DB}" "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_profile" --threads 16
ln -sf "${SEQUENCE_DB}_h" "$TMPDIR/aaclust${i}_${RELEASE}_profile_h"
ln -sf "${SEQUENCE_DB}_h.index" "$TMPDIR/aaclust${i}_${RELEASE}_profile_h.index"
ln -sf "${SEQUENCE_DB}_h" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus_h"
ln -sf "${SEQUENCE_DB}_h.index" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus_h.index"
#fixme: won't work with updating
mmseqs mergedbs "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_seed" "$OUTDIR/aaprot_db_h" "$OUTDIR/aaprot_db" --prefixes ">"
rm -f "$TMPDIR/aaclust${i}_${RELEASE}_seed.index"
gsed -i 's/\x0//g' "$TMPDIR/aaclust${i}_${RELEASE}_seed"
mmseqs summarizeheaders "${SEQUENCE_DB}_h" "${SEQUENCE_DB}_h" "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_summary" --summary-prefix "${i}-${SHORTRELEASE}"
mmseqs mergedbs "$OUTDIR/aaclust${i}_$RELEASE" "$TMPDIR/aaclust${i}_${RELEASE}_consensus" "$TMPDIR/aaclust${i}_${RELEASE}_summary" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus" --prefixes ">
"
rm -f "$TMPDIR/aaclust${i}_${RELEASE}_consensus.index"
gsed -i 's/\x0//g' "$TMPDIR/aaclust${i}_${RELEASE}_consensus"
mmseqs createtsv "${SEQUENCE_DB}" "${SEQUENCE_DB}" "$OUTDIR/aaclust${i}_$RELEASE" "$TMPDIR/aaclust${i}_$RELEASE.tsv"
mv -f "$TMPDIR/aaclust${i}_${RELEASE}_seed" "$TMPDIR/aaclust${i}_${RELEASE}_seed.fasta"
mv -f "$TMPDIR/aaclust${i}_${RELEASE}_consensus" "$TMPDIR/aaclust${i}_${RELEASE}_consensus.fasta"
gtar -cv --use-compress-program=pigz --show-transformed-names --transform "s|${TMPDIR:1}/|aaclust${i}_${RELEASE}/|g" -f "$OUTDIR/aaclust${i}_${RELEASE}.tar.gz" "$TMPDIR/aaclust${i}_$RELEASE.tsv" "$TMPDIR/aaclust${i}_${RELEASE}_consensus.fasta" "$TMPDIR/aaclust${i}_${RELEASE}_seed.fasta"
done