Tiny scripts to create amino acid sequences with different types of mutations
# Example for N. maritimus
# First we generate a set of mutants
python generate_mutants.py -g GCF_000018465.1_ASM1846v1_protein.faa -n 3 -o nmarit
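# This writes twelve FASTA files, one per identity level and mutation type:
# nmarit_aa{90,50,30}_{sub,ins,del,all}.fasta (the naming assumes the -o prefix
# handling added to generate_mutants.py below; 90/50/30 map to msbar mutation
# rates of 0.1/0.5/0.7 of the residues)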
# We label our seeds or references
awk '{if ($0 ~ /^>/) {print $1"_ref"}else{print $0}}' ../GCF_000018465.1_ASM1846v1_protein.faa > nmarit_aa_ref.fasta
# Concatenate mutants and references
cat nmar*sub.fasta nmar*ins.fasta nmar*del.fasta nmar*all.fasta nmarit_aa_ref.fasta > nmarit_all_aa.fasta
# Cluster them with the same approach we used for our clusters
# (code adapted to run on macOS with Linuxbrew/Homebrew GNU coreutils: gdate, gawk, gsed, gtar)
bash unknown_clust.sh nmarit_all_aa.fasta nmarit_aa_all_cls
# Parse clusters
cp nmarit_aa_all_cls/2017_07/tmp/aaclust30_2017_07.tsv nmarit_aaclust30_2017_06.tsv
python parse_clusters.py -t nmarit_aaclust30_2017_06.tsv -o nmarit_aaclust30_2017_06_all -f nmarit_all_aa.fasta
# Plot results
./plot_mutants.r -a nmarit_aaclust30_2017_06_all_alns.tsv -s nmarit_aaclust30_2017_06_all.tsv -n nmarit
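# Optional sanity check before clustering (plain grep; header scheme as produced
# by generate_mutants.py, e.g. ">WP_xxx_90-sub_0" and ">WP_xxx_ref"):
# grep -c "_ref$" nmarit_all_aa.fasta     # number of seed sequences
# grep -c -- "-sub_" nmarit_all_aa.fasta  # number of substitution mutants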
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import argparse
import re
import subprocess
from tqdm import trange
def get_args():
    '''This function parses and returns the arguments passed in'''
    # Assign description to the help doc
    parser = argparse.ArgumentParser(
        description='Script uses EMBOSS msbar '
                    'to generate mutated AA sequences')
    # Add arguments
    parser.add_argument(
        '-g', '--genome',
        type=str,
        help='Genome file with amino acid sequences.',
        required=True)
    parser.add_argument(
        '-n', '--nmut',
        type=int,
        help='Number of mutated sequences per sequence.',
        required=True)
    # NOTE: -o is an editorial addition so the usage example's "-o nmarit" works
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Prefix for the output FASTA files.',
        required=True)
    # Parse and return all argument values
    args = parser.parse_args()
    return args.genome, args.nmut, args.output
def run_msbar(iden, mutrate, record, iter, muttype):
    rec = SeqRecord(record.seq,
                    id=record.id,
                    name=record.name,
                    description="")
    seq_len = len(rec.seq)
    # number of mutations to introduce = mutation rate * sequence length
    mut = int(mutrate * seq_len)
    ident = re.sub(r"\|", "-", rec.id) + "_" + str(iden) + "_" + str(iter)
    rec.id = ident
    rec.description = ""
    SeqIO.write(rec, "tmp.fasta", "fasta")
    # EMBOSS msbar point-mutation codes: 2 = insertions, 3 = deletions, 4 = changes
    command = ['msbar',
               '-sequence', 'tmp.fasta',
               '-count', str(mut),
               '-point', str(muttype),
               '-auto', '-stdout',
               '-block', "0"
               ]
    try:
        seqmut = subprocess.Popen(command, stdout=subprocess.PIPE)
    except OSError as e:
        # Popen raises OSError (not CalledProcessError) if msbar cannot be executed
        print(e)
        raise
    r = SeqIO.read(seqmut.stdout, "fasta")
    return r
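# Worked example of the count above (illustrative numbers, not from the script):
# a 300-residue protein run with mutrate=0.7 receives int(0.7 * 300) = 210 point
# mutations, so the "30" label targets roughly 30% residual identity.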
# The four wrappers below differ only in the msbar -point code; "record" is passed
# explicitly rather than read from the module-level loop variable.
def run_msbar_sub(record, file90, file50, file30, i):
    aa90sub = run_msbar(iden="90-sub", mutrate=0.1, record=record, muttype=4, iter=i)
    aa50sub = run_msbar(iden="50-sub", mutrate=0.5, record=record, muttype=4, iter=i)
    aa30sub = run_msbar(iden="30-sub", mutrate=0.7, record=record, muttype=4, iter=i)
    SeqIO.write(aa90sub, file90, "fasta")
    SeqIO.write(aa50sub, file50, "fasta")
    SeqIO.write(aa30sub, file30, "fasta")

def run_msbar_ins(record, file90, file50, file30, i):
    aa90ins = run_msbar(iden="90-ins", mutrate=0.1, record=record, muttype=2, iter=i)
    aa50ins = run_msbar(iden="50-ins", mutrate=0.5, record=record, muttype=2, iter=i)
    aa30ins = run_msbar(iden="30-ins", mutrate=0.7, record=record, muttype=2, iter=i)
    SeqIO.write(aa90ins, file90, "fasta")
    SeqIO.write(aa50ins, file50, "fasta")
    SeqIO.write(aa30ins, file30, "fasta")

def run_msbar_del(record, file90, file50, file30, i):
    aa90del = run_msbar(iden="90-del", mutrate=0.1, record=record, muttype=3, iter=i)
    aa50del = run_msbar(iden="50-del", mutrate=0.5, record=record, muttype=3, iter=i)
    aa30del = run_msbar(iden="30-del", mutrate=0.7, record=record, muttype=3, iter=i)
    SeqIO.write(aa90del, file90, "fasta")
    SeqIO.write(aa50del, file50, "fasta")
    SeqIO.write(aa30del, file30, "fasta")

def run_msbar_all(record, file90, file50, file30, i):
    aa90all = run_msbar(iden="90-all", mutrate=0.1, record=record, muttype="2,3,4", iter=i)
    aa50all = run_msbar(iden="50-all", mutrate=0.5, record=record, muttype="2,3,4", iter=i)
    aa30all = run_msbar(iden="30-all", mutrate=0.7, record=record, muttype="2,3,4", iter=i)
    SeqIO.write(aa90all, file90, "fasta")
    SeqIO.write(aa50all, file50, "fasta")
    SeqIO.write(aa30all, file30, "fasta")
if __name__ == "__main__":
    infile, n_mut, prefix = get_args()
    # Output names carry the -o prefix so globs like nmar*sub.fasta match
    output90sub = open(prefix + "_aa90_sub.fasta", "w")
    output50sub = open(prefix + "_aa50_sub.fasta", "w")
    output30sub = open(prefix + "_aa30_sub.fasta", "w")
    output90ins = open(prefix + "_aa90_ins.fasta", "w")
    output50ins = open(prefix + "_aa50_ins.fasta", "w")
    output30ins = open(prefix + "_aa30_ins.fasta", "w")
    output90del = open(prefix + "_aa90_del.fasta", "w")
    output50del = open(prefix + "_aa50_del.fasta", "w")
    output30del = open(prefix + "_aa30_del.fasta", "w")
    output90all = open(prefix + "_aa90_all.fasta", "w")
    output50all = open(prefix + "_aa50_all.fasta", "w")
    output30all = open(prefix + "_aa30_all.fasta", "w")
    for record in SeqIO.parse(infile, "fasta"):
        t = trange(n_mut, desc=record.id, leave=False)
        for i in t:
            run_msbar_sub(record=record, file90=output90sub,
                          file50=output50sub, file30=output30sub, i=i)
            run_msbar_ins(record=record, file90=output90ins,
                          file50=output50ins, file30=output30ins, i=i)
            run_msbar_del(record=record, file90=output90del,
                          file50=output50del, file30=output30del, i=i)
            run_msbar_all(record=record, file90=output90all,
                          file50=output50all, file30=output30all, i=i)
    for fh in (output90sub, output50sub, output30sub,
               output90ins, output50ins, output30ins,
               output90del, output50del, output30del,
               output90all, output50all, output30all):
        fh.close()
import argparse
from collections import defaultdict, OrderedDict, Counter
import csv
import re
from Bio import SeqIO
import subprocess
import os
import errno
import glob
import shutil
from statsmodels import robust
import pandas as pd
from tqdm import tqdm
def get_args():
    '''This function parses and returns the arguments passed in'''
    # Assign description to the help doc
    parser = argparse.ArgumentParser(
        description='Script to parse mmseqs tsv files')
    # Add arguments
    parser.add_argument(
        '-t', '--tsvfile',
        type=str,
        help='MMseqs2 tsv file',
        required=True)
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Output name',
        required=True)
    parser.add_argument(
        '-f', '--fastafile',
        type=str,
        help='Fasta file with ORFs used for clustering',
        required=True)
    # Parse and return all argument values
    args = parser.parse_args()
    return args.tsvfile, args.output, args.fastafile
class MyCounter(Counter):
    def __str__(self):
        return ",".join('{}:{}'.format(k, v) for k, v in self.items())
def create_db(f_file, outfile):
    command = ['mmseqs', 'createdb', f_file, outfile + '_seqDB']
    try:
        subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)

def search_db(outfile):
    tmp = outfile + '_tmp'
    os.mkdir(tmp)
    command = ['mmseqs',
               'search',
               outfile + '_seqDB',
               outfile + '_seqDB',
               outfile + '_seqDB.aln.db',
               tmp, '-a',
               '--comp-bias-corr', '0',
               '-e', '10',
               '--threads', '4'
               ]
    try:
        subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)

def get_alns(outfile):
    command = ['mmseqs',
               'convertalis',
               outfile + '_seqDB',
               outfile + '_seqDB',
               outfile + '_seqDB.aln.db',
               outfile + '_seqDB.aln.m8'
               ]
    try:
        subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)
def parse_alns(n_cls, values, outfile):
    df = pd.read_table(outfile + '_seqDB.aln.m8', header=None)
    mean = df.iloc[:, 2].mean()
    median = df.iloc[:, 2].median()
    mad = robust.mad(df.iloc[:, 2], c=1)
    df.insert(12, '', 'cls_' + str(n_cls))
    return df, mean, median, mad
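# Note on parse_alns: column 2 (0-based) of the BLAST-tab (.m8) table is the sequence
# identity, so mean/median/MAD summarize within-cluster identity; the label inserted
# at position 12 tags every alignment row with its cluster and, after to_csv writes
# the index column, is what plot_mutants.r reads as X14.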
def clean_db(filename):
    try:
        for fl in glob.glob(filename):
            os.remove(fl)
    except OSError as e:
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise the exception if a different error occurred

def clean_tmp(outfile):
    try:
        shutil.rmtree(outfile + '_tmp')
    except OSError as e:
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise the exception if a different error occurred
def mmseqs_seq_net(values, seq_dic, n_cls, outfile):
    # write the cluster members to a temporary FASTA file
    tf_name = outfile + "_cls_tmp.fasta"
    tmp_fasta = open(tf_name, "w")
    for k in values:
        SeqIO.write(seq_dic[k], tmp_fasta, "fasta")
    tmp_fasta.close()
    # clean leftovers from the previous cluster, then all-vs-all search the members
    clean_tmp(outfile)
    clean_db(outfile + "_seqDB*")
    create_db(tf_name, outfile)
    search_db(outfile)
    get_alns(outfile)
    df, mean, median, mad = parse_alns(n_cls, values, outfile)
    return df, mean, median, mad
def create_seq_dic(fasta_file):
    # SeqIO.index returns a dict-like, lazily loaded index of the FASTA records
    return SeqIO.index(fasta_file, "fasta")
if __name__ == "__main__":
    infile, outfile, fasta_file = get_args()
    d1 = defaultdict(list)
    sdic = create_seq_dic(fasta_file)
    n_cls = 0
    pd_all = []
    with open(infile) as f:
        for line in f:
            k = line.strip().split('\t')
            d1[k[0]].append(k[1])
    d1 = OrderedDict(sorted(d1.items(),
                            key=lambda (k, v): len(v),
                            reverse=True))
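    # clusters are processed from largest to smallest membership; the tuple-unpacking
    # lambda above (and filter/len below) keeps this a Python 2 script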
    rref = re.compile(r'.*_ref')
    rsub = re.compile(r'.*-sub.*')
    rins = re.compile(r'.*-ins.*')
    rdel = re.compile(r'.*-del.*')
    rall = re.compile(r'.*-all.*')
    rsub90 = re.compile(r'.*90-sub.*')
    rins90 = re.compile(r'.*90-ins.*')
    rdel90 = re.compile(r'.*90-del.*')
    rall90 = re.compile(r'.*90-all.*')
    rsub50 = re.compile(r'.*50-sub.*')
    rins50 = re.compile(r'.*50-ins.*')
    rdel50 = re.compile(r'.*50-del.*')
    rall50 = re.compile(r'.*50-all.*')
    rsub30 = re.compile(r'.*30-sub.*')
    rins30 = re.compile(r'.*30-ins.*')
    rdel30 = re.compile(r'.*30-del.*')
    rall30 = re.compile(r'.*30-all.*')
    with open(outfile + ".tsv", 'wb') as csv_file:
        writer = csv.writer(csv_file, delimiter="\t")
        header = ["cls_num", "num_mem", "is_rep_ref", "reference", "num_ref",
                  "num_sub", "num_ins", "num_del", "num_all", "num_diff", "diff_orf",
                  "mean_iden", "median_iden", "mad_iden",
                  "num_sub90", "num_ins90", "num_del90", "num_all90",
                  "num_sub50", "num_ins50", "num_del50", "num_all50",
                  "num_sub30", "num_ins30", "num_del30", "num_all30",
                  "members"]
        writer.writerow(header)
        for key, value in tqdm(d1.items(), desc='Processing', leave=False):
            n_cls = n_cls + 1
            nref = len(filter(rref.match, value))
            nsub = len(filter(rsub.match, value))
            nins = len(filter(rins.match, value))
            ndel = len(filter(rdel.match, value))
            nall = len(filter(rall.match, value))
            nsub90 = len(filter(rsub90.match, value))
            nins90 = len(filter(rins90.match, value))
            ndel90 = len(filter(rdel90.match, value))
            nall90 = len(filter(rall90.match, value))
            nsub50 = len(filter(rsub50.match, value))
            nins50 = len(filter(rins50.match, value))
            ndel50 = len(filter(rdel50.match, value))
            nall50 = len(filter(rall50.match, value))
            nsub30 = len(filter(rsub30.match, value))
            nins30 = len(filter(rins30.match, value))
            ndel30 = len(filter(rdel30.match, value))
            nall30 = len(filter(rall30.match, value))
            repres = bool(rref.match(key))
            counts = Counter([re.sub(r"_\d\d-\w\w\w_..*|_ref", "", m) for m in value])
            if len(value) > 1:
                df, mean, median, mad = mmseqs_seq_net(value, sdic, n_cls, outfile)
                pd_all.append(df)
            else:
                df = mean = median = mad = 'NA'
            writer.writerow(["cls_" + str(n_cls), len(value), repres, key, nref,
                             nsub, nins, ndel, nall, len(counts),
                             MyCounter(counts), mean, median, mad,
                             nsub90, nins90, ndel90, nall90,
                             nsub50, nins50, ndel50, nall50,
                             nsub30, nins30, ndel30, nall30,
                             ','.join(value)])
    df = pd.concat(pd_all, ignore_index=True)
    df.to_csv(outfile + "_alns.tsv", sep='\t')
import argparse
from collections import defaultdict, OrderedDict, Counter
import csv
import re
from Bio import SeqIO
import subprocess
import os
import errno
import glob
import shutil
from statsmodels import robust
import pandas as pd
from tqdm import tqdm
def get_args():
    '''This function parses and returns the arguments passed in'''
    # Assign description to the help doc
    parser = argparse.ArgumentParser(
        description='Script to parse mmseqs tsv files')
    # Add arguments
    parser.add_argument(
        '-t', '--tsvfile',
        type=str,
        help='MMseqs2 tsv file',
        required=True)
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Output name',
        required=True)
    parser.add_argument(
        '-f', '--fastafile',
        type=str,
        help='Fasta file with ORFs used for clustering',
        required=True)
    # Parse and return all argument values
    args = parser.parse_args()
    return args.tsvfile, args.output, args.fastafile
class MyCounter(Counter):
    def __str__(self):
        return ",".join('{}:{}'.format(k, v) for k, v in self.items())
def create_db(f_file, outfile):
    command = ['mmseqs', 'createdb', f_file, outfile + '_seqDB']
    try:
        subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)

def search_db(outfile):
    tmp = outfile + '_tmp'
    os.mkdir(tmp)
    command = ['mmseqs',
               'search',
               outfile + '_seqDB',
               outfile + '_seqDB',
               outfile + '_seqDB.aln.db',
               tmp, '-a',
               '--comp-bias-corr', '0',
               '-e', '10',
               '--threads', '4'
               ]
    try:
        subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)

def get_alns(outfile):
    command = ['mmseqs',
               'convertalis',
               outfile + '_seqDB',
               outfile + '_seqDB',
               outfile + '_seqDB.aln.db',
               outfile + '_seqDB.aln.m8'
               ]
    try:
        subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)
def parse_alns(n_cls, values, outfile):
    df = pd.read_table(outfile + '_seqDB.aln.m8', header=None)
    mean = df.iloc[:, 2].mean()
    median = df.iloc[:, 2].median()
    mad = robust.mad(df.iloc[:, 2], c=1)
    df.insert(12, '', 'cls_' + str(n_cls))
    return df, mean, median, mad
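# As in the fuller variant above: column 2 of the .m8 table is the sequence identity,
# and the appended column carries the cluster label for downstream plotting.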
def clean_db(filename):
    try:
        for fl in glob.glob(filename):
            os.remove(fl)
    except OSError as e:
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise the exception if a different error occurred

def clean_tmp(outfile):
    try:
        shutil.rmtree(outfile + '_tmp')
    except OSError as e:
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise the exception if a different error occurred
def mmseqs_seq_net(values, seq_dic, n_cls, outfile):
    # write the cluster members to a temporary FASTA file
    tf_name = outfile + "_cls_tmp.fasta"
    tmp_fasta = open(tf_name, "w")
    for k in values:
        SeqIO.write(seq_dic[k], tmp_fasta, "fasta")
    tmp_fasta.close()
    # clean leftovers from the previous cluster, then all-vs-all search the members
    clean_tmp(outfile)
    clean_db(outfile + "_seqDB*")
    create_db(tf_name, outfile)
    search_db(outfile)
    get_alns(outfile)
    df, mean, median, mad = parse_alns(n_cls, values, outfile)
    return df, mean, median, mad
def create_seq_dic(fasta_file):
    # SeqIO.index returns a dict-like, lazily loaded index of the FASTA records
    return SeqIO.index(fasta_file, "fasta")
if __name__ == "__main__":
    infile, outfile, fasta_file = get_args()
    d1 = defaultdict(list)
    sdic = create_seq_dic(fasta_file)
    n_cls = 0
    pd_all = []
    with open(infile) as f:
        for line in f:
            k = line.strip().split('\t')
            d1[k[0]].append(k[1])
    d1 = OrderedDict(sorted(d1.items(),
                            key=lambda (k, v): len(v),
                            reverse=True))
    with open(outfile + ".tsv", 'wb') as csv_file:
        writer = csv.writer(csv_file, delimiter="\t")
        header = ["cls_num", "num_mem", "reference",
                  "num_diff", "diff_orf",
                  "mean_iden", "median_iden", "mad_iden",
                  "members"]
        writer.writerow(header)
        for key, value in tqdm(d1.items(), desc='Processing', leave=False):
            n_cls = n_cls + 1
            counts = Counter([re.sub(r"_\d\d-\w\w\w_..*|_ref", "", m) for m in value])
            if len(value) > 1:
                df, mean, median, mad = mmseqs_seq_net(value, sdic, n_cls, outfile)
                pd_all.append(df)
            else:
                df = mean = median = mad = 'NA'
            writer.writerow(["cls_" + str(n_cls), len(value), key, len(counts),
                             MyCounter(counts), mean, median, mad,
                             ','.join(value)])
    df = pd.concat(pd_all, ignore_index=True)
    df.to_csv(outfile + "_alns.tsv", sep='\t')
#!/usr/bin/env Rscript
suppressMessages(library("optparse", quietly = TRUE))
suppressMessages(library(tidyverse, quietly = TRUE))
suppressMessages(library(igraph, quietly = TRUE))
suppressMessages(library(pbmcapply, quietly = TRUE))
suppressMessages(library(cowplot, quietly = TRUE))
options(warn = -1)

option_list = list(
  make_option(c("-a", "--aln"), type="character", default=NULL,
              help="cluster alignment file", metavar="character"),
  make_option(c("-s", "--stats"), type="character", default=NULL,
              help="cluster statistic file", metavar="character"),
  make_option(c("-n", "--namefile"), type="character", default=NULL,
              help="Filename to save", metavar="character")
)

opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)

if (is.null(opt$aln) || is.null(opt$stats) || is.null(opt$namefile)){
  print_help(opt_parser)
  stop("All three arguments (--aln, --stats, --namefile) must be supplied\n", call.=FALSE)
}
binary_search_filter_1 <- function(g = NULL, low = NULL, high = NULL, w = NULL) {
  x <- g
  if (high < low) {
    return(NULL)
  } else {
    mid <- floor((low + high) / 2)
    x <- delete.edges(g, which(E(g)$weight <= w[mid]))
    # cat("low:", low, " mid:", mid, " high:", high, " mid_weight:", w[mid], " components:", count_components(x), " edges:", ecount(x), "\n")
    if ((mid - low) == 0) {
      x <- delete.edges(g, which(E(g)$weight < w[mid - 1]))
      return(list(weight = mid - 1, graph = x))
    }
    if ((high - low) == 0) {
      x <- delete.edges(g, which(E(g)$weight < w[mid + 1]))
      return(list(weight = mid, graph = x))
    }
    if (!is.connected(x)) {
      binary_search_filter_1(g, low, mid - 1, w)
    } else {
      binary_search_filter_1(g, mid + 1, high, w)
    }
  }
}
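# binary_search_filter_1 appears to binary-search the sorted unique edge weights for
# the highest threshold that still leaves the graph connected: edges at or below the
# candidate weight are dropped, and the search recurses left or right depending on
# whether the filtered graph remains connected.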
get_refs <- function(X = X, aln = aln, data = data){
  if ((data %>% filter(cls_num == X) %>% .$num_mem >= 2) & (data %>% filter(cls_num == X) %>% .$num_ref > 0)) {
    ref <- data %>% filter(cls_num == X) %>% .$reference
    l <- aln %>%
      filter(cls == X) %>%
      graph_from_data_frame(directed = FALSE) %>%
      simplify(remove.multiple = TRUE, remove.loops = TRUE, edge.attr.comb = "max")
    weights <- unique(sort(E(l)$weight, decreasing = FALSE))
    if (length(weights) > 0){
      l <- binary_search_filter_1(g = l, low = 1, high = length(weights), w = weights)$graph
      s <- strength(l) %>% sort %>% tail(1) %>% names()
      b <- betweenness(l, weights = 1/E(l)$weight, directed = FALSE, normalized = TRUE) %>% sort %>% tail(1) %>% names()
      p <- page_rank(l, weights = E(l)$weight, directed = FALSE)$vector %>% sort %>% tail(1) %>% names()
      e <- eigen_centrality(l, weights = E(l)$weight, directed = FALSE)$vector %>% sort %>% tail(1) %>% names()
      c <- closeness(l, weights = 1/E(l)$weight, normalized = TRUE) %>% sort %>% tail(1) %>% names()
      d <- data.frame(cls = X, cl_ref = ref, s_ref = s, b_ref = b, pr_ref = p, e_ref = e, c_ref = c, d = graph.density(l))
      return(d)
    } else {
      d <- data.frame(cls = X, cl_ref = ref, s_ref = NA, b_ref = NA, pr_ref = NA, e_ref = NA, c_ref = NA, d = NA)
      return(d)
    }
  }
}
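# For each cluster that contains a seed, get_refs records the top-ranked node under
# five centrality measures (strength, betweenness, PageRank, eigenvector, closeness),
# so the last plot can ask how often each measure recovers a seed as the cluster
# representative, compared with the MMseqs2 representative itself.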
suppressMessages(df <- read_tsv(opt$stats, col_names = T))
suppressMessages(alns <- read_tsv(opt$aln, col_names = F))

alns <- alns %>%
  select(X2, X3, X4, X14) %>%
  rename(node1 = X2, node2 = X3, weight = X4, cls = X14) %>%
  filter(!is.na(cls))

# Size of clusters and mix
cat("Number of clusters:", df %>% .$cls_num %>% length,
    "\nNumber of singletons:", df %>% filter(num_mem == 1) %>% .$cls_num %>% length,
    "\nNumber of clusters > 1 members:", df %>% filter(num_mem > 1) %>% .$cls_num %>% length, "\n")
df_sing <- df %>%
  filter(num_mem == 1) %>%
  select(starts_with('num')) %>%
  select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>%
  gather() %>%
  mutate(key = gsub('num_', "", key)) %>%
  mutate(Identity = paste(gsub('ins|sub|del|all', "", key), "%", sep = ""), type = gsub('90|50|30', "", key)) %>%
  select(-key) %>%
  group_by(Identity, type) %>%
  summarise(N = sum(value)) %>%
  mutate(class = "Singletons")
df_no_ref <- df %>%
  filter(num_mem > 1, num_ref < 1) %>%
  select(starts_with('num')) %>%
  select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>%
  gather() %>%
  mutate(key = gsub('num_', "", key)) %>%
  mutate(Identity = paste(gsub('ins|sub|del|all', "", key), "%", sep = ""), type = gsub('90|50|30', "", key)) %>%
  select(-key) %>%
  group_by(Identity, type) %>%
  summarise(N = sum(value)) %>%
  mutate(class = "Without seeds")
df_ref <- df %>%
  filter(num_mem > 1, num_ref > 0) %>%
  select(starts_with('num')) %>%
  select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>%
  gather() %>%
  mutate(key = gsub('num_', "", key)) %>%
  mutate(Identity = paste(gsub('ins|sub|del|all', "", key), "%", sep = ""), type = gsub('90|50|30', "", key)) %>%
  select(-key) %>%
  group_by(Identity, type) %>%
  summarise(N = sum(value)) %>%
  mutate(class = "With seeds")

df_mut <- bind_rows(df_sing, df_no_ref, df_ref)
df_mut$Identity <- plyr::mapvalues(df_mut$Identity, from = c("90%", "50%", "30%"), to = c("10%", "50%", "70%"))
df_mut$Identity <- factor(df_mut$Identity, levels = c("10%", "50%", "70%"))
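# The 90/50/30 tags name target identities; the legend reports the complementary
# fraction of modified residues (10/50/70%), hence the relabelling above.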
p1 <- ggplot(df_mut %>% filter(class == "Singletons"), aes(type, N, fill = Identity)) +
  geom_bar(stat = "identity") +
  theme_light() +
  ylab("Number of ORFs") +
  xlab("Type of mutation") +
  facet_wrap(~class) +
  scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00")) +
  guides(fill = guide_legend(title = "Modified\nresidues"))

p2 <- ggplot(df_mut %>% filter(class == "Without seeds"), aes(type, N, fill = Identity)) +
  geom_bar(stat = "identity") +
  theme_light() +
  ylab("Number of ORFs") +
  xlab("Type of mutation") +
  facet_wrap(~class) +
  scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00"))

p3 <- ggplot(df_mut %>% filter(class == "With seeds"), aes(type, N, fill = Identity)) +
  geom_bar(stat = "identity") +
  theme_light() +
  ylab("Number of ORFs") +
  xlab("Type of mutation") +
  facet_wrap(~class) +
  scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00"))

prow <- plot_grid(p1 + theme(legend.position = "none"),
                  p2 + theme(legend.position = "none"),
                  p3 + theme(legend.position = "none"),
                  align = 'vh',
                  labels = c("A", "B", "C"),
                  hjust = -1,
                  nrow = 1)
legend <- get_legend(p1)
p <- plot_grid(prow, legend, rel_widths = c(3, .3))

fname <- paste(opt$namefile, "_seed.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, width = 11, height = 3, units = "in", device = "pdf")
p1 <- bind_rows(df %>%
                  filter(num_mem > 1, num_ref < 1) %>% mutate(class = "Without seeds"),
                df %>%
                  filter(num_mem > 1, num_ref >= 1) %>% mutate(class = "With seeds")) %>%
  ggplot(aes(num_diff, fill = class)) +
  geom_bar(alpha = 0.8) +
  theme_light() +
  xlab("Different ORFs per cluster") +
  ylab("Number of clusters") +
  theme(legend.title = element_blank()) +
  scale_fill_manual(values = c("#C84359", "#4A4A4A"))

p2 <- bind_rows(df %>%
                  filter(num_mem > 1, num_ref < 1) %>% mutate(class = "Without seeds"),
                df %>%
                  filter(num_mem > 1, num_ref >= 1) %>% mutate(class = "With seeds")) %>%
  ggplot(aes(median_iden, fill = class)) +
  geom_density(alpha = 0.8) +
  theme_light() +
  ylab("Density") +
  xlab("Median similarity") +
  theme(legend.title = element_blank()) +
  scale_fill_manual(values = c("#C84359", "#4A4A4A"))

prow <- plot_grid(p1 + theme(legend.position = "none"),
                  p2 + theme(legend.position = "none"),
                  align = 'vh',
                  labels = c("A", "B"),
                  hjust = -1,
                  nrow = 1)
legend <- get_legend(p1)
p <- plot_grid(prow, legend, rel_widths = c(3, .5))

fname <- paste(opt$namefile, "_medsim.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, width = 11, height = 3, units = "in", device = "pdf")
k <- pbmclapply(X = df$cls_num, get_refs, aln = alns, data = df, mc.cores = 4, ignore.interactive = TRUE)
k <- bind_rows(k) %>% tbl_df()

d <- data_frame(
  MMSEQS = k %>% filter(grepl("ref", cl_ref)) %>% .$cls %>% length(),
  Strength = k %>% filter(grepl("ref", s_ref)) %>% .$cls %>% length(),
  Betweenness = k %>% filter(grepl("ref", b_ref)) %>% .$cls %>% length(),
  Pagerank = k %>% filter(grepl("ref", pr_ref)) %>% .$cls %>% length(),
  Eigenvector = k %>% filter(grepl("ref", e_ref)) %>% .$cls %>% length(),
  Closeness = k %>% filter(grepl("ref", c_ref)) %>% .$cls %>% length()
) %>% gather %>% mutate(Proportion = value/dim(k)[1])
d$key <- factor(d$key, levels = d %>% arrange(desc(value)) %>% .$key)
p <- ggplot(d, aes(key, Proportion)) +
  geom_bar(stat = "identity", fill = "#4A4A4A") +
  theme_light() +
  xlab("Representative method") +
  geom_text(aes(label = value), hjust = 0.5, vjust = -.3, size = 3) +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Seed sequences as cluster representative") +
  theme(plot.title = element_text(size = 10, face = "bold"))

fname <- paste(opt$namefile, "_repmet.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, device = "pdf", width = 8, height = 4, units = "in")

#k %>% mutate(same_ref = ifelse(cl_ref == e_ref, TRUE, FALSE)) %>% group_by(same_ref) %>% count
# k %>%
#   ggplot(aes(d)) +
#   geom_density(fill = "#4A4A4A") +
#   theme_light() +
#   ylab("Density") +
#   xlab("Graph density")
#!/bin/bash
set -e
set -x

#[ -z "$MMDIR" ] && echo "Please set the environment variable \$MMDIR to your MMSEQS installation directory." && exit 1;
[ "$#" -lt 2 ] && echo "Please provide <sequenceDB> <outDir>" && exit 1;
[ ! -f "$1" ] && echo "Sequence database $1 not found!" && exit 1;
[ -d "$2" ] && echo "Output directory $2 exists already!" && exit 1;

function abspath() {
    if [ -d "$1" ]; then
        (cd "$1"; pwd)
    elif [ -f "$1" ]; then
        if [[ $1 == */* ]]; then
            echo "$(cd "${1%/*}"; pwd)/${1##*/}"
        else
            echo "$(pwd)/$1"
        fi
    fi
}
export OMP_NUM_THREADS=4
MMSEQSEXE="mmseqs"
# gdate/gawk/gsed/gtar are the GNU tools from Homebrew/Linuxbrew coreutils
RELEASE="${3:-$(gdate "+%Y_%m")}"
SHORTRELEASE="${4:-$(gdate "+%y%m")}"

INPUT=$1
OUTDIR=$2/$RELEASE
TMPDIR=$OUTDIR/tmp
mkdir -p $TMPDIR
OUTDIR=$(abspath $OUTDIR)
TMPDIR=$(abspath $TMPDIR)

# COMMON and HHLIB may be set in the environment; left unset they expand to nothing
PREFILTER_COMMON="$COMMON --diag-score 1 --min-ungapped-score 15 --spaced-kmer-mode 1 --max-seq-len 32768 --split-mode 1"
PREFILTER0_PAR="--max-seqs 20 --comp-bias-corr 0 --k-score 145 ${PREFILTER_COMMON}"
PREFILTER1_PAR="--max-seqs 100 --comp-bias-corr 1 --k-score 135 ${PREFILTER_COMMON}"
PREFILTER2_PAR="--max-seqs 300 --comp-bias-corr 1 --k-score 90 -k 6 ${PREFILTER_COMMON}"
ALIGNMENT_COMMON="$COMMON -e 0.001 --max-seq-len 32768 --max-rejected 2147483647"
ALIGNMENT0_PAR="--max-seqs 20 -c 0.9 --alignment-mode 2 --min-seq-id 0.9 --comp-bias-corr 0 ${ALIGNMENT_COMMON}"
ALIGNMENT1_PAR="--max-seqs 100 -c 0.9 --alignment-mode 2 --min-seq-id 0.9 --comp-bias-corr 1 ${ALIGNMENT_COMMON}"
ALIGNMENT2_PAR="--max-seqs 300 -c 0.8 --alignment-mode 3 --min-seq-id 0.3 --comp-bias-corr 1 ${ALIGNMENT_COMMON}"
CLUSTER0_PAR="--cluster-mode 2"
CLUSTER1_PAR="--cluster-mode 0"
CLUSTER2_PAR="--cluster-mode 0"
SEARCH_PAR="$COMMON --profile --k-score 100"
CSTRANSLATE_PAR="-x 0.3 -c 4 -A $HHLIB/data/cs219.lib -D $HHLIB/data/context_data.lib -I ca3m -f -b"
SEQUENCE_DB="$OUTDIR/aaprot_db"
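# The steps below follow the MMseqs2 cascaded-clustering recipe: a fast hash-based
# redundancy filter first, two prefilter/align/clust rounds at ~90% identity, then
# one deep search whose alignments are reused for both the 50% (identity-filtered)
# and the 30% clusterings before the merged results are exported.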
printf "###############################\n"
printf "#### Creating DB\n"
printf "###############################\n"
#export OMP_PROC_BIND=true
export OMP_NUM_THREADS=4
# we split all sequences that are above 14k residues into N/14k parts
mmseqs createdb "$INPUT" "${SEQUENCE_DB}" --max-seq-len 32768
gdate --rfc-3339=seconds

printf "###############################\n"
printf "#### Filtering redundancy\n"
printf "###############################\n"
# filter redundancy
INPUT="${SEQUENCE_DB}"
mmseqs clusthash $INPUT "$TMPDIR/aln_redundancy" --min-seq-id 0.9 --threads 4
gdate --rfc-3339=seconds
mmseqs clust $INPUT "$TMPDIR/aln_redundancy" "$TMPDIR/clu_redundancy" ${CLUSTER1_PAR} --threads 4
gdate --rfc-3339=seconds
gawk '{ print $1 }' "$TMPDIR/clu_redundancy.index" > "$TMPDIR/order_redundancy"
mmseqs createsubdb "$TMPDIR/order_redundancy" $INPUT "$TMPDIR/input_step0"
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Cluster at 90%% \n"
printf "###############################\n"
#export OMP_NUM_THREADS=16
# go down to 90% and merge fragments (accept a fragment if dbcov >= 0.95 && seqId >= 0.9)
STEP=0
INPUT="$TMPDIR/input_step0"
mmseqs prefilter "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" ${PREFILTER0_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT0_PAR} --threads 4
gdate --rfc-3339=seconds
export OMP_NUM_THREADS=4
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_step$STEP" ${CLUSTER0_PAR} --threads 4
gdate --rfc-3339=seconds
gawk '{ print $1 }' "$TMPDIR/clu_step$STEP.index" > "$TMPDIR/order_step$STEP"
mmseqs createsubdb "$TMPDIR/order_step$STEP" $INPUT "$TMPDIR/input_step1"
gdate --rfc-3339=seconds
#export OMP_NUM_THREADS=16
# go down to 90% (this step is needed to create big clusters)
STEP=1
INPUT="$TMPDIR/input_step1"
mmseqs prefilter "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" ${PREFILTER1_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT1_PAR} --threads 4
gdate --rfc-3339=seconds
#export OMP_NUM_THREADS=64
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_step$STEP" ${CLUSTER1_PAR} --threads 4
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Merge redundancy to 90%% clustering\n"
printf "###############################\n"
# create the 90% aaclust database (we need to merge redundancy, step_0 and step_1)
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust90_$RELEASE \
    "$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1
gdate --rfc-3339=seconds
gawk '{ print $1 }' "$TMPDIR/clu_step$STEP.index" > "$TMPDIR/order_step$STEP"
mmseqs createsubdb "$TMPDIR/order_step$STEP" $INPUT "$TMPDIR/input_step2"

#export OMP_NUM_THREADS=16
# now we cluster down to 30% sequence id to produce a 30% and a 50% clustering
STEP=2
INPUT=$TMPDIR/input_step2
gdate --rfc-3339=seconds
mmseqs prefilter $INPUT $INPUT "$TMPDIR/pref_step$STEP" ${PREFILTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align $INPUT $INPUT "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT2_PAR} --threads 4
gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Cluster at 50%%\n"
printf "###############################\n"
#export OMP_NUM_THREADS=64
# cluster down to 50%
mmseqs filterdb "$TMPDIR/aln_step$STEP" "$TMPDIR/aln_aaclust50" \
    --filter-column 3 --filter-regex '(0\.[5-9][0-9]{2}|1\.000)' --threads 4
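# the regex keeps alignment rows whose identity column reads 0.500-0.999 or 1.000,
# i.e. only pairs at >= 50% sequence identity survive into the 50% clustering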
gdate --rfc-3339=seconds
mmseqs clust $INPUT "$TMPDIR/aln_aaclust50" "$TMPDIR/clu_aaclust50" ${CLUSTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust50_$RELEASE \
    "$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1 $TMPDIR/clu_aaclust50
gdate --rfc-3339=seconds

printf "###############################\n"
printf "#### Cluster at 30%%\n"
printf "###############################\n"
# cluster down to 30%
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_aaclust30" ${CLUSTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust30_$RELEASE \
    "$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1 $TMPDIR/clu_aaclust30
gdate --rfc-3339=seconds

printf "###############################\n"
printf "#### Prepare output\n"
printf "###############################\n"
#export OMP_NUM_THREADS=16
# generate the final aaclust output: the _seed and _consensus FASTA files and the .tsv
# also generate the profiles needed for the aaboost
for i in 30 50 90; do
    mmseqs result2profile "${SEQUENCE_DB}" "${SEQUENCE_DB}" "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_profile" --threads 16
    ln -sf "${SEQUENCE_DB}_h" "$TMPDIR/aaclust${i}_${RELEASE}_profile_h"
    ln -sf "${SEQUENCE_DB}_h.index" "$TMPDIR/aaclust${i}_${RELEASE}_profile_h.index"
    ln -sf "${SEQUENCE_DB}_h" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus_h"
    ln -sf "${SEQUENCE_DB}_h.index" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus_h.index"
    #fixme: won't work with updating
    mmseqs mergedbs "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_seed" "$OUTDIR/aaprot_db_h" "$OUTDIR/aaprot_db" --prefixes ">"
    rm -f "$TMPDIR/aaclust${i}_${RELEASE}_seed.index"
    gsed -i 's/\x0//g' "$TMPDIR/aaclust${i}_${RELEASE}_seed"
    mmseqs summarizeheaders "${SEQUENCE_DB}_h" "${SEQUENCE_DB}_h" "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_summary" --summary-prefix "${i}-${SHORTRELEASE}"
    mmseqs mergedbs "$OUTDIR/aaclust${i}_$RELEASE" "$TMPDIR/aaclust${i}_${RELEASE}_consensus" "$TMPDIR/aaclust${i}_${RELEASE}_summary" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus" --prefixes ">
"
    rm -f "$TMPDIR/aaclust${i}_${RELEASE}_consensus.index"
    gsed -i 's/\x0//g' "$TMPDIR/aaclust${i}_${RELEASE}_consensus"
    mmseqs createtsv "${SEQUENCE_DB}" "${SEQUENCE_DB}" "$OUTDIR/aaclust${i}_$RELEASE" "$TMPDIR/aaclust${i}_$RELEASE.tsv"
    mv -f "$TMPDIR/aaclust${i}_${RELEASE}_seed" "$TMPDIR/aaclust${i}_${RELEASE}_seed.fasta"
    mv -f "$TMPDIR/aaclust${i}_${RELEASE}_consensus" "$TMPDIR/aaclust${i}_${RELEASE}_consensus.fasta"
    gtar -cv --use-compress-program=pigz --show-transformed-names --transform "s|${TMPDIR:1}/|aaclust${i}_${RELEASE}/|g" -f "$OUTDIR/aaclust${i}_${RELEASE}.tar.gz" "$TMPDIR/aaclust${i}_$RELEASE.tsv" "$TMPDIR/aaclust${i}_${RELEASE}_consensus.fasta" "$TMPDIR/aaclust${i}_${RELEASE}_seed.fasta"
done