@genomewalker · Last active: March 5, 2018
Exploration of the ubiquitous EUs

This is not related to the niche breadth analysis; we simply use the EUs that are found in every sample.
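
The vector clstrs_comp_eu_ubi used below is not defined in this gist; it presumably holds the ids of the components detected in every sample. A minimal sketch of that selection (in Python for illustration; the input file and column names are assumptions):

import pandas as pd

# hypothetical long-format table with one row per (component, sample) observation
obs = pd.read_csv("component_by_sample.tsv", sep="\t")

# a component is ubiquitous if it is seen in as many samples as exist
n_samples = obs["sample"].nunique()
clstrs_comp_eu_ubi = (
    obs.groupby("component")["sample"].nunique()
       .loc[lambda n: n == n_samples]
       .index.tolist()
)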

Exporting the clusters belonging to the components found in all samples

library(readr)
library(dplyr)

# keep only the clusters whose component is ubiquitous and write out their names
super_cl <- read_tsv("/Users/ufo/Downloads/all_cluster_components.tsv", col_names = TRUE, trim_ws = TRUE) %>%
  filter(component %in% clstrs_comp_eu_ubi) %>%
  select(clstr_name) %>%
  write_tsv(path = "~/Downloads/eu_core_comps.tsv", col_names = FALSE)

We will use the consensus sequences for the analyses

# extract the consensus sequences of the selected clusters from the ffindex database
~/opt/ffindex_mg/bin/ffindex_get marine_hmp_db_03112017_eu_cons.ffdata marine_hmp_db_03112017_eu_cons.ffindex $(cat eu_core_comps.tsv) > eu_core_comps.fasta

First, we check that they are not spurious using AntiFam:

# -Z 441329 fixes the effective database size used for the e-value calculation
hmmsearch --cpu 32 -Z 441329 --domtblout eu_spur.tblout -o eu_spur.log AntiFam.hmm eu_core_comps.fasta

Then we parse the results, keeping hits with an e-value < 1e-5 and an HMM coverage >= 0.6:

# columns kept: HMM name, HMM length, target, target length, i-Evalue, hmm_from, hmm_to, ali_from, ali_to
grep -v '^#' eu_spur.tblout |
  awk '{print $4,$6,$1,$3,$13,$16,$17,$18,$19}' |
  sed 's/ /\t/g' |
  # resolve overlapping domain hits per target, keeping the one with the lower e-value
  perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_."\n";}}' |
  # keep hits below the e-value cutoff and append the HMM coverage as the last column
  E=1e-5 perl -e 'while(<>){chomp;@a=split(/\t/,$_);if(($a[-1]-$a[-2])>80){print $_,"\t",($a[-3]-$a[-4])/$a[1],"\n" if $a[4]<$ENV{E};}else{print $_,"\t",($a[-3]-$a[-4])/$a[1],"\n" if $a[4]<$ENV{E};}}' |
  # and keep only hits with coverage >= 0.6
  awk '$NF >= 0.6' > eu_ubiq_spurious_sign.tblout
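
The one-liner above is dense, so here is a Python sketch that re-implements the same logic for readability (the shell pipeline remains the version actually run): per target sequence, consecutive overlapping domain hits are resolved by keeping the one with the lower i-Evalue, and surviving hits are kept if the i-Evalue is below 1e-5 and the HMM coverage is at least 0.6.

#!/usr/bin/env python3
import sys
from collections import defaultdict, namedtuple

Hit = namedtuple("Hit", "hmm qlen target tlen ievalue hmm_from hmm_to ali_from ali_to")

hits = defaultdict(list)
with open(sys.argv[1]) as fh:            # eu_spur.tblout
    for line in fh:
        if line.startswith("#"):
            continue
        f = line.split()
        # domtblout columns (1-based): 1 target, 3 tlen, 4 query HMM, 6 qlen,
        # 13 i-Evalue, 16-17 HMM coordinates, 18-19 alignment coordinates
        h = Hit(f[3], int(f[5]), f[0], int(f[2]), float(f[12]),
                int(f[15]), int(f[16]), int(f[17]), int(f[18]))
        if h.ali_from == h.ali_to:       # skip zero-length alignments
            continue
        hits[h.target].append(h)

for target in sorted(hits):
    doms = hits[target]
    i = 0
    while i < len(doms) - 1:
        a, b = doms[i], doms[i + 1]
        overlap = a.ali_to - b.ali_from
        # hits overlapping by more than half of either alignment: keep the better e-value
        if overlap > 0 and (overlap / (a.ali_to - a.ali_from) > 0.5 or
                            overlap / (b.ali_to - b.ali_from) > 0.5):
            del doms[i + 1 if a.ievalue < b.ievalue else i]
        else:
            i += 1
    for h in doms:
        coverage = (h.hmm_to - h.hmm_from) / h.qlen
        if h.ievalue < 1e-5 and coverage >= 0.6:
            print("\t".join(map(str, h)), coverage, sep="\t")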

# collect the ids of the spurious candidates
cut -f3 eu_ubiq_spurious_sign.tblout | sort -u > eu_ubiq_spurious_sign_ids.txt

# and remove them from the fasta file (filterbyname.sh is part of BBMap)
filterbyname.sh in=eu_core_comps.fasta out=eu_core_comps_no_spr.fasta names=eu_ubiq_spurious_sign_ids.txt include=f ignorejunk

Remote homologies using HMM-HMM searches

We are going to check which clusters have remote homologs by running hhblits against the Uniclust database.

# build an ffindex database from the fasta file
~/opt/ffindex_mg/bin/ffindex_from_fasta -s eu_core_comps_no_spr.ffdata eu_core_comps_no_spr.ffindex eu_core_comps_no_spr.fasta

# run hhblits.sh (below) on every entry of the database
mpirun -np ${NSLOTS} /home/afernand/opt/ffindex_mg/bin/ffindex_apply_mpi \
  /bioinf/home/afernand/SANDBOX/jackhmmer/eu_core_comps_no_spr.ff{data,index} -- hhblits.sh

# collect the per-entry .hhr results into a single ffindex database
~/opt/ffindex_mg/bin/ffindex_build eu_core_comps_r.ffdata eu_core_comps_r.ffindex results/

# parse each result with hh_parser.sh (below)
${OPENMPI_HOME}/bin/mpirun -np 16 ~/opt/ffindex_mg/bin/ffindex_apply_mpi -d eu_core_comps_parsed.ffdata -i eu_core_comps_parsed.ffindex eu_core_comps_r.ff{data,index} -- ./hh_parser.sh | pv -l | wc -l

# strip the ffindex NUL separators and collect the ids of the clusters with a hit
sed -e 's/\x0//g' eu_core_comps_parsed.ffdata | cut -f1 | sort -u > eu_core_comps_hom.ids
filterbyname.sh in=eu_core_comps_no_spr.fasta out=eu_core_comps_no_spr_hom.fasta names=eu_core_comps_hom.ids include=f
# count how many clusters have an Uncharacterized/Hypothetical protein as best hit
sed -e 's/\x0//g' eu_core_comps_parsed.ffdata | awk '!a[$1]++' | grep -i -c 'Uncharacterized\|Hypothetical'

Assign taxonomy using Kaiju

We will use Kaiju to see whether we can classify some of our clusters using protein-level matches instead of the profile searches above. We run it in greedy mode to get better sensitivity while keeping good precision.

# -p: input sequences are proteins; -e 5: allowed mismatches in greedy mode
./bin/kaiju -z 32 -t nodes.dmp -f kaiju_db_nr_euk.fmi -i ../eu_core_comps_no_spr_hom.fasta -o k.out -p -a greedy -e 5
# count classified (C) vs unclassified (U) entries
cut -f1 k.out | sort | uniq -c
# append the full taxon names to the classifications
./bin/addTaxonNames -p -t nodes.dmp -n names.dmp -i k.out -o k.rep
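
To get a quick overview of which taxa the classified clusters map to, one can tally the taxon paths in k.rep; a small sketch, assuming the default layout where the taxon name is the last tab-separated field:

#!/usr/bin/env python3
from collections import Counter

# count the taxon paths of the classified ("C") entries
taxa = Counter(line.rstrip("\n").split("\t")[-1]
               for line in open("k.rep") if line.startswith("C"))
for name, count in taxa.most_common(10):
    print(count, name, sep="\t")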

Results

Initial clusters: 6,587
Flagged as spurious by AntiFam: 250
With HHblits hits against Uniclust: 4,823 (3,811 of them have a Hypothetical/Uncharacterized best hit)
Classified by Kaiju: 81

That leaves 6,587 - 250 - 4,823 - 81 = 1,433 clusters with no trace in any of the databases.
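
A sketch of this bookkeeping with the id files produced above (the Kaiju ids are pulled from k.out, where classified entries start with "C"):

#!/usr/bin/env python3
def ids(path):
    with open(path) as fh:
        return {line.strip() for line in fh if line.strip()}

initial  = ids("eu_core_comps.tsv")                 # 6,587 clusters
spurious = ids("eu_ubiq_spurious_sign_ids.txt")     # 250 AntiFam hits
homologs = ids("eu_core_comps_hom.ids")             # 4,823 HHblits hits
classified = {line.split("\t")[1] for line in open("k.out")
              if line.startswith("C")}              # 81 Kaiju classifications

no_trace = initial - spurious - homologs - classified
print(len(no_trace))                                # expected: 1,433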

hh_parser.sh

#!/bin/bash
# read the .hhr result that ffindex_apply_mpi passes on stdin
FILE=$(perl -ne 'print $_')
# keep hits with probability > 90 and trim the template description at the "OS=" tag
python hh_reader.py <(echo "$FILE") | awk '$2 > 90' | awk -F"OS=" '{$0=$1}1'
hh_reader.py

#!/usr/bin/env python
"""
Parser for hhr result files created with hhblits|hhsearch|hhalign -o <hhr_file>
"""
import sys
from collections import namedtuple

__author__ = 'Markus Meier ([email protected])'
__version__ = '1.0'
__license__ = "GPL-3"

hhr_alignment = namedtuple('hhr_alignment', ['query_id', 'query_length', 'query_neff',
                                             'template_id', 'template_length', 'template_info',
                                             'template_neff', 'query_ali', 'template_ali',
                                             'start', 'end', 'probability', 'evalue', 'score',
                                             'aligned_cols', 'identity', 'similarity', 'sum_probs'])


class HHRFormatError(Exception):
    def __init__(self, value):
        self.value = "ERROR: " + value

    def __str__(self):
        return repr(self.value)


def get_sequence_name(header):
    name = header.replace(">", "").split()[0]
    return name


def parse_result(lines):
    results = []

    query_id = None
    query_length = None
    query_neff = None
    query_seq = []
    template_id = None
    template_length = None
    template_seq = []
    template_info = None
    query_start = None
    query_end = None
    template_start = None
    template_end = None
    probability = None
    evalue = None
    score = None
    identity = None
    similarity = None
    template_neff = None
    sum_probs = None
    aligned_cols = None

    skipped_ali_tags = ["ss_dssp", "ss_pred", "Consensus"]
    is_alignment_section = False

    for line in lines:
        if line.startswith("Query"):
            query_id = line.split()[1]
        elif line.startswith("Match_columns"):
            query_length = int(line.split()[1])
        elif line.startswith("Neff"):
            query_neff = float(line.split()[1])
        elif is_alignment_section and (line.startswith("No") or line.startswith("Done!")):
            # a new hit (or the end of the file) closes the previous alignment
            if query_start is not None:
                result = hhr_alignment(query_id, query_length, query_neff,
                                       template_id, template_length, template_info, template_neff,
                                       "".join(query_seq), "".join(template_seq),
                                       (query_start, template_start),
                                       (query_end, template_end), probability, evalue, score,
                                       aligned_cols, identity, similarity, sum_probs)
                results.append(result)
            template_id = None
            template_info = None
            query_seq = []
            template_seq = []
            query_start = None
            query_end = None
            template_start = None
            template_end = None
        elif line.startswith("Probab"):
            tokens = line.split()
            probability = float(tokens[0].split("=")[1])
            evalue = float(tokens[1].split("=")[1])
            score = float(tokens[2].split("=")[1])
            aligned_cols = int(tokens[3].split("=")[1])
            identity = float(tokens[4].split("=")[1].replace("%", "")) / 100.0
            similarity = float(tokens[5].split("=")[1])
            sum_probs = float(tokens[6].split("=")[1])
            if len(tokens) > 7:
                template_neff = float(tokens[7].split("=")[1])
            continue
        elif line.startswith(">"):
            is_alignment_section = True
            template_id = line[1:].split()[0]
            template_info = line
        elif line.startswith("Q"):
            tokens = line.split()
            if tokens[1] in skipped_ali_tags:
                continue
            try:
                token_2 = int(tokens[2].replace("(", "").replace(")", ""))
            except ValueError:
                raise HHRFormatError(("Converting failure of start index ({}) "
                                      "of query alignment").format(tokens[2]))
            if query_start is None:
                query_start = token_2
            query_start = min(query_start, token_2)
            try:
                token_4 = int(tokens[4].replace("(", "").replace(")", ""))
            except ValueError:
                raise HHRFormatError(("Converting failure of end index ({}) "
                                      "of query alignment").format(tokens[4]))
            if query_end is None:
                query_end = token_4
            query_end = max(query_end, token_4)
            query_seq.append(tokens[3])
        elif line.startswith("T"):
            tokens = line.split()
            if tokens[1] in skipped_ali_tags:
                continue
            template_seq.append(tokens[3])
            try:
                token_2 = int(tokens[2].replace("(", "").replace(")", ""))
            except ValueError:
                raise HHRFormatError(("Converting failure of start index ({}) "
                                      "of template alignment").format(tokens[2]))
            if template_start is None:
                template_start = token_2
            template_start = min(template_start, token_2)
            try:
                token_4 = int(tokens[4].replace("(", "").replace(")", ""))
            except ValueError:
                raise HHRFormatError(("Converting failure of end index ({}) "
                                      "of template alignment").format(tokens[4]))
            if template_end is None:
                template_end = token_4
            template_end = max(template_end, token_4)
            # the template length is the parenthesized value in the last column
            try:
                token_5 = int(tokens[5].replace("(", "").replace(")", ""))
            except ValueError:
                raise HHRFormatError(("Converting failure of template length ({}) "
                                      "in template alignment").format(tokens[5]))
            template_length = token_5

    # flush the last alignment of the file
    if template_id is not None and query_start is not None:
        result = hhr_alignment(query_id, query_length, query_neff,
                               template_id, template_length, template_info, template_neff,
                               "".join(query_seq), "".join(template_seq),
                               (query_start, template_start),
                               (query_end, template_end), probability, evalue, score,
                               aligned_cols, identity, similarity, sum_probs)
        results.append(result)

    return results


def read_result(input_file):
    with open(input_file) as fh:
        lines = fh.readlines()
        return parse_result(lines)


def main():
    # print one line per hit: query id, probability, e-value and template description
    for result in read_result(sys.argv[1]):
        sys.stdout.write(result.query_id + "\t" + str(result.probability) + "\t" +
                         str(result.evalue) + "\t" +
                         result.template_info)


if __name__ == "__main__":
    main()
hhblits.sh

#!/bin/bash
# toolchain and MPI runtime required by hhblits on the cluster
GCC_HOME=/bioinf/software/gcc/gcc-4.9
OPENMPI_HOME=/bioinf/software/openmpi/openmpi-1.8
PATH=${GCC_HOME}/bin:${OPENMPI_HOME}/bin:$PATH
export PATH
LD_LIBRARY_PATH=${GCC_HOME}/lib64:${OPENMPI_HOME}/lib:"${HOME}"/opt/igraph-0.7.1_mg/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH
export HHLIB=$HOME/opt/hhsuite_mg
export PATH=$PATH:$HHLIB/bin:$HHLIB/scripts
# ffindex_apply_mpi feeds each entry on stdin and sets FFINDEX_ENTRY_NAME
hhblits -i stdin -o /bioinf/home/afernand/SANDBOX/jackhmmer/results/"${FFINDEX_ENTRY_NAME}".hhr -cpu 2 -n 2 -d /bioinf/home/afernand/SANDBOX/uniclust/uniclust30_2017_10/uniclust30_2017_10