jrjhealey · July 5, 2017 21:06
diff --git a/tabulateHHpred.py b/tabulateHHpred.py
 # -*- coding: utf-8 -*-

 """
 This script takes the .hhr files output by HHSuite and
 turns the quite verbose file in to a fully tabulated
 version with all the fields separated one, one line per
 file. Thus, the file can be viewed simply in Excel etc.
 It requires the non-standard pandas module.
 """

 # This program is free software: you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
 # published by the Free Software Foundation, either version 3 of
 # the License, or (at your option) any later version.

 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.

 # The license for HHSuite remains with the authors. Please consult
 # their licensce agreements before using this software, and ensure
 # they are cited for any use. Run the script with -b to print rel-
 # evant references.


 import os
 import sys
 import traceback
 import warnings

 __author__ = "Joe R. J. Healey"
 __version__ = "1.0.0"
 __title__ = "tabulateHHpred"
 __license__ = "GPLv3"
 __author_email__ = "[email protected]"

 template = \
 u"""
 ---|------|------------------------|----|-------|-------|------|-----|----|---------|--------------|
 No Hit    Short Desc               Prob E-value P-value  Score    SS Cols Query HMM  Template HMM
 """


 def hhparse(hhresult_file, verbose):
 	'''Convert HHpred's text-based output table in to a pandas dataframe'''
 	from io import StringIO
 	import pandas as pd

 	pattern = StringIO(template).readlines()[1]
 	colBreaks = [i for i, ch in enumerate(pattern) if ch == '|']
 	widths = [j-i for i, j in zip( ([0]+colBreaks)[:-1], colBreaks ) ]

 	hhtable = pd.read_fwf(hhresult_file, skiprows=8, nrows=10, header=0, widths = widths)
 	if verbose is True:
 		print(hhtable)

 	top_hit = str(hhtable.loc[0,'Hit'])[0:4]
 	top_hit_full = hhtable.loc[0,'Hit']
 	top_prob = hhtable.loc[0,'Prob']
 	top_eval = hhtable.loc[0,'E-value']
 	top_pval = hhtable.loc[0,'P-value']
 	top_score = hhtable.loc[0,'Score']

 	if verbose is True:
 		print("Your best hit: (PDB ID | Probability | E-Value | P-Value | Score)")
        	print("\t" +  str(top_hit) + "\t" + str(top_prob) + "\t" + str(top_eval) + "\t" + str(top_pval) + "\t" + str(top_score) )
 	
 	return top_hit, top_hit_full, top_prob, top_eval, top_pval, top_score

 def getFullDesc(hhresult_file,top_hit_full, verbose):
 	with open(hhresult_file, 'r') as hrh:
 		for line in hrh:
 			if line.startswith('>' + top_hit_full):
 				full_desc = line.rstrip('\n')

 	if verbose is True:
 		print full_desc

 	return full_desc

 def getDOI(top_hit):
 	"""Query the PDB REST API to get an associated DOI/Publication"""
 	import requests
 	try:
 		query = requests.get("https://www.ebi.ac.uk/pdbe/api/pdb/entry/publications/" + str(top_hit))
 		qjson = query.json()
 		doi = qjson[top_hit][0]['doi']

 		if not doi:
 			doi = "No DOI found."

 	except KeyError:
 		doi = "Key error. ID likely deprecated."

 	return doi

 def displayRefs():
    """Display relevant references"""
    print('''
    - The following publications pertain to HHSuite:
 	- Alva V., Nam SZ., Söding J., Lupas AN. (2016)
 	  The MPI bioinformatics Toolkit as an integrative platform for advanced protein sequence and structure analysis.
 	  Nucleic Acids Res. pii: gkw348. PMID: 27131380
 	
 	- Remmert M., Biegert A., Hauser A., Söding J. (2011) HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
 	  Nat Methods. 9(2):173-5. doi: 10.1038/nmeth.1818. PMID: 22198341
    
 	- Söding J., Biegert A., Lupas AN. (2005) The HHpred interactive server for protein homology detection and structure prediction.
 	  Nucleic Acids Res 33 (Web Server issue), W244-W248. PMID: 15980461
 	
 	- Söding J. (2005) Protein homology detection by HMM-HMM comparison.
 	  Bioinformatics 21: 951-960. PMID: 15531603
 	
 	- Hildebrand A., Remmert A., Biegert A., Söding J. (2009) Fast and accurate automatic structure prediction with HHpred.
 	  Proteins 77(Suppl 9):128-32. doi: 10.1002/prot.22499. PMID: 19626712
 	
 	- Meier A., Söding J. (2015) Automatic Prediction of Protein 3D Structures by Probabilistic Multi-template Homology Modeling.
 	  PLoS Comput Biol. 11(10):e1004343. doi: 10.1371/journal.pcbi.1004343. PMID: 26496371
 	''')
    sys.exit(1)

 def parseArgs():
 	"""Parse commandline arguments"""

 	import argparse
 	try:
   		parser = argparse.ArgumentParser(description='This script converts HHpreds verbose output in to a full table for subsequent analysis.')
 		parser.add_argument(
 			'-i',
 			'--infile',
 			action='store',
 			help='The HHpred output file to parse.')
 		parser.add_argument(
 			'-v',
 			'--verbose',
 			action='store_true',
 			help='Print additional messages to screen. Default behaviour is false, only the result would be printed to screen for piping etc.')
 		parser.add_argument(
 			'-o',
 			'--outfile',
 			action='store',
 			default='None',
 			help='Output file name to store results in. If none provided, the default will be infile.tsv.')			
 		parser.add_argument(
 			'-b',
 			'--bibliography',
 			action='store_true',
 			help='Display references.')
 		parser.add_argument(
 			'-d',
 			'--doi',
 			action='store_true',
 			help='Try to retrieve relevant publications from PDB RESTful API. Warning, this will significantly slow down the processing of results.')
 		args = parser.parse_args()
 		
 	except:
 		print "An exception occured with argument parsing. Check your provided options."
 		traceback.print_exc()

 	return parser.parse_args()

 def main():
 	"""Make function calls and handle arguments for tabulateHHpred.py"""
 	args = parseArgs()

        if args.bibliography is True:
                displayRefs()

 	verbose = args.verbose
 	hhresult_file = args.infile
 	
 	indir = os.path.dirname(os.path.abspath(hhresult_file))

 	split = os.path.splitext(args.infile)
 	basename = os.path.basename(split[0])
 	
 	if args.outfile is 'None':
 		outfile = indir + '/' + basename + '.tsv'
 	else:
 		outfile = args.outfile

 # Main code begins:
 	
 	top_hit, top_hit_full, top_prob, top_eval, top_pval, top_score = hhparse(hhresult_file, verbose)
 	full_desc = getFullDesc(hhresult_file, top_hit_full, verbose)
 	
 	row = (basename + "\t" + str(top_hit) + "\t" + str(top_hit_full) + "\t" + str(top_prob) + "\t" + str(top_eval) + "\t" + str(top_pval) + "\t" + str(top_score) + "\t" + full_desc + "\n")

 	if args.doi is True:
 		doi = getDOI(top_hit)
 		row = (row.rstrip("\n") + "\t" + doi + "\n")
 		
 		with open(outfile, 'w') as ofh:
 			ofh.write(row)
 		print(row)

 	elif args.doi is False:
 		with open(outfile, 'w') as ofh:
 			ofh.write(row)
 		print(row)


 if __name__ == '__main__':
 	main()
	# -- coding: utf-8 --

	"""
	This script takes the .hhr files output by HHSuite and
	turns the quite verbose file in to a fully tabulated
	version with all the fields separated one, one line per
	file. Thus, the file can be viewed simply in Excel etc.
	It requires the non-standard pandas module.
	"""

	# This program is free software: you can redistribute it and/or
	# modify it under the terms of the GNU General Public License as
	# published by the Free Software Foundation, either version 3 of
	# the License, or (at your option) any later version.

	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.

	# The license for HHSuite remains with the authors. Please consult
	# their licensce agreements before using this software, and ensure
	# they are cited for any use. Run the script with -b to print rel-
	# evant references.


	import os
	import sys
	import traceback
	import warnings

	__author__ = "Joe R. J. Healey"
	__version__ = "1.0.0"
	__title__ = "tabulateHHpred"
	__license__ = "GPLv3"
	__author_email__ = "[email protected]"

	template = \
	u"""
	---\|------\|------------------------\|----\|-------\|-------\|------\|-----\|----\|---------\|--------------\|
	No Hit Short Desc Prob E-value P-value Score SS Cols Query HMM Template HMM
	"""


	def hhparse(hhresult_file, verbose):
	'''Convert HHpred's text-based output table in to a pandas dataframe'''
	from io import StringIO
	import pandas as pd

	pattern = StringIO(template).readlines()[1]
	colBreaks = [i for i, ch in enumerate(pattern) if ch == '\|']
	widths = [j-i for i, j in zip( ([0]+colBreaks)[:-1], colBreaks ) ]

	hhtable = pd.read_fwf(hhresult_file, skiprows=8, nrows=10, header=0, widths = widths)
	if verbose is True:
	print(hhtable)

	top_hit = str(hhtable.loc[0,'Hit'])[0:4]
	top_hit_full = hhtable.loc[0,'Hit']
	top_prob = hhtable.loc[0,'Prob']
	top_eval = hhtable.loc[0,'E-value']
	top_pval = hhtable.loc[0,'P-value']
	top_score = hhtable.loc[0,'Score']

	if verbose is True:
	print("Your best hit: (PDB ID \| Probability \| E-Value \| P-Value \| Score)")
	print("\t" + str(top_hit) + "\t" + str(top_prob) + "\t" + str(top_eval) + "\t" + str(top_pval) + "\t" + str(top_score) )

	return top_hit, top_hit_full, top_prob, top_eval, top_pval, top_score

	def getFullDesc(hhresult_file,top_hit_full, verbose):
	with open(hhresult_file, 'r') as hrh:
	for line in hrh:
	if line.startswith('>' + top_hit_full):
	full_desc = line.rstrip('\n')

	if verbose is True:
	print full_desc

	return full_desc

	def getDOI(top_hit):
	"""Query the PDB REST API to get an associated DOI/Publication"""
	import requests
	try:
	query = requests.get("https://www.ebi.ac.uk/pdbe/api/pdb/entry/publications/" + str(top_hit))
	qjson = query.json()
	doi = qjson[top_hit][0]['doi']

	if not doi:
	doi = "No DOI found."

	except KeyError:
	doi = "Key error. ID likely deprecated."

	return doi

	def displayRefs():
	"""Display relevant references"""
	print('''
	- The following publications pertain to HHSuite:
	- Alva V., Nam SZ., Söding J., Lupas AN. (2016)
	The MPI bioinformatics Toolkit as an integrative platform for advanced protein sequence and structure analysis.
	Nucleic Acids Res. pii: gkw348. PMID: 27131380

	- Remmert M., Biegert A., Hauser A., Söding J. (2011) HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
	Nat Methods. 9(2):173-5. doi: 10.1038/nmeth.1818. PMID: 22198341

	- Söding J., Biegert A., Lupas AN. (2005) The HHpred interactive server for protein homology detection and structure prediction.
	Nucleic Acids Res 33 (Web Server issue), W244-W248. PMID: 15980461

	- Söding J. (2005) Protein homology detection by HMM-HMM comparison.
	Bioinformatics 21: 951-960. PMID: 15531603

	- Hildebrand A., Remmert A., Biegert A., Söding J. (2009) Fast and accurate automatic structure prediction with HHpred.
	Proteins 77(Suppl 9):128-32. doi: 10.1002/prot.22499. PMID: 19626712

	- Meier A., Söding J. (2015) Automatic Prediction of Protein 3D Structures by Probabilistic Multi-template Homology Modeling.
	PLoS Comput Biol. 11(10):e1004343. doi: 10.1371/journal.pcbi.1004343. PMID: 26496371
	''')
	sys.exit(1)

	def parseArgs():
	"""Parse commandline arguments"""

	import argparse
	try:
	parser = argparse.ArgumentParser(description='This script converts HHpreds verbose output in to a full table for subsequent analysis.')
	parser.add_argument(
	'-i',
	'--infile',
	action='store',
	help='The HHpred output file to parse.')
	parser.add_argument(
	'-v',
	'--verbose',
	action='store_true',
	help='Print additional messages to screen. Default behaviour is false, only the result would be printed to screen for piping etc.')
	parser.add_argument(
	'-o',
	'--outfile',
	action='store',
	default='None',
	help='Output file name to store results in. If none provided, the default will be infile.tsv.')
	parser.add_argument(
	'-b',
	'--bibliography',
	action='store_true',
	help='Display references.')
	parser.add_argument(
	'-d',
	'--doi',
	action='store_true',
	help='Try to retrieve relevant publications from PDB RESTful API. Warning, this will significantly slow down the processing of results.')
	args = parser.parse_args()

	except:
	print "An exception occured with argument parsing. Check your provided options."
	traceback.print_exc()

	return parser.parse_args()

	def main():
	"""Make function calls and handle arguments for tabulateHHpred.py"""
	args = parseArgs()

	if args.bibliography is True:
	displayRefs()

	verbose = args.verbose
	hhresult_file = args.infile

	indir = os.path.dirname(os.path.abspath(hhresult_file))

	split = os.path.splitext(args.infile)
	basename = os.path.basename(split[0])

	if args.outfile is 'None':
	outfile = indir + '/' + basename + '.tsv'
	else:
	outfile = args.outfile

	# Main code begins:

	top_hit, top_hit_full, top_prob, top_eval, top_pval, top_score = hhparse(hhresult_file, verbose)
	full_desc = getFullDesc(hhresult_file, top_hit_full, verbose)

	row = (basename + "\t" + str(top_hit) + "\t" + str(top_hit_full) + "\t" + str(top_prob) + "\t" + str(top_eval) + "\t" + str(top_pval) + "\t" + str(top_score) + "\t" + full_desc + "\n")

	if args.doi is True:
	doi = getDOI(top_hit)
	row = (row.rstrip("\n") + "\t" + doi + "\n")

	with open(outfile, 'w') as ofh:
	ofh.write(row)
	print(row)

	elif args.doi is False:
	with open(outfile, 'w') as ofh:
	ofh.write(row)
	print(row)


	if __name__ == '__main__':
	main()
No results found