tbrittoborges · January 29, 2019 08:37
diff --git a/interproscan_run.py b/interproscan_run.py
 #!/usr/bin/env python
 # __author__ = 'tbrittoborges'

 import subprocess
 import tempfile

 import sys
 import utils

 # Load the sequences to submit. The code below iterates `seqs` for ids and
 # indexes it by id, so it is presumably a mapping of id -> sequence
 # (verify against utils.fasta_parser).
 seqs = utils.fasta_parser('.fasta')
 # Submit every sequence to the InterProScan client, one job at a time.
 for pid in seqs:
    output_file = 'interproscan/{}'.format(pid)
    # Create a temp file holding a single FASTA record. Text mode lets the
    # str payload be written on Python 3 as well, and leaving `delete` at its
    # default removes the file as soon as the job below has finished.
    with tempfile.NamedTemporaryFile(mode='w') as temp:
        temp.write(">{}\n{}".format(pid, seqs[pid]))
        # Flush so the child process reads the complete record from disk.
        temp.flush()
        # launch the job
        # NOTE(review): the e-mail value looks like a scraping placeholder;
        # confirm the real address before running.
        p = subprocess.Popen(["python",  "/iprscan5_soappy.py",
                              "--goterms", "--pathways",
                              "--email", "[email protected]",
                              '--sequence', temp.name, "--outformat", "tsv",
                              '--outfile', output_file],
                             stdout=subprocess.PIPE)
        # Drain the client's output (so it cannot stall on a full pipe) and
        # wait for it to exit before the temp file is deleted.
        p.communicate()
        # Report failed jobs instead of silently moving on.
        if p.returncode != 0:
            sys.stderr.write(
                "InterProScan job for {} exited with status {}\n".format(
                    pid, p.returncode))
diff --git a/interproscan_summary.py b/interproscan_summary.py
 #!/usr/bin/env python
 __author__ = 'tbrittoborges'
 """
 Created on 11:13 19/09/2014 2014 

 """

 import os
 import csv
 import pandas as pd
 # Configure pandas display settings for interactive inspection.
 # NOTE(review): `display.mpl_style` was deprecated and, as far as I know,
 # removed in later pandas releases, where this assignment would raise an
 # error -- confirm against the pandas version in use.
 pd.options.display.mpl_style = "default"
 # Show up to 50 columns when a DataFrame is printed.
 pd.options.display.max_columns = 50

 #https://code.google.com/p/interproscan/wiki/InterProScan5OutputFormats
 # Protein Accession (e.g. P51587)
 # Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579)
 # Sequence Length (e.g. 3418)
 # Analysis (e.g. Pfam / PRINTS / Gene3D)
 # Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140)
 # Signature Description (e.g. BRCA2 repeat profile)
 # Start location
 # Stop location
 # Score - is the e-value of the match reported by member database method
 # (e.g. 3.1E-52)
 # Status - is the status of the match (T: true)
 # Date - is the date of the run
 # (InterPro annotations - accession (e.g. IPR002093) - optional column;
 # only displayed if -iprscan option is switched on)
 # (InterPro annotations - description (e.g. BRCA2 repeat) - optional column;
 # only displayed if -iprscan option is switched on)
 # (GO annotations (e.g. GO:0005515) - optional column;
 # only displayed if --goterms option is switched on)
 # (Pathways annotations (e.g. REACT_71) - optional column;
 # only displayed if --pathways option is switched on)


 # Directory holding the per-protein InterProScan TSV results.
 interproscan_path = "path/to/results/"
 # Accumulates every row of every result file.
 f_list = []
 # Column labels, in the order of the TSV fields described above.
 names = ['pid', 'md5', 'len', 'analysis', 'signature', 'description', 'start',
         'end', 'score', 'status', 'date', 'ips_id', 'ips_des', 'go', 'pathway']
 for f_name in os.listdir(interproscan_path):
    # Only visible ".txt" files are treated as result files.
    if f_name.endswith(".txt") and not f_name.startswith('.'):
        # Parse from the handle managed by `with` so the file is closed
        # deterministically (the file used to be opened a second time and
        # that second handle was never closed).
        with open(os.path.join(interproscan_path, f_name)) as f_handle:
            f_list.extend(csv.reader(f_handle, delimiter='\t'))
 # One row per reported match, across all parsed files.
 df = pd.DataFrame(f_list, columns=names)
	#!/usr/bin/env python
	# __author__ = 'tbrittoborges'

	import subprocess
	import tempfile

	import sys
	import utils

	# Load the sequences to submit. The code below iterates `seqs` for ids and
	# indexes it by id, so it is presumably a mapping of id -> sequence
	# (verify against utils.fasta_parser).
	seqs = utils.fasta_parser('.fasta')
	# Submit every sequence to the InterProScan client, one job at a time.
	for pid in seqs:
	    output_file = 'interproscan/{}'.format(pid)
	    # Create a temp file holding a single FASTA record. Text mode lets the
	    # str payload be written on Python 3 as well, and leaving `delete` at
	    # its default removes the file as soon as the job below has finished.
	    with tempfile.NamedTemporaryFile(mode='w') as temp:
	        temp.write(">{}\n{}".format(pid, seqs[pid]))
	        # Flush so the child process reads the complete record from disk.
	        temp.flush()
	        # launch the job
	        # NOTE(review): the e-mail value looks like a scraping placeholder;
	        # confirm the real address before running.
	        p = subprocess.Popen(["python", "/iprscan5_soappy.py",
	                              "--goterms", "--pathways",
	                              "--email", "[email protected]",
	                              '--sequence', temp.name, "--outformat", "tsv",
	                              '--outfile', output_file],
	                             stdout=subprocess.PIPE)
	        # Drain the client's output (so it cannot stall on a full pipe) and
	        # wait for it to exit before the temp file is deleted.
	        p.communicate()
	        # Report failed jobs instead of silently moving on.
	        if p.returncode != 0:
	            sys.stderr.write(
	                "InterProScan job for {} exited with status {}\n".format(
	                    pid, p.returncode))
	#!/usr/bin/env python
	__author__ = 'tbrittoborges'
	"""
	Created on 11:13 19/09/2014 2014

	"""

	import os
	import csv
	import pandas as pd
	# Configure pandas display settings for interactive inspection.
	# NOTE(review): `display.mpl_style` was deprecated and, as far as I know,
	# removed in later pandas releases, where this assignment would raise an
	# error -- confirm against the pandas version in use.
	pd.options.display.mpl_style = "default"
	# Show up to 50 columns when a DataFrame is printed.
	pd.options.display.max_columns = 50

	#https://code.google.com/p/interproscan/wiki/InterProScan5OutputFormats
	# Protein Accession (e.g. P51587)
	# Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579)
	# Sequence Length (e.g. 3418)
	# Analysis (e.g. Pfam / PRINTS / Gene3D)
	# Signature Accession (e.g. PF09103 / G3DSA:2.40.50.140)
	# Signature Description (e.g. BRCA2 repeat profile)
	# Start location
	# Stop location
	# Score - is the e-value of the match reported by member database method
	# (e.g. 3.1E-52)
	# Status - is the status of the match (T: true)
	# Date - is the date of the run
	# (InterPro annotations - accession (e.g. IPR002093) - optional column;
	# only displayed if -iprscan option is switched on)
	# (InterPro annotations - description (e.g. BRCA2 repeat) - optional column;
	# only displayed if -iprscan option is switched on)
	# (GO annotations (e.g. GO:0005515) - optional column;
	# only displayed if --goterms option is switched on)
	# (Pathways annotations (e.g. REACT_71) - optional column;
	# only displayed if --pathways option is switched on)


	# Directory holding the per-protein InterProScan TSV results.
	interproscan_path = "path/to/results/"
	# Accumulates every row of every result file.
	f_list = []
	# Column labels, in the order of the TSV fields described above.
	names = ['pid', 'md5', 'len', 'analysis', 'signature', 'description', 'start',
	         'end', 'score', 'status', 'date', 'ips_id', 'ips_des', 'go', 'pathway']
	for f_name in os.listdir(interproscan_path):
	    # Only visible ".txt" files are treated as result files.
	    if f_name.endswith(".txt") and not f_name.startswith('.'):
	        # Parse from the handle managed by `with` so the file is closed
	        # deterministically (the file used to be opened a second time and
	        # that second handle was never closed).
	        with open(os.path.join(interproscan_path, f_name)) as f_handle:
	            f_list.extend(csv.reader(f_handle, delimiter='\t'))
	# One row per reported match, across all parsed files.
	df = pd.DataFrame(f_list, columns=names)