azalea · February 2, 2014 21:14
diff --git a/split_then_pairwise_blast.py b/split_then_pairwise_blast.py
 '''This code tried to do things the hard way to solve: http://www.biostars.org/p/92205/ by
 first, the function split_fasta() splits the original fasta file into files containing individual sequence records;
 second, blast_seq_the_hard_way() iterates over these individual files and feeds their filenames to BLAST.
 Please note that BLAST can handle multiple sequences in -query and -subject at the same time, so there is no need
 to split the files. This code is just for demonstration purposes.
 '''
 import os
 import re

 from Bio.Blast.Applications import NcbiblastpCommandline
 from Bio.Blast import NCBIXML
 from Bio import SeqIO
 from Bio.Blast import NCBIWWW


 def split_fasta(input_file):
    '''Splits input_file into separate files, each containing one sequence record.'''
    # The following two lines are used for the output filenames to keep each individual sequence
    basename = os.path.basename(input_file)
    filename_without_extension, extension = os.path.splitext(basename)

    for i, record in enumerate(SeqIO.parse(input_file, 'fasta')):
        # To generate output filenames called filename_0.fsa, filename_1.fsa, etc
        outputfile = '%s_%d%s'%(filename_without_extension, i, extension)
        SeqIO.write(record, outputfile, 'fasta')

    # To keep track of the input_file for future use
    return filename_without_extension, extension


 def blast_seq_the_hard_way(SC_Fasta, HS_Fasta):
    input_file_sc, sc_extension = split_fasta(SC_Fasta)
    input_file_hs, hs_extension = split_fasta(HS_Fasta)
    blastp = "C:\\Program Files\\NCBI\\blast-2.2.29+\\bin\\blastp"

    # Find the files filename_0.fsa etc
    for file_sc in [f for f in os.listdir(os.getcwd()) if re.search(r'^%s_\d+%s$'%(input_file_sc, sc_extension), f)]:
        for file_hs in [f for f in os.listdir(os.getcwd()) if re.search(r'^%s_\d+%s$'%(input_file_hs, hs_extension), f)]:
            print file_hs
            blast_output_file = '%s_%s_blast.xml'%(file_sc, file_hs)
            # I provide an "out" parameter here to save the blast output in the "blast_output_file"
            NcbiblastpCommandline(blastp, query=file_sc, subject=file_hs, outfmt=5, out=blast_output_file)()
            # Now you can read the blast output from the file and process it however you want
            # blast_result_record = NCBIXML.read(open(blast_output_file))

 blast_seq_the_hard_way('sc.fsa', 'hsap.fsa')
	'''This code tried to do things the hard way to solve: http://www.biostars.org/p/92205/ by
	first, the function split_fasta() splits the original fasta file into files containing individual sequence records;
	second, blast_seq_the_hard_way() iterates over these individual files and feeds their filenames to BLAST.
	Please note that BLAST can handle multiple sequences in -query and -subject at the same time, so there is no need
	to split the files. This code is just for demonstration purposes.
	'''
	import os
	import re

	from Bio.Blast.Applications import NcbiblastpCommandline
	from Bio.Blast import NCBIXML
	from Bio import SeqIO
	from Bio.Blast import NCBIWWW


	def split_fasta(input_file):
	'''Splits input_file into separate files, each containing one sequence record.'''
	# The following two lines are used for the output filenames to keep each individual sequence
	basename = os.path.basename(input_file)
	filename_without_extension, extension = os.path.splitext(basename)

	for i, record in enumerate(SeqIO.parse(input_file, 'fasta')):
	# To generate output filenames called filename_0.fsa, filename_1.fsa, etc
	outputfile = '%s_%d%s'%(filename_without_extension, i, extension)
	SeqIO.write(record, outputfile, 'fasta')

	# To keep track of the input_file for future use
	return filename_without_extension, extension


	def blast_seq_the_hard_way(SC_Fasta, HS_Fasta):
	input_file_sc, sc_extension = split_fasta(SC_Fasta)
	input_file_hs, hs_extension = split_fasta(HS_Fasta)
	blastp = "C:\\Program Files\\NCBI\\blast-2.2.29+\\bin\\blastp"

	# Find the files filename_0.fsa etc
	for file_sc in [f for f in os.listdir(os.getcwd()) if re.search(r'^%s_\d+%s$'%(input_file_sc, sc_extension), f)]:
	for file_hs in [f for f in os.listdir(os.getcwd()) if re.search(r'^%s_\d+%s$'%(input_file_hs, hs_extension), f)]:
	print file_hs
	blast_output_file = '%s_%s_blast.xml'%(file_sc, file_hs)
	# I provide an "out" parameter here to save the blast output in the "blast_output_file"
	NcbiblastpCommandline(blastp, query=file_sc, subject=file_hs, outfmt=5, out=blast_output_file)()
	# Now you can read the blast output from the file and process it however you want
	# blast_result_record = NCBIXML.read(open(blast_output_file))

	blast_seq_the_hard_way('sc.fsa', 'hsap.fsa')