Skip to content

Instantly share code, notes, and snippets.

@azalea
Created February 2, 2014 21:14
Show Gist options
  • Save azalea/8775085 to your computer and use it in GitHub Desktop.
Save azalea/8775085 to your computer and use it in GitHub Desktop.
This code tried to do things the hard way to solve: http://www.biostars.org/p/92205/ by first, the function split_fasta() splits the original fasta file into files containing individual sequence records; second, blast_seq_the_hard_way() iterates over these individual files and feeds their filenames to BLAST. Please note that BLAST can handle mul…
'''This code tried to do things the hard way to solve: http://www.biostars.org/p/92205/ by
first, the function split_fasta() splits the original fasta file into files containing individual sequence records;
second, blast_seq_the_hard_way() iterates over these individual files and feeds their filenames to BLAST.
Please note that BLAST can handle multiple sequences in -query and -subject at the same time, so there is no need
to split the files. This code is just for demonstration purposes.
'''
import os
import re
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Blast import NCBIXML
from Bio import SeqIO
from Bio.Blast import NCBIWWW
def split_fasta(input_file):
'''Splits input_file into separate files, each containing one sequence record.'''
# The following two lines are used for the output filenames to keep each individual sequence
basename = os.path.basename(input_file)
filename_without_extension, extension = os.path.splitext(basename)
for i, record in enumerate(SeqIO.parse(input_file, 'fasta')):
# To generate output filenames called filename_0.fsa, filename_1.fsa, etc
outputfile = '%s_%d%s'%(filename_without_extension, i, extension)
SeqIO.write(record, outputfile, 'fasta')
# To keep track of the input_file for future use
return filename_without_extension, extension
def blast_seq_the_hard_way(SC_Fasta, HS_Fasta):
input_file_sc, sc_extension = split_fasta(SC_Fasta)
input_file_hs, hs_extension = split_fasta(HS_Fasta)
blastp = "C:\\Program Files\\NCBI\\blast-2.2.29+\\bin\\blastp"
# Find the files filename_0.fsa etc
for file_sc in [f for f in os.listdir(os.getcwd()) if re.search(r'^%s_\d+%s$'%(input_file_sc, sc_extension), f)]:
for file_hs in [f for f in os.listdir(os.getcwd()) if re.search(r'^%s_\d+%s$'%(input_file_hs, hs_extension), f)]:
print file_hs
blast_output_file = '%s_%s_blast.xml'%(file_sc, file_hs)
# I provide an "out" parameter here to save the blast output in the "blast_output_file"
NcbiblastpCommandline(blastp, query=file_sc, subject=file_hs, outfmt=5, out=blast_output_file)()
# Now you can read the blast output from the file and process it however you want
# blast_result_record = NCBIXML.read(open(blast_output_file))
blast_seq_the_hard_way('sc.fsa', 'hsap.fsa')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment