Created
February 2, 2014 21:14
-
-
Save azalea/8775085 to your computer and use it in GitHub Desktop.
This code tried to do things the hard way to solve: http://www.biostars.org/p/92205/ by first, the function split_fasta() splits the original fasta file into files containing individual sequence records; second, blast_seq_the_hard_way() iterates over these individual files and feeds their filenames to BLAST. Please note that BLAST can handle mul…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''This code tried to do things the hard way to solve: http://www.biostars.org/p/92205/ by | |
first, the function split_fasta() splits the original fasta file into files containing individual sequence records; | |
second, blast_seq_the_hard_way() iterates over these individual files and feeds their filenames to BLAST. | |
Please note that BLAST can handle multiple sequences in -query and -subject at the same time, so there is no need | |
to split the files. This code is just for demonstration purposes. | |
''' | |
import os | |
import re | |
from Bio.Blast.Applications import NcbiblastpCommandline | |
from Bio.Blast import NCBIXML | |
from Bio import SeqIO | |
from Bio.Blast import NCBIWWW | |
def split_fasta(input_file): | |
'''Splits input_file into separate files, each containing one sequence record.''' | |
# The following two lines are used for the output filenames to keep each individual sequence | |
basename = os.path.basename(input_file) | |
filename_without_extension, extension = os.path.splitext(basename) | |
for i, record in enumerate(SeqIO.parse(input_file, 'fasta')): | |
# To generate output filenames called filename_0.fsa, filename_1.fsa, etc | |
outputfile = '%s_%d%s'%(filename_without_extension, i, extension) | |
SeqIO.write(record, outputfile, 'fasta') | |
# To keep track of the input_file for future use | |
return filename_without_extension, extension | |
def blast_seq_the_hard_way(SC_Fasta, HS_Fasta): | |
input_file_sc, sc_extension = split_fasta(SC_Fasta) | |
input_file_hs, hs_extension = split_fasta(HS_Fasta) | |
blastp = "C:\\Program Files\\NCBI\\blast-2.2.29+\\bin\\blastp" | |
# Find the files filename_0.fsa etc | |
for file_sc in [f for f in os.listdir(os.getcwd()) if re.search(r'^%s_\d+%s$'%(input_file_sc, sc_extension), f)]: | |
for file_hs in [f for f in os.listdir(os.getcwd()) if re.search(r'^%s_\d+%s$'%(input_file_hs, hs_extension), f)]: | |
print file_hs | |
blast_output_file = '%s_%s_blast.xml'%(file_sc, file_hs) | |
# I provide an "out" parameter here to save the blast output in the "blast_output_file" | |
NcbiblastpCommandline(blastp, query=file_sc, subject=file_hs, outfmt=5, out=blast_output_file)() | |
# Now you can read the blast output from the file and process it however you want | |
# blast_result_record = NCBIXML.read(open(blast_output_file)) | |
blast_seq_the_hard_way('sc.fsa', 'hsap.fsa') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment