Skip to content

Instantly share code, notes, and snippets.

@peterk87
Created May 3, 2013 19:39
Show Gist options
  • Select an option

  • Save peterk87/5513274 to your computer and use it in GitHub Desktop.

Select an option

Save peterk87/5513274 to your computer and use it in GitHub Desktop.
Python: Parse tabular BLAST results from file into 2D dictionary ([query id/name] -> [subject id/name] -> [list of BLAST results (one or more)])
# This file contains a set of functions for parsing out some useful information
# from BLAST results files saved in BLAST's tabular output format ("-outfmt 6").
# Biopython is required for reading multifasta files and storing sequences.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
# if all of your genome sequences are within one multifasta file
recs = [rec for rec in SeqIO.parse('all_genomes.fasta', 'fasta')]
# if they are within multiple multifasta files:
# fasta_files contains the paths to all of your multifasta files
fasta_files = []
recs = [rec for f in fasta_files for rec in SeqIO.parse(f, 'fasta')]
# in this case, each multifasta could have multiple contigs so each contig may
# have a different id/name. This id/name would be it's identifier for BLAST.
# get a dictionary of your strain/contig names and the corresponding SeqRecord
# so that you can retrieve the SeqRecord using the name of that SeqRecord
strain_name_seq_rec = {rec.name:rec for rec in recs}
# The lengths can be stored in a dictionary where the query id/name can be
# used to retrieve the length of that query.
# query_files contains all of the query multifasta file paths
query_files = []
query_lengths = {rec.name:len(rec.seq) for q_file in query_files for rec in SeqIO.parse(q_file, 'fasta')}
def adjust_subject_indices(q_s_i, q_e_i, s_s_i, s_e_i, q_len, revcomp):
"""
Adjust subject indices based on query indices and query length.
Args:
q_s_i (int): Query start index
q_e_i (int): Query end index
s_s_i (int): Subject start index
s_e_i (int): Subject end index
q_len (int): Query length
revcomp (bool): Reverse complemented?
Returns:
A tuple containing the adjusted subject start and end indices.
"""
add_to_start = q_s_i - 1
add_to_end = q_len - q_e_i
if revcomp:
s_s_i -= add_to_end
s_e_i += add_to_start
else:
s_s_i -= add_to_start
s_e_i += add_to_end
s_s_i -= 1
s_e_i -= 1
return (s_s_i, s_e_i)
def extract_extended_seq(sseq, q_s_i, q_e_i, s_s_i, s_e_i, q_len):
"""
Extract the extended subject sequence based on query start and end indices.
Args:
sseq (Bio.Sequence,string): Subject sequence
q_s_i (int): Query start index
q_e_i (int): Query end index
s_s_i (int): Subject start index
s_e_i (int): Subject end index
q_len (int): Query length
Returns:
The extended subject sequence that matches up with the query
"""
revcomp = s_s_i > s_e_i
s_s_i, s_e_i = adjust_subject_indices(
q_s_i,
q_e_i,
s_s_i,
s_e_i,
q_len,
revcomp)
if s_s_i < 0:
if len(sseq) < s_e_i:
return sseq
else:
return sseq[0: s_e_i + 1]
else:
if len(sseq) - s_s_i < s_e_i - s_s_i + 1:
return sseq[s_s_i:len(sseq) - s_s_i]
else:
return sseq[s_s_i:s_e_i + 1]
def parse_tabular_blast_results(blast_results_file):
"""
Parse out the BLAST results from a tabular format BLAST results file into a
2D dictionary containing query id/name -> subject id/name -> BLAST result.
Args:
blast_results_file (string): Path to tabular BLAST results file.
Returns:
2D dictionary containing query id/name -> subject id/name -> BLAST result
"""
# results are stored in a 2D dictionary - query -> subject -> BLAST result
results = {}
with open(blast_results_file,'r') as f:
for line in f:
line = line.rstrip()
sp = line.split('\t')
# get query id
q = sp[0]
q_len = query_lengths[q]
# get subject id
s = sp[1]
# get the subject SeqRecord object
sseq = strain_name_seq_rec[s]
# get % identity and convert to decimal (e.g. 96.7 -> 0.967)
pid = float(sp[2])/100.0
# get alignment length
aln_len = int(sp[3])
# get query start index
q_s_i = int(sp[6])
# get query end index
q_e_i = int(sp[7])
# get subject start index
s_s_i = int(sp[8])
# get subject end index
s_e_i = int(sp[9])
# try to get the extended subject sequence based on the full length
# of the query. If not able to grab the full length, grab as much
# as possible.
extracted_seq = extract_extended_seq(sseq,
q_s_i,
q_e_i,
s_s_i,
s_e_i,
q_len)
# number of identities (count of positive sequence matches)
identities = int(aln_len*pid)
# the results for a particular query could be whatever you need
# in this example, a dictionary is made where different results have
# different identifiers for instance
result_dict = {
'aln_len':aln_len,
'q_len':q_len,
'pid':pid,
'extracted_seq': extracted_seq
}
# could add more items to this BLAST result dictionary
if q not in results:
results[q] = {s:[result_dict]}
else:
if s not in results[q]:
results[q][s] = [result_dict]
else:
results[q][s].append(result_dict)
return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment