Created
May 3, 2013 19:39
-
-
Save peterk87/5513274 to your computer and use it in GitHub Desktop.
Python: Parse tabular BLAST results from file into 2D dictionary ([query id/name] -> [subject id/name] -> [list of BLAST results (one or more)])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This file contains a set of functions for parsing out some useful information | |
| # from BLAST results files saved in BLAST's tabular output format ("-outfmt 6"). | |
| # Biopython is required for reading multifasta files and storing sequences. | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from Bio.Alphabet import IUPAC | |
| # if all of your genome sequences are within one multifasta file | |
| recs = [rec for rec in SeqIO.parse('all_genomes.fasta', 'fasta')] | |
| # if they are within multiple multifasta files: | |
| # fasta_files contains the paths to all of your multifasta files | |
| fasta_files = [] | |
| recs = [rec for f in fasta_files for rec in SeqIO.parse(f, 'fasta')] | |
| # in this case, each multifasta could have multiple contigs so each contig may | |
| # have a different id/name. This id/name would be it's identifier for BLAST. | |
| # get a dictionary of your strain/contig names and the corresponding SeqRecord | |
| # so that you can retrieve the SeqRecord using the name of that SeqRecord | |
| strain_name_seq_rec = {rec.name:rec for rec in recs} | |
| # The lengths can be stored in a dictionary where the query id/name can be | |
| # used to retrieve the length of that query. | |
| # query_files contains all of the query multifasta file paths | |
| query_files = [] | |
| query_lengths = {rec.name:len(rec.seq) for q_file in query_files for rec in SeqIO.parse(q_file, 'fasta')} | |
| def adjust_subject_indices(q_s_i, q_e_i, s_s_i, s_e_i, q_len, revcomp): | |
| """ | |
| Adjust subject indices based on query indices and query length. | |
| Args: | |
| q_s_i (int): Query start index | |
| q_e_i (int): Query end index | |
| s_s_i (int): Subject start index | |
| s_e_i (int): Subject end index | |
| q_len (int): Query length | |
| revcomp (bool): Reverse complemented? | |
| Returns: | |
| A tuple containing the adjusted subject start and end indices. | |
| """ | |
| add_to_start = q_s_i - 1 | |
| add_to_end = q_len - q_e_i | |
| if revcomp: | |
| s_s_i -= add_to_end | |
| s_e_i += add_to_start | |
| else: | |
| s_s_i -= add_to_start | |
| s_e_i += add_to_end | |
| s_s_i -= 1 | |
| s_e_i -= 1 | |
| return (s_s_i, s_e_i) | |
| def extract_extended_seq(sseq, q_s_i, q_e_i, s_s_i, s_e_i, q_len): | |
| """ | |
| Extract the extended subject sequence based on query start and end indices. | |
| Args: | |
| sseq (Bio.Sequence,string): Subject sequence | |
| q_s_i (int): Query start index | |
| q_e_i (int): Query end index | |
| s_s_i (int): Subject start index | |
| s_e_i (int): Subject end index | |
| q_len (int): Query length | |
| Returns: | |
| The extended subject sequence that matches up with the query | |
| """ | |
| revcomp = s_s_i > s_e_i | |
| s_s_i, s_e_i = adjust_subject_indices( | |
| q_s_i, | |
| q_e_i, | |
| s_s_i, | |
| s_e_i, | |
| q_len, | |
| revcomp) | |
| if s_s_i < 0: | |
| if len(sseq) < s_e_i: | |
| return sseq | |
| else: | |
| return sseq[0: s_e_i + 1] | |
| else: | |
| if len(sseq) - s_s_i < s_e_i - s_s_i + 1: | |
| return sseq[s_s_i:len(sseq) - s_s_i] | |
| else: | |
| return sseq[s_s_i:s_e_i + 1] | |
| def parse_tabular_blast_results(blast_results_file): | |
| """ | |
| Parse out the BLAST results from a tabular format BLAST results file into a | |
| 2D dictionary containing query id/name -> subject id/name -> BLAST result. | |
| Args: | |
| blast_results_file (string): Path to tabular BLAST results file. | |
| Returns: | |
| 2D dictionary containing query id/name -> subject id/name -> BLAST result | |
| """ | |
| # results are stored in a 2D dictionary - query -> subject -> BLAST result | |
| results = {} | |
| with open(blast_results_file,'r') as f: | |
| for line in f: | |
| line = line.rstrip() | |
| sp = line.split('\t') | |
| # get query id | |
| q = sp[0] | |
| q_len = query_lengths[q] | |
| # get subject id | |
| s = sp[1] | |
| # get the subject SeqRecord object | |
| sseq = strain_name_seq_rec[s] | |
| # get % identity and convert to decimal (e.g. 96.7 -> 0.967) | |
| pid = float(sp[2])/100.0 | |
| # get alignment length | |
| aln_len = int(sp[3]) | |
| # get query start index | |
| q_s_i = int(sp[6]) | |
| # get query end index | |
| q_e_i = int(sp[7]) | |
| # get subject start index | |
| s_s_i = int(sp[8]) | |
| # get subject end index | |
| s_e_i = int(sp[9]) | |
| # try to get the extended subject sequence based on the full length | |
| # of the query. If not able to grab the full length, grab as much | |
| # as possible. | |
| extracted_seq = extract_extended_seq(sseq, | |
| q_s_i, | |
| q_e_i, | |
| s_s_i, | |
| s_e_i, | |
| q_len) | |
| # number of identities (count of positive sequence matches) | |
| identities = int(aln_len*pid) | |
| # the results for a particular query could be whatever you need | |
| # in this example, a dictionary is made where different results have | |
| # different identifiers for instance | |
| result_dict = { | |
| 'aln_len':aln_len, | |
| 'q_len':q_len, | |
| 'pid':pid, | |
| 'extracted_seq': extracted_seq | |
| } | |
| # could add more items to this BLAST result dictionary | |
| if q not in results: | |
| results[q] = {s:[result_dict]} | |
| else: | |
| if s not in results[q]: | |
| results[q][s] = [result_dict] | |
| else: | |
| results[q][s].append(result_dict) | |
| return results |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment