Created
March 13, 2020 01:15
-
-
Save pansapiens/870908abc7d0292ace0654a036487303 to your computer and use it in GitHub Desktop.
AAIndex parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request as request | |
from collections import defaultdict | |
def parse_aaindex2(lines, default=None):
    """
    Parse the lines of an AAIndex2 substitution matrix file, return a dict of
    the entire database keyed by AAIndex identifier.

    Each record is a ``defaultdict(str)`` holding:

    * one string per text field, keyed by its one-letter tag
      (``'D'``, ``'R'``, ``'A'``, ``'T'``, ``'*'``, ``'J'``); continuation
      lines are appended to the field, separated by a single space,
    * ``'row_index'`` / ``'col_index'``: lists of the single-character row
      and column labels taken from the ``M`` line,
    * ``'matrix'``: a dict mapping ``(column_label, row_label)`` to a float.

    The aaindex[id]['matrix'] dictionary is the same structure as Biopython's
    `Bio.SubsMat.MatrixInfo` substitution matrices.

    Args:
        lines: iterable of text lines (with or without trailing newlines).
            Leading whitespace must be preserved, because it is what
            distinguishes a continuation/data line from a tagged line.
        default: value stored for matrix cells that are marked as missing
            (``-`` or ``NA``) in the input. Defaults to ``None``.

    Returns:
        dict mapping AAIndex identifier -> record (see above).

    Raises:
        ValueError: if a matrix cell is neither a number nor a missing marker.
        IndexError: if an ``M`` line is malformed or a matrix has more
            rows/columns of data than labels.

    Example:

        import urllib.request as request
        aaindex2_text = request.urlopen('ftp://ftp.genome.jp/pub/db/community/aaindex/aaindex2').read().decode().splitlines()
        aaindex2 = parse_aaindex2(aaindex2_text)
        grantham_matrix = aaindex2['GRAR740104']['matrix']
    """
    text_tags = ('D', 'R', 'A', 'T', '*', 'J')
    all_tags = ('H', 'M') + text_tags
    # '-' marks an undefined cell; 'NA' is tolerated as well so that such a
    # token does not crash float().
    missing_markers = ('-', 'NA')

    aaindex = {}
    current_id = None      # identifier of the record being filled, if any
    record_type = None     # tag of the field the current line belongs to
    matrix = {}
    rows = []
    cols = []
    i_row = 0

    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        # A tag is only recognised in the very first column of the raw line;
        # indented lines continue whatever field came before them.
        is_tag_line = raw[0] in all_tags
        if is_tag_line:
            record_type = raw[0]
        if line == '//':
            record_type = '//'

        if record_type == 'H':
            # Only the tagged line itself starts a new record; an indented
            # line following it must not be mistaken for another header.
            if is_tag_line:
                current_id = line[1:].strip()
                aaindex[current_id] = defaultdict(str)
                matrix = {}
                rows, cols = [], []
                i_row = 0
            continue

        if current_id is None:
            # Content outside of any record: nothing to attach it to.
            continue

        record = aaindex[current_id]

        if record_type in text_tags:
            # Drop the tag from the tagged line only; continuation lines are
            # already bare text once stripped.
            text = line[1:].strip() if is_tag_line else line
            existing = record[record_type]  # also creates the key if absent
            if text:
                record[record_type] = existing + ' ' + text if existing else text
        elif record_type == 'M':
            if is_tag_line:
                # e.g. "M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV"
                parts = line[1:].split(',')
                rows = list(parts[0].split('=')[1].strip())
                cols = list(parts[1].split('=')[1].strip())
                record['row_index'] = rows
                record['col_index'] = cols
            else:
                row_label = rows[i_row]
                for i_col, token in enumerate(line.split()):
                    value = default if token in missing_markers else float(token)
                    matrix[(cols[i_col], row_label)] = value
                i_row += 1
        elif record_type == '//':
            record['matrix'] = matrix
            record_type = None
            current_id = None

    # Keep the matrix of a final record that lacks its '//' terminator.
    if current_id is not None:
        aaindex[current_id]['matrix'] = matrix

    return aaindex
def get_aaindex2(url='ftp://ftp.genome.jp/pub/db/community/aaindex/aaindex2'):
    """
    Download an AAIndex2 file and parse it with `parse_aaindex2`.

    Args:
        url: location of the AAIndex2 flat file; any URL scheme accepted by
            `urllib.request.urlopen` may be used.

    Returns:
        The dict produced by `parse_aaindex2` for the downloaded text.
    """
    # The context manager closes the connection even if read/decode fails.
    with request.urlopen(url) as response:
        aaindex2_text = response.read().decode().splitlines()
    return parse_aaindex2(aaindex2_text)
if __name__ == '__main__':
    # Script entry point: fetch the AAIndex2 database from the default URL
    # and pretty-print the parsed result.
    from pprint import pprint

    aaindex2 = get_aaindex2()
    pprint(aaindex2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment