Created
February 6, 2015 16:17
-
-
Save whacked/86d764a9e2b2607da741 to your computer and use it in GitHub Desktop.
traverse ncbi taxonomy?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
> head nodes.dmp
1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | |
6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
10 | 135621 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
> head names.dmp
1 | all | | synonym |
1 | root | | scientific name |
2 | Bacteria | Bacteria <prokaryote> | scientific name |
2 | Monera | Monera <Bacteria> | in-part |
2 | Procaryotae | Procaryotae <Bacteria> | in-part |
2 | Prokaryota | Prokaryota <Bacteria> | in-part |
2 | Prokaryotae | Prokaryotae <Bacteria> | in-part |
2 | bacteria | bacteria <blast2> | blast name |
2 | eubacteria | | genbank common name |
2 | not Bacteria Haeckel 1894 | | synonym |
'''
from time import time as now | |
# Read both dump files into memory as text. Evaluating the bare names is a
# probe: if they already exist in the namespace (e.g. the script is executed
# again in a session that kept them) the read is skipped.
try:
    rawnodes, rawnames
    print('files already loaded')
except NameError:
    # Only "name not defined" means "not loaded yet"; anything else should
    # surface instead of silently triggering a reload.
    t0 = now()
    with open('nodes.dmp') as nodes_file:
        rawnodes = nodes_file.read()
    with open('names.dmp') as names_file:
        rawnames = names_file.read()
    print('loaded files in %s sec' % (now() - t0))
# Build namemap: tax id (int) -> display name, from the text of names.dmp.
# A tax id has several rows (synonyms, common names, ...). The row whose name
# class is 'scientific name' wins; until one is seen, the most recent row for
# that id is kept.
try:
    namemap
    print('name map already loaded')
except NameError:
    t0 = now()
    namemap = {}
    scientific_ids = set()  # ids whose scientific name is already stored
    for line in rawnames.splitlines():
        # Columns: tax id | name | unique name | name class | ...
        fields = [part.strip() for part in line.split('|', 4)]
        if len(fields) < 2 or not fields[0]:
            continue  # blank or truncated line
        tax_id = int(fields[0])
        if tax_id in scientific_ids:
            continue  # never overwrite a scientific name with a synonym
        namemap[tax_id] = fields[1]
        if len(fields) > 3 and fields[3] == 'scientific name':
            scientific_ids.add(tax_id)
    print('loaded name map in %s sec' % (now() - t0))
# Build two lookups from the text of nodes.dmp, keyed by the first column
# (child tax id):
#   genusmap  -> third column (the rank string, e.g. 'genus', 'species')
#   parentmap -> second column (parent tax id)
try:
    genusmap, parentmap
    print('genus+parent map already loaded')
except NameError:
    t0 = now()
    genusmap = {}
    parentmap = {}
    for line in rawnodes.splitlines():
        fields = line.split('|', 3)
        if len(fields) < 3:
            continue  # blank or truncated line
        # int() tolerates the whitespace padding around the delimiters.
        child = int(fields[0])
        genusmap[child] = fields[2].strip()
        parentmap[child] = int(fields[1])
    print('loaded genus+parent map in %s sec' % (now() - t0))
VERBOSE = False


def query(node_id, iter=0):
    """Walk up the parent links from ``node_id`` until a 'genus' node is found.

    Uses the module-level ``genusmap`` (id -> rank), ``parentmap``
    (id -> parent id) and ``namemap`` (id -> name) lookups.

    Args:
        node_id: tax id to start from.
        iter: starting depth; only affects the verbose trace prefix and the
            "big tree" warning threshold.

    Returns:
        The tax id of the first node (starting with ``node_id`` itself) whose
        rank is 'genus', or None when the walk ends without finding one
        (unknown id, self-parented root reached, or a cycle in the links).

    Prints a "found genus" line on success; diagnostic lines are printed only
    when ``VERBOSE`` is true.
    """
    visited = set()  # guards against cycles other than the root self-loop
    depth = iter
    while True:
        if depth > 10:
            if VERBOSE:
                print(' * * * warning! big tree! * * *')
        prefix = '>>' * (depth + 1)
        category = genusmap.get(node_id)
        if VERBOSE:
            print('%s ID %s\tis type: %s' % (prefix, node_id, category))

        if category == 'genus':
            # Fall back to the raw id if the genus has no entry in namemap.
            print('|__ found genus at %s\n' % (namemap.get(node_id, node_id)))
            return node_id

        if node_id not in parentmap:
            if VERBOSE:
                print(' * * * warning: no parent! * * *')
            return None

        parent_id = parentmap[node_id]
        if parent_id == node_id:
            # A node that is its own parent marks the top of the tree.
            if VERBOSE:
                print(' ! ! ! end of tree ! ! !')
            return None

        visited.add(node_id)
        if parent_id in visited:
            if VERBOSE:
                print(' * * * warning: cycle in parent links! * * *')
            return None

        node_id = parent_id
        depth += 1
import random | |
# Demo: look up the genus of ten randomly chosen tax ids.
# random.choice needs an indexable sequence; on Python 3 a dict keys view is
# not one, so materialize the ids into a list once.
allid = list(namemap)
for _ in range(10):
    rand_id = random.choice(allid)
    print('=' * 50)
    print('querying: %s\t%s' % (rand_id, namemap[rand_id]))
    query(rand_id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment