Last active
July 12, 2018 14:29
-
-
Save AnnaTSW0609/c06f700278786a05dd548d549f0d65d3 to your computer and use it in GitHub Desktop.
Plant_Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
taxonomy_id ensembl_gene_id symbol gene_chrom_start gene_chrom_end chr_name chrom_strand description type_of_gene | |
athaliana_taxid AT3G11415 3588684 3589685 3 1 ncRNA | |
athaliana_taxid AT1G31258 11171225 11171835 1 1 other RNA [Source:TAIR;Acc:AT1G31258] ncRNA | |
athaliana_taxid AT5G24735 8468331 8469262 5 -1 other RNA [Source:TAIR;Acc:AT5G24735] ncRNA | |
athaliana_taxid AT2G45780 18854555 18855184 2 -1 other RNA [Source:TAIR;Acc:AT2G45780] ncRNA | |
athaliana_taxid AT2G42425 17661834 17662358 2 -1 Unknown gene [Source:TAIR;Acc:AT2G42425] ncRNA | |
athaliana_taxid AT4G01533 667148 669073 4 -1 other RNA [Source:TAIR;Acc:AT4G01533] ncRNA | |
athaliana_taxid AT4G09195 16880340 16880538 4 1 ncRNA | |
athaliana_taxid AT1G79075 29745939 29746842 1 1 other RNA [Source:TAIR;Acc:AT1G79075] ncRNA | |
athaliana_taxid AT5G10278 3226321 3232723 5 1 other RNA [Source:TAIR;Acc:AT5G10278] ncRNA | |
athaliana_taxid AT1G67238 25158115 25158454 1 -1 other RNA [Source:TAIR;Acc:AT1G67238] ncRNA | |
athaliana_taxid AT4G36648 17280528 17282730 4 -1 other RNA [Source:TAIR;Acc:AT4G36648] ncRNA | |
athaliana_taxid AT4G03605 178519 178717 4 1 ncRNA | |
athaliana_taxid AT2G15128 6564104 6564316 2 1 other RNA [Source:TAIR;Acc:AT2G15128] ncRNA | |
athaliana_taxid AT3G02832 616380 617322 3 1 ncRNA | |
athaliana_taxid AT3G48115 17771170 17772926 3 -1 other RNA [Source:TAIR;Acc:AT3G48115] ncRNA | |
athaliana_taxid AT1G07977 20412149 20412497 1 1 ncRNA | |
athaliana_taxid AT1G18745 6465487 6466454 1 -1 ncRNA | |
athaliana_taxid AT5G40275 16098190 16100051 5 -1 other RNA [Source:TAIR;Acc:AT5G40275] ncRNA | |
athaliana_taxid AT4G09405 17393758 17393956 4 -1 ncRNA | |
athaliana_taxid AT5G02975 6397787 6397985 5 -1 ncRNA | |
athaliana_taxid AT5G24206 8213626 8214689 5 -1 other RNA [Source:TAIR;Acc:AT5G24206] ncRNA | |
athaliana_taxid AT5G23155 7792732 7794952 5 1 other RNA [Source:TAIR;Acc:AT5G23155] ncRNA | |
athaliana_taxid AT3G45638 16755215 16756290 3 -1 other RNA [Source:TAIR;Acc:AT3G45638] ncRNA | |
athaliana_taxid AT5G24205 8210822 8213086 5 -1 other RNA [Source:TAIR;Acc:AT5G24205] ncRNA | |
athaliana_taxid AT5G03285 793514 793717 5 1 other RNA [Source:TAIR;Acc:AT5G03285] ncRNA | |
athaliana_taxid AT3G00980 104968 105271 3 -1 ncRNA | |
athaliana_taxid AT3G25795 9417551 9418705 3 -1 other RNA [Source:TAIR;Acc:AT3G25795] ncRNA | |
athaliana_taxid AT2G42485 17690531 17691018 2 1 other RNA [Source:TAIR;Acc:AT2G42485] ncRNA | |
athaliana_taxid AT3G52748 19548972 19549589 3 1 other RNA [Source:TAIR;Acc:AT3G52748] ncRNA | |
athaliana_taxid AT3G52742 19547358 19549011 3 -1 other RNA [Source:TAIR;Acc:AT3G52742] ncRNA | |
athaliana_taxid AT3G60176 22240155 22241164 3 -1 other RNA [Source:TAIR;Acc:AT3G60176] ncRNA | |
athaliana_taxid AT1G32172 11578506 11580547 1 1 other RNA [Source:TAIR;Acc:AT1G32172] ncRNA | |
athaliana_taxid AT1G26558 9175611 9177598 1 1 other RNA [Source:TAIR;Acc:AT1G26558] ncRNA | |
athaliana_taxid AT4G16892 9506322 9508337 4 -1 other RNA [Source:TAIR;Acc:AT4G16892] ncRNA | |
athaliana_taxid AT2G10537 4083603 4083913 2 -1 other RNA [Source:TAIR;Acc:AT2G10537] ncRNA | |
athaliana_taxid AT1G28900 10101028 10101099 1 1 pre-tRNA [Source:TAIR;Acc:AT1G28900] tRNA | |
athaliana_taxid ATCG01290 TRNI.4 152264 152337 Pt 1 tRNA-Ile [Source:TAIR;Acc:ATCG01290] tRNA | |
athaliana_taxid AT4G39195 18254989 18255072 4 -1 pre-tRNA [Source:TAIR;Acc:AT4G39195] tRNA | |
athaliana_taxid AT1G57300 21297221 21297302 1 1 pre-tRNA [Source:TAIR;Acc:AT1G57300] tRNA | |
athaliana_taxid AT2G15950 6946791 6946879 2 -1 pre-tRNA [Source:TAIR;Acc:AT2G15950] tRNA | |
athaliana_taxid ATCG00060 TRNQ 6616 6687 Pt -1 tRNA-Gln [Source:TAIR;Acc:ATCG00060] tRNA | |
athaliana_taxid AT5G63145 25330036 25330107 5 1 pre-tRNA [Source:TAIR;Acc:AT5G63145] tRNA | |
athaliana_taxid AT2G36510 15320234 15320316 2 1 pre-tRNA [Source:TAIR;Acc:AT2G36510] tRNA | |
athaliana_taxid AT4G28915 14267281 14267362 4 1 pre-tRNA [Source:TAIR;Acc:AT4G28915] tRNA | |
athaliana_taxid AT2G25400 10813839 10813911 2 1 pre-tRNA [Source:TAIR;Acc:AT2G25400] tRNA | |
athaliana_taxid AT1G45240 17158979 17159050 1 1 pre-tRNA [Source:TAIR;Acc:AT1G45240] tRNA | |
athaliana_taxid AT5G11475 3669000 3669072 5 -1 pre-tRNA [Source:TAIR;Acc:AT5G11475] tRNA | |
athaliana_taxid AT1G74570 28020383 28020463 1 -1 pre-tRNA [Source:TAIR;Acc:AT1G74570] tRNA | |
athaliana_taxid AT5G23665 7981398 7981469 5 1 pre-tRNA [Source:TAIR;Acc:AT5G23665] tRNA | |
athaliana_taxid AT2G02600 706796 706868 2 1 pre-tRNA [Source:TAIR;Acc:AT2G02600] tRNA | |
athaliana_taxid AT2G33660 14244699 14244770 2 1 pre-tRNA [Source:TAIR;Acc:AT2G33660] tRNA | |
athaliana_taxid ATMG00340 TRNY.1 104221 104295 Mt 1 tRNA-Tyr [Source:TAIR;Acc:ATMG00340] tRNA | |
athaliana_taxid AT5G53487 21723114 21723184 5 -1 pre-tRNA [Source:TAIR;Acc:AT5G53487] tRNA | |
athaliana_taxid AT1G01870 306384 306456 1 1 pre-tRNA [Source:TAIR;Acc:AT1G01870] tRNA | |
athaliana_taxid AT5G02385 508430 508502 5 -1 pre-tRNA [Source:TAIR;Acc:AT5G02385] tRNA | |
athaliana_taxid AT4G31075 15120650 15120722 4 1 pre-tRNA [Source:TAIR;Acc:AT4G31075] tRNA | |
athaliana_taxid AT4G32475 15671393 15671463 4 1 pre-tRNA [Source:TAIR;Acc:AT4G32475] tRNA | |
athaliana_taxid AT3G25715 9372374 9372447 3 -1 pre-tRNA [Source:TAIR;Acc:AT3G25715] tRNA | |
athaliana_taxid AT2G07754 3303735 3303822 2 -1 pre-tRNA [Source:TAIR;Acc:AT2G07754] tRNA | |
athaliana_taxid AT1G79240 29804123 29804195 1 1 pre-tRNA [Source:TAIR;Acc:AT1G79240] tRNA | |
athaliana_taxid AT1G30430 10756722 10756794 1 1 pre-tRNA [Source:TAIR;Acc:AT1G30430] tRNA | |
athaliana_taxid AT1G28950 10106898 10106969 1 1 pre-tRNA [Source:TAIR;Acc:AT1G28950] tRNA | |
athaliana_taxid ATMG00380 TRNN 105877 105948 Mt 1 tRNA-Asn [Source:TAIR;Acc:ATMG00380] tRNA | |
athaliana_taxid AT5G66535 26556023 26556096 5 -1 pre-tRNA [Source:TAIR;Acc:AT5G66535] tRNA |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
def plant_parser(data_folder):
    """Print one gene document for every ``AT``-prefixed row of the test file.

    Reads ``plant_gene_test.txt`` from *data_folder* as tab-separated text.
    Column 0 is used as the taxid, column 1 as the gene ID, column 2 as the
    symbol and column 7 as the description (assumes the nine-column layout
    taxonomy_id, ensembl_gene_id, symbol, ..., description, type_of_gene --
    confirm against the real export).

    Rows whose gene ID does not start with ``AT`` (which includes a header
    row) and blank or truncated lines are skipped. For each remaining row a
    dict is printed with ``taxid``, ``symbol`` (only when non-empty),
    ``name`` (the description up to `` [Source:``, only when non-empty) and
    ``_id``.

    Args:
        data_folder: Directory that contains ``plant_gene_test.txt``.

    Raises:
        OSError: If the data file cannot be opened for reading.
    """
    datafile = os.path.join(data_folder, 'plant_gene_test.txt')
    # Read-only mode: nothing is written, and "r+" fails on read-only files.
    with open(datafile, "r") as handle:
        for raw_line in handle:
            # Remove only the line terminator so empty columns at either end
            # of the row keep their positions; whitespace is trimmed per field.
            fields = [
                field.strip()
                for field in raw_line.rstrip("\r\n").split("\t")
            ]
            # Blank or truncated lines have no gene ID column at all.
            if len(fields) < 2 or not fields[1].startswith("AT"):
                continue

            doc = {"taxid": fields[0]}

            symbol = fields[2] if len(fields) > 2 else ""
            if symbol:  # add the symbol only if the field is not empty
                doc["symbol"] = symbol

            description = fields[7] if len(fields) > 7 else ""
            # Keep only the text in front of the " [Source:...]" annotation.
            name = description.partition(" [Source:")[0].strip()
            if name:
                doc["name"] = name

            doc["_id"] = fields[1]
            print(doc)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os.path | |
import copy | |
from biothings.utils.common import SubStr | |
from biothings.utils.dataload import tab2dict, tab2list, value_convert, normalized_value, \ | |
list2dict, dict_nodup, dict_attrmerge, tab2dict_iter | |
def _not_LRG(ld): | |
return not ld[1].startswith("LRG_") | |
def load_ensembl_main(data_folder):
    """Print one gene document per record of ``plant_gene_test.txt``.

    The file is read from *data_folder* through ``tab2dict_iter`` with
    columns ``(0, 1, 2, 7, 8)`` selected, key index ``1`` and ``_not_LRG``
    as the include filter. Every value is converted by ``_fn`` via
    ``value_convert``, the dictionary key is stored under ``_id`` and the
    resulting document is printed.

    Args:
        data_folder: Directory that contains ``plant_gene_test.txt``.
    """
    # Field contents that mean "no value" (empty, or the literal text \N).
    null_markers = ('', '\\N')

    def _fn(x):
        # NOTE(review): x is presumably the selected columns without the key
        # column, i.e. (taxid, symbol, description, type_of_gene) -- confirm
        # against tab2dict_iter.
        out = {'taxid': x[0]}
        symbol = x[1].strip()
        if symbol not in null_markers:
            out['symbol'] = symbol
        description = x[2].strip()
        if description not in null_markers:
            # Keep only the text in front of the " [Source:...]" annotation.
            _name = SubStr(description, '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    datafile = os.path.join(data_folder, 'plant_gene_test.txt')
    for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1, includefn=_not_LRG):
        datadict = value_convert(datadict, _fn)
        # gene_id rather than id, so the builtin id() is not shadowed.
        for gene_id, doc in datadict.items():
            doc['_id'] = gene_id
            print(doc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment