Skip to content

Instantly share code, notes, and snippets.

@AnnaTSW0609
Last active July 12, 2018 14:29
Show Gist options
  • Save AnnaTSW0609/c06f700278786a05dd548d549f0d65d3 to your computer and use it in GitHub Desktop.
Save AnnaTSW0609/c06f700278786a05dd548d549f0d65d3 to your computer and use it in GitHub Desktop.
Plant_Parser
taxonomy_id ensembl_gene_id symbol gene_chrom_start gene_chrom_end chr_name chrom_strand description type_of_gene
athaliana_taxid AT3G11415 3588684 3589685 3 1 ncRNA
athaliana_taxid AT1G31258 11171225 11171835 1 1 other RNA [Source:TAIR;Acc:AT1G31258] ncRNA
athaliana_taxid AT5G24735 8468331 8469262 5 -1 other RNA [Source:TAIR;Acc:AT5G24735] ncRNA
athaliana_taxid AT2G45780 18854555 18855184 2 -1 other RNA [Source:TAIR;Acc:AT2G45780] ncRNA
athaliana_taxid AT2G42425 17661834 17662358 2 -1 Unknown gene [Source:TAIR;Acc:AT2G42425] ncRNA
athaliana_taxid AT4G01533 667148 669073 4 -1 other RNA [Source:TAIR;Acc:AT4G01533] ncRNA
athaliana_taxid AT4G09195 16880340 16880538 4 1 ncRNA
athaliana_taxid AT1G79075 29745939 29746842 1 1 other RNA [Source:TAIR;Acc:AT1G79075] ncRNA
athaliana_taxid AT5G10278 3226321 3232723 5 1 other RNA [Source:TAIR;Acc:AT5G10278] ncRNA
athaliana_taxid AT1G67238 25158115 25158454 1 -1 other RNA [Source:TAIR;Acc:AT1G67238] ncRNA
athaliana_taxid AT4G36648 17280528 17282730 4 -1 other RNA [Source:TAIR;Acc:AT4G36648] ncRNA
athaliana_taxid AT4G03605 178519 178717 4 1 ncRNA
athaliana_taxid AT2G15128 6564104 6564316 2 1 other RNA [Source:TAIR;Acc:AT2G15128] ncRNA
athaliana_taxid AT3G02832 616380 617322 3 1 ncRNA
athaliana_taxid AT3G48115 17771170 17772926 3 -1 other RNA [Source:TAIR;Acc:AT3G48115] ncRNA
athaliana_taxid AT1G07977 20412149 20412497 1 1 ncRNA
athaliana_taxid AT1G18745 6465487 6466454 1 -1 ncRNA
athaliana_taxid AT5G40275 16098190 16100051 5 -1 other RNA [Source:TAIR;Acc:AT5G40275] ncRNA
athaliana_taxid AT4G09405 17393758 17393956 4 -1 ncRNA
athaliana_taxid AT5G02975 6397787 6397985 5 -1 ncRNA
athaliana_taxid AT5G24206 8213626 8214689 5 -1 other RNA [Source:TAIR;Acc:AT5G24206] ncRNA
athaliana_taxid AT5G23155 7792732 7794952 5 1 other RNA [Source:TAIR;Acc:AT5G23155] ncRNA
athaliana_taxid AT3G45638 16755215 16756290 3 -1 other RNA [Source:TAIR;Acc:AT3G45638] ncRNA
athaliana_taxid AT5G24205 8210822 8213086 5 -1 other RNA [Source:TAIR;Acc:AT5G24205] ncRNA
athaliana_taxid AT5G03285 793514 793717 5 1 other RNA [Source:TAIR;Acc:AT5G03285] ncRNA
athaliana_taxid AT3G00980 104968 105271 3 -1 ncRNA
athaliana_taxid AT3G25795 9417551 9418705 3 -1 other RNA [Source:TAIR;Acc:AT3G25795] ncRNA
athaliana_taxid AT2G42485 17690531 17691018 2 1 other RNA [Source:TAIR;Acc:AT2G42485] ncRNA
athaliana_taxid AT3G52748 19548972 19549589 3 1 other RNA [Source:TAIR;Acc:AT3G52748] ncRNA
athaliana_taxid AT3G52742 19547358 19549011 3 -1 other RNA [Source:TAIR;Acc:AT3G52742] ncRNA
athaliana_taxid AT3G60176 22240155 22241164 3 -1 other RNA [Source:TAIR;Acc:AT3G60176] ncRNA
athaliana_taxid AT1G32172 11578506 11580547 1 1 other RNA [Source:TAIR;Acc:AT1G32172] ncRNA
athaliana_taxid AT1G26558 9175611 9177598 1 1 other RNA [Source:TAIR;Acc:AT1G26558] ncRNA
athaliana_taxid AT4G16892 9506322 9508337 4 -1 other RNA [Source:TAIR;Acc:AT4G16892] ncRNA
athaliana_taxid AT2G10537 4083603 4083913 2 -1 other RNA [Source:TAIR;Acc:AT2G10537] ncRNA
athaliana_taxid AT1G28900 10101028 10101099 1 1 pre-tRNA [Source:TAIR;Acc:AT1G28900] tRNA
athaliana_taxid ATCG01290 TRNI.4 152264 152337 Pt 1 tRNA-Ile [Source:TAIR;Acc:ATCG01290] tRNA
athaliana_taxid AT4G39195 18254989 18255072 4 -1 pre-tRNA [Source:TAIR;Acc:AT4G39195] tRNA
athaliana_taxid AT1G57300 21297221 21297302 1 1 pre-tRNA [Source:TAIR;Acc:AT1G57300] tRNA
athaliana_taxid AT2G15950 6946791 6946879 2 -1 pre-tRNA [Source:TAIR;Acc:AT2G15950] tRNA
athaliana_taxid ATCG00060 TRNQ 6616 6687 Pt -1 tRNA-Gln [Source:TAIR;Acc:ATCG00060] tRNA
athaliana_taxid AT5G63145 25330036 25330107 5 1 pre-tRNA [Source:TAIR;Acc:AT5G63145] tRNA
athaliana_taxid AT2G36510 15320234 15320316 2 1 pre-tRNA [Source:TAIR;Acc:AT2G36510] tRNA
athaliana_taxid AT4G28915 14267281 14267362 4 1 pre-tRNA [Source:TAIR;Acc:AT4G28915] tRNA
athaliana_taxid AT2G25400 10813839 10813911 2 1 pre-tRNA [Source:TAIR;Acc:AT2G25400] tRNA
athaliana_taxid AT1G45240 17158979 17159050 1 1 pre-tRNA [Source:TAIR;Acc:AT1G45240] tRNA
athaliana_taxid AT5G11475 3669000 3669072 5 -1 pre-tRNA [Source:TAIR;Acc:AT5G11475] tRNA
athaliana_taxid AT1G74570 28020383 28020463 1 -1 pre-tRNA [Source:TAIR;Acc:AT1G74570] tRNA
athaliana_taxid AT5G23665 7981398 7981469 5 1 pre-tRNA [Source:TAIR;Acc:AT5G23665] tRNA
athaliana_taxid AT2G02600 706796 706868 2 1 pre-tRNA [Source:TAIR;Acc:AT2G02600] tRNA
athaliana_taxid AT2G33660 14244699 14244770 2 1 pre-tRNA [Source:TAIR;Acc:AT2G33660] tRNA
athaliana_taxid ATMG00340 TRNY.1 104221 104295 Mt 1 tRNA-Tyr [Source:TAIR;Acc:ATMG00340] tRNA
athaliana_taxid AT5G53487 21723114 21723184 5 -1 pre-tRNA [Source:TAIR;Acc:AT5G53487] tRNA
athaliana_taxid AT1G01870 306384 306456 1 1 pre-tRNA [Source:TAIR;Acc:AT1G01870] tRNA
athaliana_taxid AT5G02385 508430 508502 5 -1 pre-tRNA [Source:TAIR;Acc:AT5G02385] tRNA
athaliana_taxid AT4G31075 15120650 15120722 4 1 pre-tRNA [Source:TAIR;Acc:AT4G31075] tRNA
athaliana_taxid AT4G32475 15671393 15671463 4 1 pre-tRNA [Source:TAIR;Acc:AT4G32475] tRNA
athaliana_taxid AT3G25715 9372374 9372447 3 -1 pre-tRNA [Source:TAIR;Acc:AT3G25715] tRNA
athaliana_taxid AT2G07754 3303735 3303822 2 -1 pre-tRNA [Source:TAIR;Acc:AT2G07754] tRNA
athaliana_taxid AT1G79240 29804123 29804195 1 1 pre-tRNA [Source:TAIR;Acc:AT1G79240] tRNA
athaliana_taxid AT1G30430 10756722 10756794 1 1 pre-tRNA [Source:TAIR;Acc:AT1G30430] tRNA
athaliana_taxid AT1G28950 10106898 10106969 1 1 pre-tRNA [Source:TAIR;Acc:AT1G28950] tRNA
athaliana_taxid ATMG00380 TRNN 105877 105948 Mt 1 tRNA-Asn [Source:TAIR;Acc:ATMG00380] tRNA
athaliana_taxid AT5G66535 26556023 26556096 5 -1 pre-tRNA [Source:TAIR;Acc:AT5G66535] tRNA
import os
import re
def plant_parser(data_folder):
    """Print one gene document per Arabidopsis row of ``plant_gene_test.txt``.

    The input is read as a tab-separated table whose columns are, in order:
    taxonomy_id, ensembl_gene_id, symbol, gene_chrom_start, gene_chrom_end,
    chr_name, chrom_strand, description, type_of_gene.

    Only rows whose gene id (column 1) starts with ``AT`` are kept, which
    also skips the header row. Each kept row is printed as a dict with:

    * ``taxid``  -- column 0, always present
    * ``symbol`` -- column 2, only when non-empty
    * ``name``   -- column 7 up to (not including) ``' [Source:'``,
      only when column 7 is non-empty
    * ``_id``    -- column 1, always present

    Args:
        data_folder: Directory containing ``plant_gene_test.txt``.

    Returns:
        None. Documents are written to standard output with ``print``.

    Raises:
        OSError: If the data file cannot be opened for reading.
    """
    datafile = os.path.join(data_folder, 'plant_gene_test.txt')
    # Read-only mode: nothing is written back, so "r+" only added a needless
    # requirement for write permission on the data file.
    with open(datafile, "r") as handle:
        for line in handle:
            # Remove only the line terminator. Stripping all whitespace would
            # also eat trailing tabs, dropping empty trailing columns and
            # shifting the column indexes used below.
            # TODO: the split result is planned to become a class attribute
            # so later functions can reuse it.
            fields = line.rstrip("\r\n").split("\t")

            # Blank/short rows and non-Arabidopsis ids (including the header
            # row, whose second column is "ensembl_gene_id") are skipped.
            if len(fields) < 2 or not fields[1].startswith("AT"):
                continue

            doc = {"taxid": fields[0]}
            # Optional columns are length-checked so a truncated row yields a
            # partial document instead of an IndexError.
            if len(fields) > 2 and fields[2] != '':
                doc["symbol"] = fields[2]
            if len(fields) > 7 and fields[7] != '':
                # Keep only the text before the " [Source:...]" annotation.
                doc['name'] = fields[7].partition(' [Source:')[0]
            doc["_id"] = fields[1]
            print(doc)
import os.path
import copy
from biothings.utils.common import SubStr
from biothings.utils.dataload import tab2dict, tab2list, value_convert, normalized_value, \
list2dict, dict_nodup, dict_attrmerge, tab2dict_iter
def _not_LRG(ld):
    """Return True unless the second field of *ld* starts with ``"LRG_"``.

    Passed as the ``includefn`` filter to ``tab2dict_iter`` in this file.

    Args:
        ld: Indexable row whose element at index 1 is a string
            (presumably the gene id column -- confirm against the
            ``tab2dict_iter`` contract).

    Returns:
        bool: False for ids beginning with ``LRG_``, True otherwise.
    """
    return not ld[1].startswith("LRG_")
def load_ensembl_main(data_folder):
    """Print gene documents built from ``plant_gene_test.txt``.

    Rows are read with ``tab2dict_iter`` using columns (0, 1, 2, 7, 8) and
    filtered through ``_not_LRG``. Each value is converted by ``_fn`` into a
    dict holding ``taxid`` plus optional ``symbol`` and ``name``; the
    dictionary key is then stored under ``_id`` and the document is printed.

    NOTE(review): this assumes ``tab2dict_iter`` keys each row on the second
    selected column (the Ensembl gene id) and hands ``_fn`` the remaining
    columns in order (taxid, symbol, description, gene type) -- confirm
    against the biothings documentation. It is also not visible from here
    whether the header row is skipped by ``tab2dict_iter``; verify.

    Args:
        data_folder: Directory containing ``plant_gene_test.txt``.

    Returns:
        None. Documents are written to standard output with ``print``.
    """
    # Placeholders treated as "no value" in the symbol/description columns.
    empty_values = ['', '\\N']

    def _fn(x):
        """Convert one row's value sequence into a gene document."""
        out = {'taxid': x[0]}

        symbol = x[1].strip()
        if symbol not in empty_values:
            out['symbol'] = symbol

        description = x[2].strip()
        if description not in empty_values:
            # Presumably SubStr returns the text preceding ' [Source:',
            # i.e. the description without its source annotation -- confirm.
            _name = SubStr(description, '', ' [Source:').strip()
            if _name:
                out['name'] = _name
        return out

    datafile = os.path.join(data_folder, 'plant_gene_test.txt')
    for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1, includefn=_not_LRG):
        datadict = value_convert(datadict, _fn)
        # ``gene_id`` rather than ``id`` so the builtin is not shadowed.
        for gene_id, doc in datadict.items():
            doc['_id'] = gene_id
            print(doc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment