Skip to content

Instantly share code, notes, and snippets.

@keithshep
Last active August 29, 2015 14:26
Show Gist options
  • Save keithshep/f640b8db7359309a3034 to your computer and use it in GitHub Desktop.
Save keithshep/f640b8db7359309a3034 to your computer and use it in GitHub Desktop.
import csv
import requests
def build_ensembl_biomart_dict(dataset_name, key_attr, val_attr):
    """Download two attributes from Ensembl BioMart and return them as a dict.

    A BioMart XML query is sent to the Ensembl ``martservice`` endpoint asking
    for ``key_attr`` and ``val_attr`` of ``dataset_name`` as header-less CSV.
    Each returned row becomes one ``key -> value`` entry.

    Args:
        dataset_name: BioMart dataset name, e.g. ``'hsapiens_gene_ensembl'``.
        key_attr: BioMart attribute used as the dictionary key.
        val_attr: BioMart attribute used as the dictionary value.

    Returns:
        A ``dict`` mapping each ``key_attr`` value to a ``val_attr`` value.
        The query does not request unique rows (``uniqueRows="0"``), so when
        a key occurs on several rows the last row wins.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        ValueError: if a non-empty row does not have exactly two columns
            (for example when the service returns an error message instead
            of data).
    """
    # see http://ensembl.org/biomart/martview/ for the web application
    biomart_service_url = 'http://ensembl.org/biomart/martservice'
    biomart_query = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<!DOCTYPE Query>'
        '<Query virtualSchemaName="default" formatter="CSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">'
        '<Dataset name="{}" interface="default">'
        '<Attribute name="{}"/>'
        '<Attribute name="{}"/>'
        '</Dataset>'
        '</Query>'
    ).format(dataset_name, key_attr, val_attr)

    # Passing the XML through ``params`` lets requests URL-encode it properly
    # instead of relying on best-effort re-quoting of a hand-built URL.
    # The ``with`` block guarantees the streamed connection is released.
    with requests.get(
            biomart_service_url,
            params={'query': biomart_query},
            stream=True) as resp:
        resp.raise_for_status()

        # iter_lines() only decodes to str when an encoding is known; without
        # one it yields bytes, which csv.reader rejects on Python 3.
        if resp.encoding is None:
            resp.encoding = 'utf-8'

        mapping = {}
        for row in csv.reader(resp.iter_lines(decode_unicode=True)):
            if not row:
                # blank line (e.g. trailing newline) -- nothing to record
                continue
            if len(row) != 2:
                raise ValueError(
                    'unexpected BioMart response row (expected 2 columns): '
                    '{!r}'.format(row))
            key, val = row
            mapping[key] = val

    return mapping
def main():
    """Print the Ensembl transcript and gene ID for a fixed set of RefSeq IDs.

    Two lookup tables are built with ``build_ensembl_biomart_dict`` for the
    ``hsapiens_gene_ensembl`` dataset: RefSeq mRNA ID -> Ensembl transcript ID
    and Ensembl transcript ID -> Ensembl gene ID. Each hard-coded RefSeq ID is
    then resolved through both tables and reported on its own line.

    Raises:
        KeyError: if a RefSeq ID or its transcript is missing from a table.
    """
    dataset = 'hsapiens_gene_ensembl'

    refseq_to_transcript = build_ensembl_biomart_dict(
        dataset, 'refseq_mrna', 'ensembl_transcript_id')
    transcript_to_gene = build_ensembl_biomart_dict(
        dataset, 'ensembl_transcript_id', 'ensembl_gene_id')

    refseq_ids = (
        'NM_005665', 'NM_021797', 'NM_020440', 'NM_001039703',
        'NM_001009931', 'NM_002963', 'NM_001204087', 'NM_002455',
        'NM_001105205', 'NM_005598',
    )

    report_line = 'NM ID: {}, transcript: {}, gene: {}'
    for refseq_id in refseq_ids:
        # chain the two lookups: RefSeq mRNA -> transcript -> gene
        transcript_id = refseq_to_transcript[refseq_id]
        ensembl_gene_id = transcript_to_gene[transcript_id]
        print(report_line.format(refseq_id, transcript_id, ensembl_gene_id))
# Run the lookup only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
@keithshep
Copy link
Author

resulting output:

NM ID: NM_005665, transcript: ENST00000370331, gene: ENSG00000067208
NM ID: NM_021797, transcript: ENST00000430615, gene: ENSG00000134216
NM ID: NM_020440, transcript: ENST00000393203, gene: ENSG00000134247
NM ID: NM_001039703, transcript: ENST00000583866, gene: ENSG00000271425
NM ID: NM_001009931, transcript: ENST00000368801, gene: ENSG00000197915
NM ID: NM_002963, transcript: ENST00000368723, gene: ENSG00000143556
NM ID: NM_001204087, transcript: ENST00000618040, gene: ENSG00000143603
NM ID: NM_002455, transcript: ENST00000368376, gene: ENSG00000173171
NM ID: NM_001105205, transcript: ENST00000368347, gene: ENSG00000160753
NM ID: NM_005598, transcript: ENST00000302101, gene: ENSG00000171786

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment