Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save MikeTrizna/29d5e68a7a9bd9cac43d394f868920ce to your computer and use it in GitHub Desktop.
Save MikeTrizna/29d5e68a7a9bd9cac43d394f868920ce to your computer and use it in GitHub Desktop.
eutils fasta xml
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-03-31T12:55:13.914098Z",
"end_time": "2020-03-31T12:55:15.114831Z"
},
"trusted": true
},
"cell_type": "code",
"source": "import pandas as pd\n\nimport requests\nfrom lxml import objectify\n\nimport time\nfrom tenacity import retry\nfrom tenacity.stop import stop_after_attempt\nfrom tenacity.wait import wait_fixed\n",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-03-31T12:55:15.116276Z",
"end_time": "2020-03-31T12:55:15.326057Z"
},
"trusted": true
},
"cell_type": "code",
"source": "search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'\nsearch_term = \"coi[Gene] OR cox1[Gene] OR co1[Gene]\"\n\nsearch_params = {'term': search_term,\n 'db': 'nuccore',\n 'retmode': 'json',\n 'retmax': 10,\n 'idtype':'acc',\n 'usehistory': 'y'}\n\nr = requests.post(search_url, data=search_params)\nsearch_results = r.json()\nresult_count = int(search_results['esearchresult']['count'])\nsearch_results",
"execution_count": 2,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 2,
"data": {
"text/plain": "{'header': {'type': 'esearch', 'version': '0.3'},\n 'esearchresult': {'count': '3542369',\n 'retmax': '10',\n 'retstart': '0',\n 'querykey': '1',\n 'webenv': 'NCID_1_87154153_130.14.18.48_9001_1585659315_335741641_0MetA0_S_MegaStore',\n 'idlist': ['KY430860.2',\n 'KY430858.2',\n 'KY430816.2',\n 'MN317567.1',\n 'NC_046603.1',\n 'NC_046596.1',\n 'NC_046595.1',\n 'NC_046594.1',\n 'NC_046592.1',\n 'NC_046591.1'],\n 'translationset': [],\n 'translationstack': [{'term': 'coi[Gene]',\n 'field': 'Gene',\n 'count': '3270152',\n 'explode': 'N'},\n {'term': 'cox1[Gene]', 'field': 'Gene', 'count': '221561', 'explode': 'N'},\n 'OR',\n {'term': 'co1[Gene]', 'field': 'Gene', 'count': '56118', 'explode': 'N'},\n 'OR'],\n 'querytranslation': 'coi[Gene] OR cox1[Gene] OR co1[Gene]'}}"
},
"metadata": {}
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-03-31T12:55:15.327879Z",
"end_time": "2020-03-31T12:55:15.331651Z"
},
"trusted": true
},
"cell_type": "code",
"source": "query_key = search_results['esearchresult']['querykey']\nweb_env = search_results['esearchresult']['webenv']\nprint(query_key, web_env)",
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": "1 NCID_1_87154153_130.14.18.48_9001_1585659315_335741641_0MetA0_S_MegaStore\n",
"name": "stdout"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-03-31T12:55:15.333739Z",
"end_time": "2020-03-31T12:55:15.470108Z"
},
"trusted": true
},
"cell_type": "code",
"source": "efetch = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi\"\nparams = {'db': 'nuccore',\n 'rettype': 'fasta',\n 'retmode': 'xml',\n 'query_key': query_key,\n 'WebEnv': web_env,\n 'retmax':3}\nr = requests.post(efetch, data=params)\nxml_results = objectify.fromstring(r.content)",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2020-03-31T12:55:15.472228Z",
"end_time": "2020-03-31T12:55:15.477416Z"
},
"trusted": true
},
"cell_type": "code",
"source": "print(objectify.dump(xml_results))",
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": "TSeqSet = None [ObjectifiedElement]\n TSeq = None [ObjectifiedElement]\n TSeq_seqtype = '' [StringElement]\n * value = 'nucleotide'\n TSeq_accver = 'KY430860.2' [StringElement]\n TSeq_taxid = 696725 [IntElement]\n TSeq_orgname = 'Hedychrum nobile' [StringElement]\n TSeq_defline = 'Hedychrum nobile voucher ZFMK-TIS-29687 cytochrome oxidase subunit 1 (COI) gene, partial cds; mitochondrial' [StringElement]\n TSeq_length = 675 [IntElement]\n TSeq_sequence = 'ATATTATATTTTTTATTTGGAATATGATCAGGAGTTTTAGGTTCTTCATTAAGAATAATTATTCGATTAGAGTTGGGATTTTCAGGTTCATTAATTAAAGATGATCAATTTTATAATAGAGTAATTACAATACATGCATTTGTGATAATTTTTTTTATAGTTATACCTTTTATGATTGGTGGATTTGGAAATTGATTAATTCCTYTAATATTAGGTGCTCCTGATATAGCTTATCCTCGAATAAATAATATAAGGTTTTGATTATTACCTCCATCAATTTTATTTTTATTATTGAGTGGTTTTGTTAGATCGGGTTGTGGGACTGGGTGGACTGTATATCCTCCTCTCTCTTCATTATTAGGTCATTCAGGAATTAGGGTTGATTTAGCAATTTTTTCTTTGCATATTGCTGGGGTTTCGTCAATTATAGGGGCAGTAAATTTTATTTCTACTATTATAAATATACGTAGATTTTCATTAAAGATAGATCAATTAACTTTATTAACATGGTCAATTATTATTACTGCAATTTTATTATTACTATCTCTTCCAGTTTTAGCTGGTGCAATTACTATATTATTAACTGATCGAAATTTTAATACTTCATTTTTTGATCCTATGGGAGGTGGGGACCCAATTCTTTATCAACATTTATTTTGATTTTTTGGTCACCCA' [StringElement]\n TSeq = None [ObjectifiedElement]\n TSeq_seqtype = '' [StringElement]\n * value = 'nucleotide'\n TSeq_accver = 'KY430858.2' [StringElement]\n TSeq_taxid = 212593 [IntElement]\n TSeq_orgname = 'Hedychrum rutilans' [StringElement]\n TSeq_defline = 'Hedychrum rutilans voucher ZFMK-TIS-29801 cytochrome oxidase subunit 1 (COI) gene, partial cds; mitochondrial' [StringElement]\n TSeq_length = 675 [IntElement]\n TSeq_sequence = 'ATATTGTATTTTTTGTTTGGGATGTGATCAGGAATTTTAGGAGCATCTTTAAGAATAATTATTCGTTTAGAGTTGGGAAGTTCAGGGTCATTAATTAAAGATGATCAATTTTATAATAGGATTATTACTATACATGCTTTTGTAATGATTTTTTTTATAGTCATACCTTTTATGATTGGTGGATTTGGAAATTGATTAATTCCTTTAATATTAGGTGCTCCTGATATAGCTTATCCTCGAATAAATAATATGAGATTTTGATTATTACCTCCTTCTATTTTATTTTTATTACTTAGAGGATTTGTGAGATCTGGATGTGGAACTGGATGAACTGTTTACCCTCCTTTATCCTCATTACTTGGTCACTCAGGGATGAGGGTTGATTTAGCAATTTTTTCTTTGCATATTGCTGGGGCTTCTTCAATTATAGGTGCTGTAAATTTTATTTCTACAATTATAAATATACGAAGGGTTTCACTAAAAATAGATCAGTTAACATTATTGATTTGATCAATTATAATTACTGCAATTTTATTATTATTATCTCTTCCTGTATTGGCTGGTGCAATTACTATATTACTGACTGATCGAAATTTTAATACTTCATTTTTTGATCCAATAGGAGGGGGTGATCCAATTCTTTATCAACATTTATTTTGATTTTTTGGTCATCCA' [StringElement]\n TSeq = None [ObjectifiedElement]\n TSeq_seqtype = '' [StringElement]\n * value = 'nucleotide'\n TSeq_accver = 'KY430816.2' [StringElement]\n TSeq_taxid = 212606 [IntElement]\n TSeq_orgname = 'Parnopes grandior' [StringElement]\n TSeq_defline = 'Parnopes grandior voucher ZFMK-TIS-29757 cytochrome oxidase subunit 1 (COI) gene, partial cds; mitochondrial' [StringElement]\n TSeq_length = 444 [IntElement]\n TSeq_sequence = 'TATCCTCGAATAAATAATATGAGATTTTGACTTTTACCACCTTCTTTATTAATATTGTTATTAAGAAGAATTGTAGGGGGAGGGGTAGGAACAGGGTGAACAGTGTATCCTCCTTTATCATTATTAAGAGGCCACTCAAGAATAAGTGTGGATTTTGGAATTTTTTCTCTTCATATTGCGGGAGTATCTTCTATTATAGGAGCAATTAACTTTATTTCAACAGTTGGGAATATTAAAAGTAAAAGATTAAAAACTGAACAATTAACTTTATTAGTATGATCAATTTTTATTACAGCAATTTTATTACTTTTGTCATTACCAGTTTTGGCAGGAGCTATTACTATATTATTAAGAGATCGAAATTTAAATACTTCATTTTTTGATCCTGTGGGAGGGGGAGATCCAGTTTTATATCAGCATTTATTTTGATTTTTTGGTCATCCT' [StringElement]\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.7.1",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "eutils fasta xml",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment