Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active December 14, 2015 13:39
Show Gist options
  • Save gregcaporaso/5095020 to your computer and use it in GitHub Desktop.
Save gregcaporaso/5095020 to your computer and use it in GitHub Desktop.
This is some really ancient code of mine that I'm reviving to generate some data for a figure.
#!/usr/bin/python
from urllib import urlopen
from time import sleep
from sys import argv
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2013, Greg Caporaso"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "0.0.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
__status__ = "Development"
class PubMedXmlRetriever(object):
    """Retrieve Medline XML records for PubMed identifiers (PMIDs).

    Instances are callable: ``PubMedXmlRetriever()(pmids)`` issues one
    request per PMID and returns the responses keyed by PMID.
    """

    # Endpoint that the query string is appended to.
    # NOTE(review): NCBI's documentation lists the https:// form of this
    # endpoint -- confirm before switching.
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    # Query parameters sent with every request; entries whose value is
    # falsy are left out of the URL.
    parameters = {'db': 'pubmed', 'retmode': 'xml'}
    # The time to sleep between calls to NCBI -- they require that this be
    # at least 3 seconds.
    _sleep_time = 3.0

    def __init__(self):
        """Initialize the retriever; there is no per-instance state."""
        pass

    def __call__(self, pmids):
        """Retrieve the Medline XML for each PMID in ``pmids``.

        pmids: an iterable of pubmed identifiers as strings

        Returns a dict mapping each PMID to the response body as a list
        of lines with leading/trailing whitespace stripped.
        """
        results = {}
        for pmid in pmids:
            response = urlopen(self._construct_url(pmid))
            try:
                # Iterating the response yields it line by line.
                results[pmid] = [line.strip() for line in response]
            finally:
                # Always release the connection, even if reading fails.
                response.close()
            # Sleep after every request (including the last one) so that
            # back-to-back invocations stay spaced out as well.
            sleep(self._sleep_time)
        return results

    def _construct_url(self, pmid):
        """Build the request URL for a single PMID.

        pmid: a string containing one pubmed identifier

        Returns ``base_url`` followed by every non-empty entry of
        ``parameters`` (in sorted key order, so the URL is deterministic)
        and finally ``id=<pmid>``.
        """
        fields = ['%s=%s' % (key, value)
                  for key, value in sorted(self.parameters.items())
                  if value]
        fields.append('id=%s' % pmid)
        return self.base_url + '&'.join(fields)
def _extract_abstract(xml_lines):
    """Return the abstract contained in ``xml_lines``, or None if absent.

    xml_lines: the XML of one record as a list of stripped lines

    Every ``<AbstractText ...>...</AbstractText>`` element is collected,
    including elements whose opening tag carries attributes, and the
    sections are joined with single spaces.
    """
    import re  # local import: the module header does not import re

    # Convert the list of lines into a single string with no newlines.
    raw_text = ' '.join(xml_lines)
    # The opening tag is either bare or followed by whitespace and
    # attributes; the non-greedy body stops at the first closing tag.
    sections = re.findall(
        r'<AbstractText(?:\s[^>]*)?>(.*?)</AbstractText>',
        raw_text,
        re.DOTALL)
    if not sections:
        # No AbstractText element in this record.
        return None
    return ' '.join(sections)


def retrieve_abstracts(pmids):
    """Given a list of pmids, return a dict mapping pmids to abstracts.

    pmids: a list of pubmed identifiers as strings

    Each abstract is stored as a single string with no newlines. A PMID
    whose XML contains no AbstractText element is mapped to None.
    """
    results = PubMedXmlRetriever()(pmids)
    return dict((pmid, _extract_abstract(xml))
                for pmid, xml in results.items())
if __name__ == "__main__":
    # Command-line entry point: read one PMID per line from input_pmids_fp
    # and write the XML of each record to output_dir/<pmid>.xml.
    if len(argv) != 3:
        # Call-style raise is valid on both Python 2 and Python 3.
        raise ValueError("Usage: query_pubmed.py input_pmids_fp output_dir")
    _, input_fp, output_dir = argv
    with open(input_fp) as input_f:
        # splitlines() accepts \n, \r\n and \r line endings, which is what
        # the former 'U' open mode provided.
        pmids = [l.strip() for l in input_f.read().splitlines()]
    # Skip blank lines so no request is made for an empty identifier.
    pmids = [pmid for pmid in pmids if pmid]
    results = PubMedXmlRetriever()(pmids)
    for pmid, xml in results.items():
        # The with-block guarantees each output file is flushed and closed.
        with open('%s/%s.xml' % (output_dir, pmid), 'w') as output_f:
            output_f.write(' '.join(xml))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment