Last active
December 14, 2015 13:39
-
-
Save gregcaporaso/5095020 to your computer and use it in GitHub Desktop.
This is some really ancient code of mine that I'm reviving to generate some data for a figure.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import re
from sys import argv
from time import sleep

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
# Module metadata (authorship, licensing and release status).
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2013, Greg Caporaso"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "0.0.0-dev"
__maintainer__ = "Greg Caporaso"
# NOTE(review): this value looks like an e-mail-protection placeholder rather
# than a real address -- confirm and restore the maintainer's address.
__email__ = "[email protected]"
__status__ = "Development"
class PubMedXmlRetriever(object):
    """ A class to retrieve Medline XML for PMIDs

     Instances are callable: pass a list of PMIDs and get back a dict
     mapping each PMID to the lines of the XML returned for it.
    """

    # efetch endpoint; the query string is appended right after the "?".
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    # Fixed query parameters sent with every request.
    parameters = {'db': 'pubmed', 'retmode': 'xml'}
    # The time to sleep between calls to NCBI -- they require that this be
    # at least 3 seconds.
    _sleep_time = 3.0

    def __init__(self):
        """ Initialize the class """
        pass

    def __call__(self, pmids):
        """ Retrieve the Medline XML for pmids

         pmids: a list of pubmed identifiers (strings or ints)

         For each pmid in pmids retrieve the Medline XML, and
         return a dictionary of PMID mapped to the XML, stored as a
         list of text lines with surrounding whitespace stripped.
         A PMID that occurs more than once in pmids is fetched once.
        """
        results = {}
        for pmid in pmids:
            if pmid in results:
                # Already fetched: avoid a redundant request and sleep.
                continue
            response = urlopen(self._construct_url(pmid))
            try:
                # Iterating the response yields one line at a time.
                results[pmid] = [self._to_text(line).strip()
                                 for line in response]
            finally:
                # Always release the connection, even if reading fails.
                response.close()
            # Sleep after every request (including the last one) so that
            # back-to-back calls of this object are spaced out as well.
            sleep(self._sleep_time)
        return results

    @staticmethod
    def _to_text(line):
        """ Return line as a native string

         Python 3's urlopen yields bytes, which are decoded here as
         UTF-8 (assumed encoding of the efetch XML -- TODO confirm).
         Values that already are native strings pass through unchanged.
        """
        if isinstance(line, str):
            return line
        return line.decode('utf-8')

    def _construct_url(self, pmid):
        """ Construct the url string for a single pmid

         pmid: a pubmed identifier (string or int)

         The url is base_url, followed by "key=value&" for every entry
         of parameters with a non-empty value, followed by "id=<pmid>".
        """
        result = [self.base_url]
        for key, value in self.parameters.items():
            if value:
                result.append(''.join([key, "=", value, "&"]))
        # str() so that integer identifiers are accepted too.
        result.append(''.join(["id=", str(pmid)]))
        return ''.join(result)
# Matches one <AbstractText ...>...</AbstractText> element. The opening tag
# may carry attributes (structured abstracts use e.g. Label="METHODS");
# the lookbehind rejects self-closing tags, which have no text to capture.
_ABSTRACT_TEXT_RE = re.compile(
    r'<AbstractText(?:\s[^>]*)?(?<!/)>(.*?)</AbstractText>', re.DOTALL)


def retrieve_abstracts(pmids, retriever=None):
    """ Given a list of pmids, return a dict mapping pmids to abstracts

     pmids: a list of pubmed identifiers
     retriever: optional callable taking the list of pmids and returning
      a dict of pmid mapped to a list of XML lines; when omitted a new
      PubMedXmlRetriever is used

     Abstracts will be stored as a single string with no newlines. When a
     record has several AbstractText sections they are joined with single
     spaces, in document order. PMIDs whose XML contains no AbstractText
     element are mapped to None.
    """
    if retriever is None:
        retriever = PubMedXmlRetriever()
    abstracts = {}
    for pmid, xml in retriever(pmids).items():
        # Convert the list of lines into a single string
        raw_text = ' '.join(xml)
        sections = _ABSTRACT_TEXT_RE.findall(raw_text)
        # If there are no AbstractText elements, associate None with the pmid
        abstracts[pmid] = ' '.join(sections) if sections else None
    return abstracts
# Command-line entry point:
#   query_pubmed.py input_pmids_fp output_dir
# Reads one PMID per line from input_pmids_fp and writes the XML retrieved
# for each PMID to output_dir/<pmid>.xml (lines joined by single spaces).
if __name__ == "__main__":
    if len(argv) != 3:
        # Call syntax is valid on both Python 2 and Python 3.
        raise ValueError("Usage: query_pubmed.py input_pmids_fp output_dir")
    _, input_fp, output_dir = argv
    with open(input_fp) as pmid_file:
        # splitlines() copes with \n, \r\n and \r line endings, replacing
        # the 'U' open mode that newer Python versions no longer accept.
        stripped = [l.strip() for l in pmid_file.read().splitlines()]
    # Blank lines would otherwise be sent to NCBI as empty identifiers.
    pmids = [pmid for pmid in stripped if pmid]
    results = PubMedXmlRetriever()(pmids)
    for pmid, xml in results.items():
        with open('%s/%s.xml' % (output_dir, pmid), 'w') as xml_file:
            xml_file.write(' '.join(xml))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment