gregcaporaso · December 14, 2015 13:39
diff --git a/query_pubmed.py b/query_pubmed.py
 #!/usr/bin/python

 from urllib import urlopen
 from time import sleep
 from sys import argv

 __author__ = "Greg Caporaso"
 __copyright__ = "Copyright 2013, Greg Caporaso" 
 __credits__ = ["Greg Caporaso"]
 __license__ = "GPL"
 __version__ = "0.0.0-dev"
 __maintainer__ = "Greg Caporaso"
 __email__ = "[email protected]"
 __status__ = "Development"

 class PubMedXmlRetriever(object):
    """ A class to retrieve Medline XML for PMIDs

        Instances are callable: pass a list of PMID strings and get back a
            dictionary mapping each PMID to its XML as a list of lines.
    """

    # Every request url starts with this; the query string is appended
    # directly after the trailing '?'.
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    # Query parameters sent with every request, in addition to the id.
    parameters = {'db':'pubmed','retmode':'xml'}
    # The time to sleep between calls to NCBI -- they require that this be
    # at least 3 seconds.
    _sleep_time = 3.0

    def __init__(self):
        """ Initialize the class """
        pass

    def __call__(self,pmids):
        """ Retrieve the Medline XML for pmids

            pmids: a list of pubmed identifiers as strings

            For each pmid in pmids retrieve the Medline XML, and
                return a dictionary of PMID mapped to the XML. The XML is
                stored as a list of lines with surrounding whitespace
                stripped from each line.
        """
        results = {}
        for pmid in pmids:
            # For the current pmid, build the url and open it. Iterating
            # over the response yields one line at a time.
            response = urlopen(self._construct_url(pmid))
            try:
                results[pmid] = [line.strip() for line in response]
            finally:
                # Release the connection even if reading the response fails.
                response.close()
            # sleep before the next hit (this also runs after the last pmid)
            sleep(self._sleep_time)
        return results

    def _construct_url(self,pmid):
        """ Construct the url string used to fetch a single record

            pmid: a string containing a valid pubmed id

            Returns base_url followed by a 'key=value&' pair for every
                entry in parameters whose value is non-empty, and then
                'id=' followed by pmid.
        """
        result = [self.base_url]
        for key,value in self.parameters.items():
            # Parameters with an empty value are left out of the url.
            if value:
                result.append(''.join([key,"=",value,"&"]))
        result.append(''.join(["id=",pmid]))

        return ''.join(result)

 def retrieve_abstracts(pmids):
    """ Given a list of pmids, return a dict mapping pmids to abstracts

        pmids: a list of pubmed identifiers as strings

        Abstracts will be stored as a single string with no newlines.
        Opening <AbstractText ...> tags that carry attributes are matched
            as well as the bare <AbstractText> tag, and when a record
            contains several AbstractText elements their contents are
            joined with single spaces, in document order.
        A pmid whose XML contains no complete AbstractText element is
            mapped to None.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    import re

    # Each opening tag is paired with the first closing tag that follows it.
    # The text searched below never contains newlines, so '.' is sufficient.
    abstract_re = re.compile(
        r'<AbstractText(?:\s[^>]*)?>(.*?)</AbstractText>')

    abstracts = {}
    for pmid, xml in PubMedXmlRetriever()(pmids).items():
        # Convert the list of lines into a single string
        raw_text = ' '.join(xml)
        sections = abstract_re.findall(raw_text)
        if sections:
            abstracts[pmid] = ' '.join(sections)
        else:
            # If there are no AbstractText elements, associate None with
            # the pmid
            abstracts[pmid] = None

    return abstracts

 if __name__ == "__main__":
    # Command-line entry point: read one pmid per line from input_pmids_fp,
    # retrieve the Medline XML for each, and write it to
    # output_dir/<pmid>.xml.
    if len(argv) != 3:
        # The call form of raise is accepted by both Python 2 and Python 3.
        raise ValueError("Usage: query_pubmed.py input_pmids_fp output_dir")
    _, input_fp, output_dir = argv

    # Close the input file as soon as it has been read, and skip blank
    # lines so that they do not trigger a request with an empty id.
    with open(input_fp,'U') as input_f:
        pmids = [l.strip() for l in input_f if l.strip()]
    results = PubMedXmlRetriever()(pmids)
    for pmid, xml in results.items():
        # The XML lines are joined with spaces, so each output file holds
        # the whole record on a single line.
        with open('%s/%s.xml' % (output_dir,pmid),'w') as output_f:
            output_f.write(' '.join(xml))
	#!/usr/bin/python

	from urllib import urlopen
	from time import sleep
	from sys import argv

	__author__ = "Greg Caporaso"
	__copyright__ = "Copyright 2013, Greg Caporaso"
	__credits__ = ["Greg Caporaso"]
	__license__ = "GPL"
	__version__ = "0.0.0-dev"
	__maintainer__ = "Greg Caporaso"
	__email__ = "[email protected]"
	__status__ = "Development"

	class PubMedXmlRetriever(object):
	    """ A class to retrieve Medline XML for PMIDs

	        Instances are callable: pass a list of PMID strings and get back a
	            dictionary mapping each PMID to its XML as a list of lines.
	    """

	    # Every request url starts with this; the query string is appended
	    # directly after the trailing '?'.
	    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
	    # Query parameters sent with every request, in addition to the id.
	    parameters = {'db':'pubmed','retmode':'xml'}
	    # The time to sleep between calls to NCBI -- they require that this be
	    # at least 3 seconds.
	    _sleep_time = 3.0

	    def __init__(self):
	        """ Initialize the class """
	        pass

	    def __call__(self,pmids):
	        """ Retrieve the Medline XML for pmids

	            pmids: a list of pubmed identifiers as strings

	            For each pmid in pmids retrieve the Medline XML, and
	                return a dictionary of PMID mapped to the XML. The XML is
	                stored as a list of lines with surrounding whitespace
	                stripped from each line.
	        """
	        results = {}
	        for pmid in pmids:
	            # For the current pmid, build the url and open it. Iterating
	            # over the response yields one line at a time.
	            response = urlopen(self._construct_url(pmid))
	            try:
	                results[pmid] = [line.strip() for line in response]
	            finally:
	                # Release the connection even if reading the response fails.
	                response.close()
	            # sleep before the next hit (this also runs after the last pmid)
	            sleep(self._sleep_time)
	        return results

	    def _construct_url(self,pmid):
	        """ Construct the url string used to fetch a single record

	            pmid: a string containing a valid pubmed id

	            Returns base_url followed by a 'key=value&' pair for every
	                entry in parameters whose value is non-empty, and then
	                'id=' followed by pmid.
	        """
	        result = [self.base_url]
	        for key,value in self.parameters.items():
	            # Parameters with an empty value are left out of the url.
	            if value:
	                result.append(''.join([key,"=",value,"&"]))
	        result.append(''.join(["id=",pmid]))

	        return ''.join(result)

	def retrieve_abstracts(pmids):
	    """ Given a list of pmids, return a dict mapping pmids to abstracts

	        pmids: a list of pubmed identifiers as strings

	        Abstracts will be stored as a single string with no newlines.
	        Opening <AbstractText ...> tags that carry attributes are matched
	            as well as the bare <AbstractText> tag, and when a record
	            contains several AbstractText elements their contents are
	            joined with single spaces, in document order.
	        A pmid whose XML contains no complete AbstractText element is
	            mapped to None.
	    """
	    # Imported locally so this function does not depend on a module-level
	    # import being present.
	    import re

	    # Each opening tag is paired with the first closing tag that follows it.
	    # The text searched below never contains newlines, so '.' is sufficient.
	    abstract_re = re.compile(
	        r'<AbstractText(?:\s[^>]*)?>(.*?)</AbstractText>')

	    abstracts = {}
	    for pmid, xml in PubMedXmlRetriever()(pmids).items():
	        # Convert the list of lines into a single string
	        raw_text = ' '.join(xml)
	        sections = abstract_re.findall(raw_text)
	        if sections:
	            abstracts[pmid] = ' '.join(sections)
	        else:
	            # If there are no AbstractText elements, associate None with
	            # the pmid
	            abstracts[pmid] = None

	    return abstracts

	if __name__ == "__main__":
	    # Command-line entry point: read one pmid per line from input_pmids_fp,
	    # retrieve the Medline XML for each, and write it to
	    # output_dir/<pmid>.xml.
	    if len(argv) != 3:
	        # The call form of raise is accepted by both Python 2 and Python 3.
	        raise ValueError("Usage: query_pubmed.py input_pmids_fp output_dir")
	    _, input_fp, output_dir = argv

	    # Close the input file as soon as it has been read, and skip blank
	    # lines so that they do not trigger a request with an empty id.
	    with open(input_fp,'U') as input_f:
	        pmids = [l.strip() for l in input_f if l.strip()]
	    results = PubMedXmlRetriever()(pmids)
	    for pmid, xml in results.items():
	        # The XML lines are joined with spaces, so each output file holds
	        # the whole record on a single line.
	        with open('%s/%s.xml' % (output_dir,pmid),'w') as output_f:
	            output_f.write(' '.join(xml))