Skip to content

Instantly share code, notes, and snippets.

@nhoffman
Created October 31, 2013 16:46
Show Gist options
  • Save nhoffman/7252970 to your computer and use it in GitHub Desktop.
Save nhoffman/7252970 to your computer and use it in GitHub Desktop.
Retrieve and format PubMed citations
#!/usr/bin/env python
# see http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
import sys
import urllib
import os
import xml.dom.minidom
import re
import urllib2
from operator import itemgetter
from optparse import OptionParser
# see http://www.xml.com/pub/a/2005/05/18/unicode.html
import codecs
# enc and dec are the encode and decode callables of the UTF-8 codec (the
# first two items of the tuple returned by codecs.lookup). Calling enc(text)
# returns a (encoded_bytes, length_consumed) pair.
enc, dec = codecs.lookup('UTF-8')[:2]
def params(**args):
    """Collect the keyword arguments into a dictionary and return it."""
    collected = dict(args)
    return collected
def send_web_query(url):
    """Fetch *url* and return the raw response body.

    The response handle is closed explicitly, even when reading fails,
    instead of being left for the garbage collector.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()
def to_text(node):
    """Return all character data found beneath *node* as one string.

    Text and CDATA children are concatenated in document order, and child
    elements are descended into recursively, so inline markup such as
    ``<i>``, ``<sub>`` or ``<sup>`` inside a title or abstract contributes
    its text instead of being silently dropped.
    """
    pieces = []
    for child in node.childNodes:
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
            pieces.append(child.nodeValue)
        elif child.nodeType == child.ELEMENT_NODE:
            pieces.append(to_text(child))
    return ''.join(pieces)
def get_node_vals(doc, nodename):
    """Return the cast text of every *nodename* element beneath *doc*.

    The result is None when no element matches, the single value when
    exactly one matches, and a list of values otherwise.
    """
    found = []
    for element in doc.getElementsByTagName(nodename):
        found.append(cast(to_text(element)))
    if not found:
        return None
    if len(found) == 1:
        return found[0]
    return found
def get_document_object(target):
    """Parse *target* into a minidom document.

    *target* may be a filename, an open file-like object, or a string
    holding an XML document. Existing filenames and file-like objects go
    to xml.dom.minidom.parse; any other string goes to
    xml.dom.minidom.parseString.
    """
    is_xml_string = False
    try:
        # A string that does not name an existing file is taken to be XML.
        is_xml_string = not os.access(target, os.F_OK)
    except TypeError:
        # os.access rejected the argument: perhaps an open file object.
        pass
    if is_xml_string:
        return xml.dom.minidom.parseString(target)
    return xml.dom.minidom.parse(target)
def cast(instr):
    """Convert *instr* to an int or a float when it looks like a number.

    Returns the int value if ``int(instr)`` succeeds, otherwise the float
    value if ``float(instr)`` succeeds and is finite, otherwise the
    whitespace-stripped string.

    Non-finite floats are rejected on purpose: ``float()`` accepts
    spellings such as "nan", "inf" and "infinity", which would otherwise
    turn ordinary words (for example the surname "Nan") into numbers.
    """
    # Local import: math is not among the module-level imports of this file.
    import math

    try:
        return int(instr)
    except ValueError:
        pass
    try:
        number = float(instr)
    except ValueError:
        return instr.strip()
    if math.isnan(number) or math.isinf(number):
        return instr.strip()
    return number
def get_author(authornode):
    """Return a display name for one Author element.

    The name is "LastName Initials". A missing part is left out rather
    than rendered as the text "None". When neither part is present, the
    CollectiveName element (used for group authors) is used instead. An
    empty string is returned when none of the three elements is found.
    """
    lastname = get_node_vals(authornode, 'LastName')
    initials = get_node_vals(authornode, 'Initials')
    parts = [part for part in (lastname, initials) if part is not None]
    if not parts:
        collective = get_node_vals(authornode, 'CollectiveName')
        if collective is not None:
            parts = [collective]
    # '%s' % (part,) keeps unicode values as unicode under Python 2.
    return ' '.join(['%s' % (part,) for part in parts])
def get_all_authors(authors):
    """Return every author name joined into one string.

    A single author is returned as-is. Several authors are joined with
    ", " and the last one is introduced with ", and ". An empty author
    list yields an empty string instead of raising IndexError.
    """
    names = [get_author(a) for a in authors]
    if not names:
        return ''
    if len(names) == 1:
        return names[0]
    return ', '.join(names[:-1]) + ', and ' + names[-1]
def get_short_authors(authors):
    """Return an abbreviated author string.

    One author gives the name, two authors give "A and B", and more than
    two give "A, et al". An empty author list yields an empty string; the
    previous implicit None would be rendered as "None" in citations.
    """
    count = len(authors)
    if count == 0:
        return ''
    if count == 1:
        return get_author(authors[0])
    if count == 2:
        return ' and '.join([get_author(a) for a in authors])
    return '%s, et al' % get_author(authors[0])
def get_first_author(authors):
    """Return the display name of the first author.

    An empty author list yields an empty string instead of raising
    IndexError.
    """
    if not authors:
        return ''
    return get_author(authors[0])
def get_citation_data(doc):
    """Extract the citation fields of one article element into a dict.

    Keys produced: authors_long, first_author, first_author_name,
    authors_short, title, pages, firstpage, journal, year, revyear,
    volume, pmid and abstract.

    Raises ValueError when the article contains no PMID element.
    """
    d = {}

    # authors: an article without Author elements gets empty strings
    # rather than an IndexError.
    authors = doc.getElementsByTagName('Author')
    if authors:
        d['authors_long'] = get_all_authors(authors)
        d['first_author'] = get_first_author(authors)
        d['first_author_name'] = get_node_vals(authors[0], 'LastName')
        d['authors_short'] = get_short_authors(authors)
    else:
        for key in ('authors_long', 'first_author',
                    'first_author_name', 'authors_short'):
            d[key] = ''

    d['title'] = get_node_vals(doc, 'ArticleTitle')
    d['pages'] = get_node_vals(doc, 'MedlinePgn')
    try:
        d['firstpage'] = d['pages'].split('-')[0]
    except AttributeError:
        # pages is not a string (a number or None): use it unchanged.
        d['firstpage'] = d['pages']

    journal_info = doc.getElementsByTagName('Journal')[0]
    d['journal'] = (get_node_vals(journal_info, 'ISOAbbreviation')
                    or get_node_vals(journal_info, 'Title'))
    d['year'] = (get_node_vals(journal_info, 'Year')
                 or get_node_vals(journal_info, 'MedlineDate'))

    # revyear is the negated four-digit year, so an ascending sort on it
    # lists the newest references first. It is 0 when no four-digit year
    # can be found.
    year_digits = re.findall(r'\d{4}', repr(d['year']))
    d['revyear'] = -int(year_digits[0]) if year_digits else 0
    d['volume'] = get_node_vals(journal_info, 'Volume')

    # Several PMID elements may be present; the first one is used.
    pmid = get_node_vals(doc, 'PMID')
    if isinstance(pmid, list):
        pmid = pmid[0]
    if pmid is None:
        raise ValueError('article contains no PMID element')
    d['pmid'] = str(pmid)

    # Several AbstractText elements are joined into a single string so
    # that the abstract is not rendered as a list repr.
    abstract = get_node_vals(doc, 'AbstractText')
    if isinstance(abstract, list):
        abstract = ' '.join(['%s' % (part,) for part in abstract])
    d['abstract'] = abstract

    # handle unicode: encode text values to UTF-8 byte strings (Python 2).
    for k, v in list(d.items()):
        if isinstance(v, unicode):
            d[k] = enc(v)[0]
    return d
def check_format(format_value):
    """Exit with an error unless *format_value* names a format function.

    Valid values are the suffixes of the module-level names starting with
    "format_". On an invalid value the list of valid values is written to
    stderr and the process exits with status 1, so the message cannot mix
    into citation output on stdout and callers can detect the failure.
    """
    prefix = 'format_'
    ok_formats = [name[len(prefix):]
                  for name in list(globals())
                  if name.startswith(prefix)]
    # Numeric suffixes first, in numeric order; anything else afterwards.
    ok_formats.sort(key=lambda s: (not s.isdigit(),
                                   int(s) if s.isdigit() else 0,
                                   s))
    if format_value not in ok_formats:
        sys.stderr.write(
            'format must have one of the following values: %s\n'
            % ' '.join(ok_formats))
        sys.exit(1)
def show_all_formats(xmlstr, showkeys=True):
    """Print the first article of *xmlstr* rendered in every format.

    Format functions are looked up as format_1, format_2, ... and the
    listing stops at the first missing number. Each entry shows the format
    number, the function's docstring and the rendered citation. When
    *showkeys* is true the raw citation dictionary is printed as well.
    """
    doc = get_document_object(xmlstr)
    articles = doc.getElementsByTagName('PubmedArticle')
    if not articles:
        sys.stderr.write('no PubmedArticle element found in the XML\n')
        return
    d = get_citation_data(articles[0])
    # main() adds these keys after its PMCID lookup. Supplying defaults
    # here lets formats that use them be shown instead of raising KeyError.
    d.setdefault('pmcid', None)
    d.setdefault('pmcid+', '')

    number = 1
    while number < 1000:
        # Only a missing format function ends the listing; a KeyError
        # raised inside a format function is no longer swallowed.
        fun = globals().get('format_%s' % number)
        if fun is None:
            break
        # Each format gets its own copy so that one format cannot alter
        # the data seen by the next.
        print('%s) %s\n%s\n' % (number, fun.__doc__, formatter(fun, dict(d))))
        number += 1

    if showkeys:
        print('Raw citation data')
        for k in sorted(d):
            print('( %s ) %s' % (k, d[k]))
def fetch_refs(pmid_list):
    """Download the PubMed records for *pmid_list* and return the XML.

    *pmid_list* is an iterable of PubMed ids; each is converted with
    str(), so integer ids are accepted as well as strings. The request
    goes to the NCBI EFetch endpoint over HTTPS.
    """
    baseurl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
    searchstr = urllib.urlencode(params(
        db='pubmed',
        id=','.join([str(pmid) for pmid in pmid_list]),
        retmode='xml',
        rettype='medline'))
    return send_web_query(baseurl + searchstr)
def format_1(citation_data):
    """Long format"""
    # The docstring is printed as this format's label by show_all_formats.
    template = ('%(authors_long)s. %(title)s %(journal)s. '
                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
    return template % citation_data
def format_2(citation_data):
    """Abbreviated authors"""
    # The docstring is printed as this format's label by show_all_formats,
    # so its spelling is user-visible.
    template = ('%(authors_short)s. %(title)s %(journal)s. '
                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
    return template % citation_data
def format_3(citation_data):
    """Brief"""
    # Journal, year, volume and first page only, followed by the PMID.
    template = ('%(journal)s %(year)s, %(volume)s:%(firstpage)s. '
                'PMID %(pmid)s')
    return template % citation_data
def format_4(citation_data):
    """Format tag"""
    # Build "name_year_pmid", then turn every run of whitespace into a
    # single underscore.
    tag = '%(first_author_name)s_%(year)s_%(pmid)s' % citation_data
    words = tag.split()
    return '_'.join(words)
def format_5(citation_data):
    """Brief with first author"""
    # Same fields as the brief format, prefixed with the first author.
    template = ('%(first_author_name)s, et al. %(journal)s %(year)s, '
                '%(volume)s:%(firstpage)s. PMID %(pmid)s')
    return template % citation_data
def format_6(citation_data):
    """thebibliography entry for latex (for single reference)"""
    # One template line per output line: a \cite command followed by a
    # complete thebibliography environment holding the single entry.
    template_lines = (
        r'\cite{%(pmid)s}',
        r'\begin{thebibliography}{99}',
        r'\bibitem{%(pmid)s} %(authors_short)s. %(title)s %(journal)s. '
        r'%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s',
        r'\end{thebibliography}',
    )
    return '\n'.join(template_lines) % citation_data
def format_7(citation_data):
    """thebibliography entry for latex (for multiple references)"""
    # Only the \bibitem line; the surrounding environment is left to the
    # caller.
    template = (r'\bibitem{%(pmid)s} %(authors_short)s. %(title)s %(journal)s. '
                r'%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
    return template % citation_data
def format_8(citation_data):
    """Format for refitem custom latex environment"""
    # Five brace-delimited arguments: authors, title, journal,
    # year/volume/pages and PMID.
    template = (r'\refitem{%(authors_long)s.}{%(title)s}{%(journal)s.}'
                r'{%(year)s, %(volume)s:%(pages)s.}{PMID %(pmid)s}')
    return template % citation_data
def format_9(citation_data):
    """Abbreviated authors with abstract"""
    # The docstring is printed as this format's label by show_all_formats,
    # so its spelling is user-visible.
    template = ('%(authors_short)s. %(title)s %(journal)s. '
                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s \n\n'
                '%(abstract)s')
    return template % citation_data
def format_10(citation_data):
    """MoinMoin numbered list"""
    # The leading space before "1." is part of the template. The rendered
    # line is passed through escape_camelcase before being returned.
    template = (r' 1. %(authors_long)s. %(title)s %(journal)s. '
                r'%(year)s, %(volume)s:%(pages)s. '
                r'[[http://www.ncbi.nlm.nih.gov/pubmed/%(pmid)s|%(pmid)s]]')
    return escape_camelcase(template % citation_data)
def format_11(citation_data):
    """Long format with me in bold html tags"""
    # Work on a copy: the caller's dict is reused for other formats and
    # other calls, so the <b> tags must not be written back into it.
    data = dict(citation_data)
    data['authors_long'] = re.sub(
        r'(Hoffman NG?)', r'<b>\1</b>', data['authors_long'])
    template = (r'<li>%(authors_long)s. %(title)s %(journal)s. '
                r'%(year)s, %(volume)s:%(pages)s. %(pmcid+)s</li>')
    return template % data
def escape_camelcase(instr):
    """Prefix every mixed-case word in *instr* that starts with a capital with '!'.

    A word matches when it starts with an uppercase letter, consists of
    letters only, and contains both a lowercase letter and a further
    uppercase letter after the first character.
    """
    camelcase_word = (r'\b[A-Z][a-zA-Z]*'
                      r'(?:[a-z][a-zA-Z]*[A-Z]|[A-Z][a-zA-Z]*[a-z])'
                      r'[a-zA-Z]*\b')
    return re.sub(camelcase_word, r'!\g<0>', instr)
def formatter(format_fun, d):
    """Render *d* with *format_fun* and collapse runs of periods.

    Every run of two to ten consecutive periods in the rendered text is
    replaced by a single period.
    """
    rendered = format_fun(d)
    return re.sub(r'\.{2,10}', '.', rendered)
def get_stdin(outval='file'):
    """Return piped or redirected standard input, or None if there is none.

    With outval='file' the open stdin file object is returned; with
    outval='string' its whole content is read and returned stripped of
    surrounding whitespace. None is returned when stdin is an interactive
    terminal, is closed or missing, or when *outval* is neither value.

    Interactivity is decided with isatty() rather than by probing tell():
    tell() succeeds on input redirected from a regular file, which made
    such input be ignored.
    """
    stream = sys.__stdin__
    if stream is None:
        return None
    try:
        interactive = stream.isatty()
    except ValueError as msg:
        # Raised when the stream has been closed.
        print('Stdin is closed: %s' % (msg,))
        return None
    if interactive:
        return None
    if outval == 'file':
        return stream
    if outval == 'string':
        return stream.read().strip()
    return None
def _build_option_parser():
    """Create the OptionParser that describes the command-line interface."""
    usage = """%prog [options] PMID's
Format a set of references retrieved from pubmed by pubmed id (PMID)
Additional PMIDs (whitespace delimited) will be read from stdin if provided.""".strip()
    parser = OptionParser(
        usage=usage, version="$Id: gref.py,v 1.1 2005/12/25 23:48:10 nghoffma Exp $")
    parser.add_option("-o", "--output", dest="filename",
                      help="write output to FILE", metavar="FILE", default=None)
    parser.add_option("-f", "--format", dest="format",
                      help="choose reference format number", default='1')
    parser.add_option("-e", "--show-examples",
                      action="store_true", dest="show_examples", default=False,
                      help="show example of each reference format")
    parser.add_option("-k", "--show-keys",
                      action="store_true", dest="show_keys", default=False,
                      help="show keys into raw citation data (must be accompanied by -e)")
    parser.add_option("-x", "--xml-output", dest="xmlfile",
                      help="write downloaded xml output to FILE", metavar="FILE", default=False)
    parser.add_option("-t", "--sort-terms", dest="sort_terms",
                      help="sort multiple references according to SORT_TERMS (comma delimited list, eg 'first_author_name,year'; default: no sorting). Use -ek for list of terms.", default=None)
    return parser


def _fetch_pmcids(pmids):
    """Return a dict mapping each PMID string to its PMCID, or to None.

    The lookup is best-effort. When the service cannot be reached, a
    warning is written to stderr and an empty dict is returned, so that
    formats which do not need a PMCID still work.
    """
    pmcids = {}
    if not pmids:
        return pmcids
    url = 'http://www.pubmedcentral.nih.gov/utils/entrezpmc.cgi?' + ','.join(pmids)
    try:
        response = urllib2.urlopen(url)
    except IOError as err:
        # urllib2.URLError and HTTPError are IOError subclasses.
        sys.stderr.write('warning: PMCID lookup failed: %s\n' % (err,))
        return pmcids
    try:
        for line in response:
            fields = line.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            # Exactly three fields means "pmid <something> pmcid"; any
            # other shape is recorded as "no PMCID for this PMID".
            pmcids[fields[0]] = fields[2] if len(fields) == 3 else None
    finally:
        response.close()
    return pmcids


def main():
    """Command-line entry point: fetch, format and write citations.

    PMIDs are taken from the command line and, when present, from stdin.
    The citations are written to the file given with -o or to stdout,
    once per requested format. With -x the downloaded XML is saved too.
    """
    parser = _build_option_parser()
    (options, args) = parser.parse_args()
    html_formats = {'11'}

    if options.show_examples:
        # Prefer a gref.xml file stored next to this script; download a
        # sample record when it is not available.
        example_path = os.path.join(os.path.split(__file__)[0], 'gref.xml')
        try:
            with open(example_path) as example:
                xmlstr = example.read()
        except IOError:
            xmlstr = fetch_refs(['16284180'])
        show_all_formats(xmlstr, showkeys=options.show_keys)
        sys.exit()

    stdin = get_stdin('string')
    if stdin:
        args += stdin.split()
    if not args:
        parser.print_help()
        sys.exit()

    # Validate the formats before any network access and before the
    # output file is created or truncated.
    formats = options.format.split(',')
    for f in formats:
        check_format(f)

    xmlstr = fetch_refs(args)
    doc = get_document_object(xmlstr)
    articles = doc.getElementsByTagName('PubmedArticle')
    datalist = [get_citation_data(a) for a in articles]

    # add pmcids; a PMID missing from the lookup result gets no PMCID.
    pmcids = _fetch_pmcids([d['pmid'] for d in datalist])
    for d in datalist:
        pmcid = pmcids.get(d['pmid'])
        d['pmcid'] = pmcid
        d['pmcid+'] = 'PMCID: PMC{}'.format(pmcid) if pmcid else ''

    if options.sort_terms:
        datalist = sorted(
            datalist, key=itemgetter(*options.sort_terms.split(',')))

    # Test the parsed format names, not the raw option string, so that a
    # request such as "-f 11,1" still gets the HTML wrapper.
    wrap_html = any(name in html_formats for name in formats)

    if options.filename:
        outfile = open(options.filename, 'w')
    else:
        outfile = sys.stdout
    try:
        if wrap_html:
            outfile.write(r'<html><!--###--><body><ol>')
            outfile.write('\n')
        for d in datalist:
            for name in formats:
                # Each format gets a copy so that one format cannot alter
                # the data seen by the next.
                output = formatter(globals()['format_%s' % name], dict(d))
                outfile.write(output + '\n')
        if wrap_html:
            outfile.write(r'</ol></body><!--###--></html>')
            outfile.write('\n')
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    if options.xmlfile:
        with open(options.xmlfile, 'w') as xml_out:
            xml_out.write(xmlstr)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment