Created
October 31, 2013 16:46
-
-
Save nhoffman/7252970 to your computer and use it in GitHub Desktop.
Retrieve and format pubmed citations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# see http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html | |
import sys | |
import urllib | |
import os | |
import xml.dom.minidom | |
import re | |
import urllib2 | |
from operator import itemgetter | |
from optparse import OptionParser | |
# see http://www.xml.com/pub/a/2005/05/18/unicode.html
import codecs

# Stateless UTF-8 codec functions. Each one takes the object to convert and
# returns a ``(converted, length_consumed)`` tuple, so callers index ``[0]``
# to get the converted value.
enc = codecs.getencoder('UTF-8')
dec = codecs.getdecoder('UTF-8')
def params(**kwargs):
    """Return the keyword arguments of the call as a plain dict.

    Lets a caller spell out query parameters as ``name=value`` pairs
    instead of writing a dict literal.
    """
    return kwargs
def send_web_query(url):
    """Fetch ``url`` and return the complete response body.

    The response handle is always closed, even when reading fails, so
    the underlying connection is not leaked.
    """
    # The function-local import keeps this working on both Python 3
    # (urllib.request) and Python 2 (urllib) without touching the
    # module-level imports.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()
def to_text(node):
    """Return the text found directly under ``node``, concatenated.

    Only immediate children whose ``nodeType`` is 3 (the DOM code for a
    text node) contribute; text nested inside child elements is not
    included. Returns an empty string when there is no such child.
    """
    text_node_type = 3
    return ''.join(
        child.nodeValue
        for child in node.childNodes
        if child.nodeType == text_node_type)
def get_node_vals(doc, nodename):
    """Return the value(s) of every ``nodename`` element below ``doc``.

    Each element's direct text is passed through ``cast``. The return
    shape depends on how many elements were found:

    * none     -> ``None``
    * one      -> that single value
    * several  -> a list of values, in document order
    """
    found = [
        cast(to_text(element))
        for element in doc.getElementsByTagName(nodename)
    ]
    if not found:
        return None
    if len(found) == 1:
        return found[0]
    return found
def get_document_object(target):
    """Parse ``target`` into an ``xml.dom.minidom`` document.

    ``target`` may be the name of an existing file, an open file-like
    object, or a string holding an XML document. Files and file-like
    objects go through ``xml.dom.minidom.parse``; any other string goes
    through ``xml.dom.minidom.parseString``.
    """
    try:
        use_file_parser = os.access(target, os.F_OK)
    except TypeError:
        # os.access() rejects objects that are not path-like, so this is
        # presumably an open file object, which parse() can read directly.
        use_file_parser = True

    if use_file_parser:
        parser_function = xml.dom.minidom.parse
    else:
        # A string that does not name an existing file: treat it as XML.
        parser_function = xml.dom.minidom.parseString
    return parser_function(target)
def cast(instr):
    """Convert ``instr`` to an int or a float when it is numeric text.

    Tries ``int`` first, then ``float``. If neither applies, returns
    ``instr`` with surrounding whitespace stripped.

    Text that ``float`` would turn into NaN or an infinity ('nan',
    'Inf', 'Infinity', ...) is returned as stripped text instead:
    in citation data such a token is a word (for example the surname
    "Nan"), not a number.
    """
    try:
        return int(instr)
    except ValueError:
        pass

    try:
        number = float(instr)
    except ValueError:
        return instr.strip()

    # NaN is the only float that differs from itself.
    is_nan = number != number
    is_infinite = number in (float('inf'), float('-inf'))
    if is_nan or is_infinite:
        return instr.strip()
    return number
def get_author(authornode):
    """Return a display name for one ``Author`` element.

    The name is the ``LastName`` value followed by the ``Initials``
    value, separated by a space. Parts that are absent are left out
    rather than rendered as the literal text "None".
    """
    lastname = get_node_vals(authornode, 'LastName')
    if lastname is None:
        # NOTE(review): group authors are expected to carry a
        # CollectiveName element instead of LastName/Initials -- confirm
        # against the PubMed XML element descriptions.
        lastname = get_node_vals(authornode, 'CollectiveName')
    initials = get_node_vals(authornode, 'Initials')

    present = [part for part in (lastname, initials) if part is not None]
    # '%s' handles values that cast() turned into numbers.
    return ' '.join('%s' % part for part in present)
def get_all_authors(authors):
    """Return every author as one string.

    A single author is returned as is; several authors are joined with
    ', ' and the last one is introduced by ', and '. An empty author
    list yields an empty string instead of raising IndexError.
    """
    names = [get_author(author) for author in authors]
    if not names:
        return ''
    if len(names) == 1:
        return names[0]
    return ', '.join(names[:-1]) + ', and ' + names[-1]
def get_short_authors(authors):
    """Return an abbreviated author string.

    * no authors        -> '' (previously an implicit ``None``, which
      the format templates rendered as the text "None")
    * one author        -> that author
    * two authors       -> 'A and B'
    * three or more     -> 'A, et al'
    """
    count = len(authors)
    if count == 0:
        return ''
    if count == 1:
        return get_author(authors[0])
    if count == 2:
        return ' and '.join([get_author(author) for author in authors])
    return '%s, et al' % get_author(authors[0])
def get_first_author(authors):
    """Return the display name of the first author.

    Returns an empty string when ``authors`` is empty instead of
    raising IndexError.
    """
    if len(authors) == 0:
        return ''
    return get_author(authors[0])
def get_citation_data(doc):
    """Extract the citation fields of one article into a dict.

    ``doc`` is a DOM node for a single article (the callers pass a
    ``PubmedArticle`` element). The returned dict has the keys
    ``authors_long``, ``first_author``, ``first_author_name``,
    ``authors_short``, ``title``, ``pages``, ``firstpage``, ``journal``,
    ``year``, ``revyear``, ``volume``, ``pmid`` and ``abstract``.

    Raises IndexError when the article has no ``Journal`` element.
    """
    d = {}

    # authors -- an article without any Author element gets empty
    # strings rather than an IndexError.
    authors = doc.getElementsByTagName('Author')
    if len(authors):
        d['authors_long'] = get_all_authors(authors)
        d['first_author'] = get_first_author(authors)
        d['first_author_name'] = get_node_vals(authors[0], 'LastName')
        d['authors_short'] = get_short_authors(authors)
    else:
        for key in ('authors_long', 'first_author',
                    'first_author_name', 'authors_short'):
            d[key] = ''

    d['title'] = get_node_vals(doc, 'ArticleTitle')

    d['pages'] = get_node_vals(doc, 'MedlinePgn')
    try:
        d['firstpage'] = d['pages'].split('-')[0]
    except AttributeError:
        # pages is None or was cast to a number: nothing to split.
        d['firstpage'] = d['pages']

    journal_info = doc.getElementsByTagName('Journal')[0]
    d['journal'] = (get_node_vals(journal_info, 'ISOAbbreviation')
                    or get_node_vals(journal_info, 'Title'))
    d['year'] = (get_node_vals(journal_info, 'Year')
                 or get_node_vals(journal_info, 'MedlineDate'))

    # Negated year, so an ascending sort on 'revyear' lists the newest
    # reference first. repr() copes with year being an int, free text
    # or None; 0 is used when no four-digit year can be found.
    year_digits = re.findall(r'\d{4}', repr(d['year']))
    d['revyear'] = -int(year_digits[0]) if year_digits else 0

    d['volume'] = get_node_vals(journal_info, 'Volume')

    # Several PMID elements may be present; the first one found is used.
    pmid = get_node_vals(doc, 'PMID')
    if isinstance(pmid, list):
        pmid = pmid[0]
    d['pmid'] = str(pmid)

    # Several AbstractText elements come back as a list; join them so
    # the templates do not print a list repr.
    abstract = get_node_vals(doc, 'AbstractText')
    if isinstance(abstract, list):
        abstract = '\n'.join('%s' % part for part in abstract)
    d['abstract'] = abstract

    # handle unicode: on Python 2 encode text values to UTF-8 byte
    # strings. Python 3 has no ``unicode`` type and its str values are
    # left untouched.
    try:
        text_type = unicode
    except NameError:
        text_type = None
    if text_type is not None:
        for key, value in list(d.items()):
            if isinstance(value, text_type):
                d[key] = enc(value)[0]
    return d
def check_format(format_value):
    """Exit with an error unless ``format_value`` names a known format.

    A format is known when this module defines a global called
    ``format_<format_value>``. On failure the list of valid values is
    written to stderr and the process exits with a non-zero status
    (``SystemExit``); on success the function returns ``None``.
    """
    prefix = 'format_'
    # Slice the prefix off instead of str.replace(), which would also
    # remove any later occurrence of 'format_' inside the name.
    ok_formats = [
        name[len(prefix):]
        for name in list(globals())
        if name.startswith(prefix)
    ]
    if format_value in ok_formats:
        return

    def sort_key(name):
        # Numeric names first, in numeric order, so '10' follows '9'.
        if name.isdigit():
            return (0, int(name), name)
        return (1, 0, name)

    ok_formats.sort(key=sort_key)
    # sys.exit() with a string prints it to stderr and exits with status 1.
    sys.exit('format must have one of the following values: %s'
             % ' '.join(ok_formats))
def show_all_formats(xmlstr, showkeys=True):
    """Print one example of every ``format_<n>`` function.

    ``xmlstr`` is parsed with ``get_document_object`` and the first
    ``PubmedArticle`` in it is used as the sample. Formats are shown in
    numeric order starting at 1, stopping at the first number that has
    no ``format_<n>`` function. When ``showkeys`` is true the raw
    citation data is printed afterwards, sorted by key.
    """
    doc = get_document_object(xmlstr)
    articles = doc.getElementsByTagName('PubmedArticle')
    d = get_citation_data(articles[0])
    # main() adds these two keys after get_citation_data(); provide
    # neutral values here so templates that use them can be shown too.
    d.setdefault('pmcid', None)
    d.setdefault('pmcid+', '')

    for number in range(1, 1000):
        # Only a missing format function ends the listing; an error
        # raised inside a format function is no longer swallowed.
        fun = globals().get('format_%s' % number)
        if fun is None:
            break
        # Each format gets its own copy so one that modifies its
        # argument cannot affect later examples or the raw listing.
        print('%s) %s\n%s\n' % (number,
                                fun.__doc__,
                                formatter(fun, dict(d))))

    if showkeys:
        print('Raw citation data')
        for k in sorted(d):
            print('( %s ) %s' % (k, d[k]))
def fetch_refs(pmid_list):
    """Download the PubMed records for ``pmid_list`` as an XML string.

    ``pmid_list`` is an iterable of PubMed ids (strings or ints). The
    request goes to the NCBI E-utilities ``efetch`` endpoint and the
    raw response body is returned.
    """
    # Function-local import: urlencode lives in urllib.parse on
    # Python 3 and in urllib on Python 2.
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode

    # NOTE(review): NCBI documents https:// as the required scheme for
    # E-utilities requests -- confirm against the current E-utilities help.
    baseurl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
    searchstr = urlencode(params(
        db='pubmed',
        # str() so that integer ids are accepted as well.
        id=','.join(str(pmid) for pmid in pmid_list),
        retmode='xml',
        rettype='medline'))
    return send_web_query(baseurl + searchstr)
def format_1(citation_data):
    """Long format"""
    # Full author list, title, journal, year, volume:pages and PMID.
    # ``citation_data`` is a mapping with the keys named in the template.
    # The one-line docstring is printed by show_all_formats(), so it is
    # kept short and unchanged.
    template = (
        '%(authors_long)s. %(title)s %(journal)s. '
        '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
    return template % citation_data
def format_2(citation_data):
    """Abbreviated authors"""
    # Same layout as the long format but with the shortened author
    # string. ``citation_data`` is a mapping with the keys named in the
    # template. The one-line docstring is printed by show_all_formats()
    # as the format's description, which is why its spelling
    # ("Abbreviared") was corrected.
    template = (
        '%(authors_short)s. %(title)s %(journal)s. '
        '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
    return template % citation_data
def format_3(citation_data):
    """Brief"""
    # Journal, year, volume and first page only, followed by the PMID.
    # The one-line docstring is printed by show_all_formats().
    template = (
        '%(journal)s %(year)s, '
        '%(volume)s:%(firstpage)s. PMID %(pmid)s')
    return template % citation_data
def format_4(citation_data):
    """Format tag"""
    # Builds '<first author surname>_<year>_<pmid>'. Splitting on
    # whitespace and re-joining with '_' turns any run of whitespace
    # inside the fields into a single underscore.
    # The one-line docstring is printed by show_all_formats().
    words = ('%(first_author_name)s_%(year)s_%(pmid)s' % citation_data).split()
    return '_'.join(words)
def format_5(citation_data):
    """Brief with first author"""
    # The brief layout, prefixed with the first author's surname and
    # 'et al.'. The one-line docstring is printed by show_all_formats().
    template = (
        '%(first_author_name)s, et al. '
        '%(journal)s %(year)s, '
        '%(volume)s:%(firstpage)s. PMID %(pmid)s')
    return template % citation_data
def format_6(citation_data):
    """thebibliography entry for latex (for single reference)"""
    # A \cite command followed by a complete thebibliography
    # environment holding the one \bibitem, keyed by PMID. Raw strings
    # keep the LaTeX backslashes literal.
    # The one-line docstring is printed by show_all_formats().
    template = '\n'.join([
        r'\cite{%(pmid)s}',
        r'\begin{thebibliography}{99}',
        r'\bibitem{%(pmid)s} %(authors_short)s. %(title)s %(journal)s. %(year)s, %(volume)s:%(pages)s. PMID %(pmid)s',
        r'\end{thebibliography}',
    ])
    return template % citation_data
def format_7(citation_data):
    """thebibliography entry for latex (for multiple references)"""
    # A bare \bibitem line keyed by PMID, without the surrounding
    # thebibliography environment. Raw strings keep the backslash literal.
    # The one-line docstring is printed by show_all_formats().
    template = (
        r'\bibitem{%(pmid)s} '
        r'%(authors_short)s. %(title)s %(journal)s. '
        r'%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
    return template % citation_data
def format_8(citation_data):
    """Format for refitem custom latex environment"""
    # Emits \refitem followed by five brace-delimited arguments:
    # authors, title, journal, year/volume/pages and PMID.
    # The one-line docstring is printed by show_all_formats().
    arguments = (
        '%(authors_long)s.',
        '%(title)s',
        '%(journal)s.',
        '%(year)s, %(volume)s:%(pages)s.',
        'PMID %(pmid)s',
    )
    template = r'\refitem' + ''.join('{%s}' % arg for arg in arguments)
    return template % citation_data
def format_9(citation_data):
    """Abbreviated authors with abstract"""
    # The abbreviated-authors citation, then a blank line and the
    # abstract. ``citation_data`` is a mapping with the keys named in
    # the template. The one-line docstring is printed by
    # show_all_formats() as the format's description, which is why its
    # spelling ("Abbreviared") was corrected.
    template = (
        '%(authors_short)s. %(title)s %(journal)s. '
        '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s '
        '\n\n%(abstract)s')
    return template % citation_data
def format_10(citation_data):
    """MoinMoin numbered list"""
    # One numbered-list item (' 1. ...') whose PMID is a [[url|label]]
    # link to the article's PubMed page. The rendered line is passed
    # through escape_camelcase() before it is returned.
    # The one-line docstring is printed by show_all_formats().
    template = (
        r' 1. %(authors_long)s. %(title)s %(journal)s. '
        r'%(year)s, %(volume)s:%(pages)s. '
        r'[[http://www.ncbi.nlm.nih.gov/pubmed/%(pmid)s|%(pmid)s]]')
    rendered = template % citation_data
    return escape_camelcase(rendered)
def format_11(citation_data):
    """Long format with me in bold html tags"""
    # Renders an HTML <li> item in which every 'Hoffman N' / 'Hoffman NG'
    # in the author list is wrapped in <b>...</b>.
    #
    # The bold markup is applied to a copy: ``citation_data`` itself is
    # left untouched, so the tags neither leak into other formats
    # rendered from the same dict nor nest when this format is applied
    # to the same dict twice.
    # The one-line docstring is printed by show_all_formats().
    fields = dict(citation_data)
    fields['authors_long'] = re.sub(
        r'(Hoffman NG?)',
        lambda mo: r'<b>%s</b>' % mo.group(0),
        fields['authors_long'])
    return (r'<li>%(authors_long)s. %(title)s %(journal)s. '
            r'%(year)s, %(volume)s:%(pages)s. %(pmcid+)s</li>') % fields
def escape_camelcase(instr):
    """Return ``instr`` with a '!' placed before every CamelCase word.

    A word qualifies when it consists of ASCII letters only, starts
    with a capital and, after that first letter, contains either a
    lowercase letter followed later by a capital, or a capital followed
    later by a lowercase letter ('PubMed', 'DNAse'). Plain capitalised
    words and all-caps words are left alone.
    """
    # Presumably this stops a MoinMoin wiki from turning such words into
    # page links -- the only caller is the MoinMoin list format.
    camelcase_word = (
        r'\b[A-Z][a-zA-Z]*(?:[a-z][a-zA-Z]*[A-Z]|[A-Z][a-zA-Z]*[a-z])[a-zA-Z]*\b')
    # \g<0> stands for the whole matched word.
    return re.sub(camelcase_word, r'!\g<0>', instr)
def formatter(format_fun, d):
    """Render ``d`` with ``format_fun`` and tidy up repeated periods.

    Each run of 2 to 10 consecutive '.' characters in the rendered text
    is collapsed to a single '.', e.g. where a title that already ends
    in a period is followed by the template's own period.
    """
    rendered = format_fun(d)
    return re.sub(r'\.{2,10}', '.', rendered)
def get_stdin(outval='file'):
    """Return piped or redirected standard input, if there is any.

    Returns the open stdin file object (``outval='file'``) or its whole
    content with surrounding whitespace stripped (``outval='string'``).
    Returns ``None`` when stdin is an interactive terminal, is closed,
    is unavailable, or when ``outval`` is neither of the two values.

    Whether input was supplied is decided with ``isatty()``, so input
    redirected from a regular file is read like piped input and an
    interactive terminal is never read from.
    """
    stdin = sys.__stdin__
    if stdin is None:
        return None

    try:
        interactive = stdin.isatty()
    except ValueError as msg:
        # isatty() raises ValueError on a closed file.
        print('Stdin is closed: %s' % msg)
        return None

    if interactive:
        return None
    if outval == 'file':
        return stdin
    if outval == 'string':
        return stdin.read().strip()
    return None
def _fetch_pmcids(pmids):
    """Return a dict mapping each PMID in the reply to its PMCID or None.

    The lookup is best effort: if the service cannot be reached a
    warning is written to stderr and an empty dict is returned, so the
    citations can still be formatted without PMCIDs.
    """
    pmcids = {}
    if not pmids:
        return pmcids

    # Function-local import: urlopen lives in urllib2 on Python 2 and
    # in urllib.request on Python 3.
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen

    # NOTE(review): confirm this endpoint is still in service.
    url = ('http://www.pubmedcentral.nih.gov/utils/entrezpmc.cgi?'
           + ','.join(pmids))
    try:
        response = urlopen(url)
    except IOError as err:
        sys.stderr.write('warning: could not retrieve PMCIDs: %s\n' % err)
        return pmcids

    try:
        for line in response:
            if not isinstance(line, str):
                # Python 3 yields bytes.
                line = line.decode('utf-8', 'replace')
            fields = line.split()
            if not fields:
                continue
            # A three-field line carries the PMCID in its last field;
            # any other shape means no PMCID for that PMID.
            pmcids[fields[0]] = fields[2] if len(fields) == 3 else None
    finally:
        response.close()
    return pmcids


def main():
    """Command-line entry point: fetch PubMed records and print citations.

    PMIDs come from the command line and, when provided, from stdin.
    Each record is rendered with every format named by ``-f`` and
    written to stdout or to the ``-o`` file. ``-e`` prints an example
    of every format and exits; ``-x`` additionally saves the downloaded
    XML.
    """
    usage = """%prog [options] PMID's
Format a set of references retrieved from pubmed by pubmed id (PMID)
Additional PMIDs (whitespace delimited) will be read from stdin if provided.""".strip()
    parser = OptionParser(
        usage=usage, version="$Id: gref.py,v 1.1 2005/12/25 23:48:10 nghoffma Exp $")
    parser.add_option("-o", "--output", dest="filename",
                      help="write output to FILE", metavar="FILE", default=None)
    parser.add_option("-f", "--format", dest="format",
                      help="choose reference format number", default='1')
    parser.add_option("-e", "--show-examples",
                      action="store_true", dest="show_examples", default=False,
                      help="show example of each reference format")
    parser.add_option("-k", "--show-keys",
                      action="store_true", dest="show_keys", default=False,
                      help="show keys into raw citation data (must be accompanied by -e)")
    parser.add_option("-x", "--xml-output", dest="xmlfile",
                      help="write downloaded xml output to FILE", metavar="FILE", default=False)
    parser.add_option("-t", "--sort-terms", dest="sort_terms",
                      help="sort multiple references according to SORT_TERMS (comma delimited list, eg 'first_author_name,year' [default]). Use -ek for list of terms.", default=None)
    (options, args) = parser.parse_args()

    html_formats = {'11'}

    if options.show_examples:
        # Prefer a sample record stored next to this script; download
        # one when that file cannot be read.
        sample_path = os.path.join(os.path.split(__file__)[0], 'gref.xml')
        try:
            with open(sample_path) as sample:
                xmlstr = sample.read()
        except IOError:
            xmlstr = fetch_refs(['16284180'])
        show_all_formats(xmlstr, showkeys=options.show_keys)
        sys.exit()

    stdin = get_stdin('string')
    if stdin:
        args += stdin.split()

    if not args:
        parser.print_help()
        sys.exit()

    # Validate before touching the output file, so that a bad -f value
    # does not truncate an existing file.
    formats = options.format.split(',')
    for f in formats:
        check_format(f)

    xmlstr = fetch_refs(args)
    doc = get_document_object(xmlstr)
    articles = doc.getElementsByTagName('PubmedArticle')
    datalist = [get_citation_data(a) for a in articles]

    # add pmcids; a PMID missing from the reply simply gets no PMCID
    pmcids = _fetch_pmcids([d['pmid'] for d in datalist])
    for d in datalist:
        pmcid = pmcids.get(d['pmid'])
        d['pmcid'] = pmcid
        d['pmcid+'] = 'PMCID: PMC{}'.format(pmcid) if pmcid else ''

    if options.sort_terms:
        datalist = sorted(
            datalist, key=itemgetter(*options.sort_terms.split(',')))

    # The HTML wrapper is written only when the -f value as a whole is
    # an HTML format.
    use_html = options.format in html_formats

    outfile = open(options.filename, 'w') if options.filename else sys.stdout
    try:
        if use_html:
            outfile.write(r'<html><!--###--><body><ol>')
            outfile.write('\n')

        for d in datalist:
            for fmt in formats:
                output = formatter(globals()['format_%s' % fmt], d)
                outfile.write(output + '\n')

        if use_html:
            outfile.write(r'</ol></body><!--###--></html>')
            outfile.write('\n')
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    if options.xmlfile:
        # The download is a byte string on Python 3; pick the matching mode.
        mode = 'w' if isinstance(xmlstr, str) else 'wb'
        with open(options.xmlfile, mode) as xmlout:
            xmlout.write(xmlstr)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment