nhoffman · October 31, 2013 16:46
diff --git a/gref.py b/gref.py
 #!/usr/bin/env python

 # see http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html

 import sys
 import urllib
 import os
 import xml.dom.minidom
 import re
 import urllib2
 from operator import itemgetter
 from optparse import OptionParser

 # see http://www.xml.com/pub/a/2005/05/18/unicode.html
 import codecs
 # codecs.lookup() returns a CodecInfo tuple whose first two items are the
 # codec's encode and decode functions; each returns (output, length).
 enc, dec = codecs.lookup('UTF-8')[:2]


 def params(**args):
    """Return the supplied keyword arguments as a plain dictionary."""
    return dict(args)


 def send_web_query(url):
    """Fetch ``url`` and return the complete response body.

    The response object is closed as soon as the body has been read, so
    the connection is released even when ``read()`` raises.
    """
    from contextlib import closing

    with closing(urllib.urlopen(url)) as response:
        return response.read()


 def to_text(node):
    """Concatenate the values of the text nodes directly under ``node``.

    Only immediate children are inspected; text nested inside child
    elements is not included.
    """
    # nodeType 3 is the DOM constant for a text node (Node.TEXT_NODE).
    return ''.join(
        child.nodeValue for child in node.childNodes if child.nodeType == 3)


 def get_node_vals(doc, nodename):
    """Return the converted text of every ``nodename`` element under ``doc``.

    Each element's text goes through cast().  The result is None when no
    element matches, the bare value when exactly one matches, and a list
    of values otherwise.
    """
    elements = doc.getElementsByTagName(nodename)
    vals = [cast(to_text(element)) for element in elements]

    if not vals:
        return None
    if len(vals) == 1:
        return vals[0]
    return vals


 def get_document_object(target):
    """Parse ``target`` into an xml.dom.minidom document.

    ``target`` may be the name of an existing file, an open file-like
    object, or a string holding an XML document.
    """
    try:
        use_stream_parser = os.access(target, os.F_OK)
    except TypeError:
        # os.access() rejects non-path arguments; assume an open file object.
        use_stream_parser = True

    if use_stream_parser:
        return xml.dom.minidom.parse(target)
    # A string that does not name an existing file is treated as XML text.
    return xml.dom.minidom.parseString(target)


 def cast(instr):
    """Convert ``instr`` to an int, else to a float, else return it stripped.

    The conversions are tried in that order; text that is not numeric is
    returned with surrounding whitespace removed.
    """
    for convert in (int, float):
        try:
            return convert(instr)
        except ValueError:
            continue
    return instr.strip()


 def get_author(authornode):
    """Format one author element as "LastName Initials".

    A missing LastName or Initials element shows up as the text "None",
    because get_node_vals() returns None for it.
    """
    name_parts = tuple(
        get_node_vals(authornode, tag) for tag in ('LastName', 'Initials'))
    return '%s %s' % name_parts


 def get_all_authors(authors):
    """Return every author as "A, B, and C" (a lone author is returned as is).

    Raises IndexError when ``authors`` is empty.
    """
    names = [get_author(node) for node in authors]
    if len(names) == 1:
        return names[0]
    leading = ', '.join(names[:-1])
    return leading + ', and ' + names[-1]


 def get_short_authors(authors):
    """Return an abbreviated author list.

    One author is returned alone, two are joined with " and ", and three
    or more become "First, et al".  An empty list yields None.
    """
    count = len(authors)
    if count > 2:
        return '%s, et al' % get_author(authors[0])
    if count == 2:
        return ' and '.join(get_author(node) for node in authors)
    if count == 1:
        return get_author(authors[0])
    return None


 def get_first_author(authors):
    """Return the formatted name of the first entry in ``authors``."""
    first = authors[0]
    return get_author(first)


 def get_citation_data(doc):
    """Extract the citation fields of one PubmedArticle element.

    Returns a dict with the keys ``authors_long``, ``first_author``,
    ``first_author_name``, ``authors_short``, ``title``, ``pages``,
    ``firstpage``, ``journal``, ``year``, ``revyear``, ``volume``,
    ``pmid`` and ``abstract``.  Unicode values are encoded to UTF-8.

    Raises IndexError when ``doc`` has no Author or Journal element, or
    when no four-digit year can be found.
    """
    d = {}

    # authors
    authors = doc.getElementsByTagName('Author')
    d['authors_long'] = get_all_authors(authors)
    d['first_author'] = get_first_author(authors)
    d['first_author_name'] = get_node_vals(authors[0], 'LastName')
    d['authors_short'] = get_short_authors(authors)

    d['title'] = get_node_vals(doc, 'ArticleTitle')
    pages = get_node_vals(doc, 'MedlinePgn')
    d['pages'] = pages

    # get_node_vals() may return a number, a list or None here, none of
    # which has split(); in that case the value is used unchanged.
    try:
        d['firstpage'] = pages.split('-')[0]
    except AttributeError:
        d['firstpage'] = pages

    journal_info = doc.getElementsByTagName('Journal')[0]

    d['journal'] = (get_node_vals(journal_info, 'ISOAbbreviation') or
                    get_node_vals(journal_info, 'Title'))
    d['year'] = (get_node_vals(journal_info, 'Year') or
                 get_node_vals(journal_info, 'MedlineDate'))
    # repr() replaces the deprecated backtick syntax.  It gives a string
    # for ints, unicode text and lists alike, so the first run of four
    # digits can be pulled out of any of them.  The year is negated.
    d['revyear'] = -int(re.findall(r'\d{4}', repr(d['year']))[0])
    d['volume'] = get_node_vals(journal_info, 'Volume')

    # When several PMID elements are present the first one is used.
    pmid = get_node_vals(doc, 'PMID')
    d['pmid'] = str(pmid if isinstance(pmid, int) else pmid[0])
    d['abstract'] = get_node_vals(doc, 'AbstractText')

    # handle unicode: iterate over a snapshot so that assigning into the
    # dict inside the loop is always safe.
    for key, value in list(d.items()):
        if isinstance(value, unicode):
            d[key] = enc(value)[0]

    return d


 def check_format(format_value):
    """Exit with an error unless a ``format_<format_value>`` function exists.

    The valid values are discovered from the module globals whose names
    start with ``format_``.  On an unknown value the list of valid values
    is written to stderr and the process exits with a non-zero status
    (previously the message went to stdout and the status was 0, so a
    calling script could not detect the failure).
    """
    prefix = 'format_'
    # Slice the prefix off rather than replace(), which would also remove
    # any later occurrence of the prefix inside the name.
    ok_formats = [name[len(prefix):] for name in list(globals())
                  if name.startswith(prefix)]

    if format_value in ok_formats:
        return

    # Numeric names first, in numeric order, so the message reads 1 2 3 ...
    ok_formats.sort(
        key=lambda s: (not s.isdigit(), int(s) if s.isdigit() else 0, s))
    # sys.exit() with a string prints it to stderr and exits with status 1.
    sys.exit('format must have one of the following values: %s' %
             ' '.join(ok_formats))


 def show_all_formats(xmlstr, showkeys=True):
    """Print an example of every ``format_N`` style for the first article.

    ``xmlstr`` is parsed with get_document_object() and the first
    PubmedArticle element supplies the citation data.  Formats are tried
    as format_1, format_2, ... until a number has no matching function.
    When ``showkeys`` is true the raw citation dictionary is printed too.
    """
    doc = get_document_object(xmlstr)
    articles = doc.getElementsByTagName('PubmedArticle')
    d = get_citation_data(articles[0])

    # get_citation_data() does not provide the PMC keys that main() adds;
    # supply empty values so formats that reference them still render.
    d.setdefault('pmcid', None)
    d.setdefault('pmcid+', '')

    number = 1
    while True:
        # Only a failed lookup ends the loop.  A KeyError raised inside a
        # format function must not be mistaken for "no more formats".
        fun = globals().get('format_%s' % number)
        if fun is None:
            break
        print('%s) %s\n%s\n' % (number, fun.__doc__, formatter(fun, d)))
        number += 1

    if showkeys:
        print('Raw citation data')
        for k in sorted(d):
            print('( %s ) %s' % (k, d[k]))


 def fetch_refs(pmid_list):
    """Download the PubMed XML records for the PMIDs in ``pmid_list``.

    ``pmid_list`` is a sequence of PMID strings; they are sent as one
    comma-separated ``id`` parameter.  Returns the raw response body.
    """
    query = params(
        db='pubmed',
        id=','.join(pmid_list),
        retmode='xml',
        rettype='medline')
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
    return send_web_query(endpoint + urllib.urlencode(query))


 def format_1(citation_data):
    """Long format"""
    # The docstring is printed by show_all_formats(), so it stays short.
    # Renders: all authors, title, journal, year, volume:pages, PMID.
    template = ('%(authors_long)s. %(title)s %(journal)s. '
                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
    return template % citation_data


 def format_2(citation_data):
    """Abbreviated authors"""
    # The docstring is printed by show_all_formats() as the format's
    # label, which is why its spelling matters.
    # Renders: short author list, title, journal, year, volume:pages, PMID.
    template = ('%(authors_short)s. %(title)s %(journal)s. '
                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
    return template % citation_data


 def format_3(citation_data):
    """Brief"""
    # Renders: journal, year, volume:first page, PMID (no authors or title).
    template = ('%(journal)s %(year)s, '
                '%(volume)s:%(firstpage)s. PMID %(pmid)s')
    return template % citation_data


 def format_4(citation_data):
    """Format tag"""
    # Builds "<first author>_<year>_<pmid>" and turns every run of
    # whitespace inside it into a single underscore.
    tag = '%(first_author_name)s_%(year)s_%(pmid)s' % citation_data
    words = tag.split()
    return '_'.join(words)


 def format_5(citation_data):
    """Brief with first author"""
    # Renders: first author's last name with "et al", then the brief form.
    template = ('%(first_author_name)s, et al. %(journal)s %(year)s, '
                '%(volume)s:%(firstpage)s. PMID %(pmid)s')
    return template % citation_data


 def format_6(citation_data):
    """thebibliography entry for latex (for single reference)"""
    # A \cite command followed by a complete one-item thebibliography
    # environment, both keyed by the PMID.
    template = r"""\cite{%(pmid)s}
 \begin{thebibliography}{99}
 \bibitem{%(pmid)s} %(authors_short)s. %(title)s %(journal)s. %(year)s, %(volume)s:%(pages)s. PMID %(pmid)s
 \end{thebibliography}"""
    return template % citation_data


 def format_7(citation_data):
    """thebibliography entry for latex (for multiple references)"""
    # A bare \bibitem line keyed by the PMID, without the surrounding
    # environment.
    template = (r'\bibitem{%(pmid)s} %(authors_short)s. %(title)s '
                r'%(journal)s. %(year)s, %(volume)s:%(pages)s. '
                r'PMID %(pmid)s')
    return template % citation_data


 def format_8(citation_data):
    """Format for refitem custom latex environment"""
    # Five brace-delimited arguments: authors, title, journal,
    # year/volume/pages, and PMID.
    template = (r'\refitem{%(authors_long)s.}{%(title)s}{%(journal)s.}'
                r'{%(year)s, %(volume)s:%(pages)s.}{PMID %(pmid)s}')
    return template % citation_data


 def format_9(citation_data):
    """Abbreviated authors with abstract"""
    # The docstring is printed by show_all_formats() as the format's
    # label, which is why its spelling matters.
    # Renders the short-author citation, a blank line, then the abstract.
    template = ('%(authors_short)s. %(title)s %(journal)s. '
                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s '
                '\n\n%(abstract)s')
    return template % citation_data


 def format_10(citation_data):
    """MoinMoin numbered list"""
    # A numbered-list item whose PMID is a link to the PubMed record; the
    # finished text is passed through escape_camelcase().
    template = (r' 1. %(authors_long)s. %(title)s %(journal)s. '
                r'%(year)s, %(volume)s:%(pages)s. '
                r'[[http://www.ncbi.nlm.nih.gov/pubmed/%(pmid)s|%(pmid)s]]')
    return escape_camelcase(template % citation_data)


 def format_11(citation_data):
    """Long format with me in bold html tags"""
    # Renders an <li> item in which "Hoffman N" / "Hoffman NG" is wrapped
    # in <b> tags.  Requires the 'pmcid+' key in addition to the usual
    # citation keys.
    #
    # Work on a copy: writing the bold markup back into the caller's dict
    # would nest the tags on a second call and leak them into any other
    # format rendered from the same data.
    data = dict(citation_data)
    data['authors_long'] = re.sub(
        r'Hoffman NG?', r'<b>\g<0></b>', data['authors_long'])

    template = (r'<li>%(authors_long)s. %(title)s %(journal)s. '
                r'%(year)s, %(volume)s:%(pages)s. %(pmcid+)s</li>')
    return template % data


 def escape_camelcase(instr):
    """Return ``instr`` with '!' inserted before every CamelCase word.

    A word qualifies when it starts with a capital letter and, after that
    first letter, contains both another capital and a lowercase letter.
    """
    camelcase = r'\b[A-Z][a-zA-Z]*(?:[a-z][a-zA-Z]*[A-Z]|[A-Z][a-zA-Z]*[a-z])[a-zA-Z]*\b'
    # \g<0> stands for the whole matched word.
    return re.sub(camelcase, r'!\g<0>', instr)


 def formatter(format_fun, d):
    """Apply ``format_fun`` to ``d`` and tidy up repeated periods.

    Every run of two to ten consecutive periods in the formatted text is
    replaced by a single period.
    """
    rendered = format_fun(d)
    return re.compile(r'\.{2,10}').sub('.', rendered)


 def get_stdin(outval='file'):
    """Return piped or redirected standard input, or None if there is none.

    With ``outval='file'`` the stdin file object itself is returned; with
    ``outval='string'`` its whole content is read and returned stripped.
    None is returned when stdin is an interactive terminal, is closed or
    missing, or when ``outval`` is any other value.

    Input is detected with isatty() instead of probing tell().  The
    seekability probe ignored input redirected from a regular file and,
    on platforms where a terminal is not seekable, blocked waiting for
    the user to type.
    """
    stdin = sys.__stdin__
    try:
        # isatty() raises ValueError on a closed file.
        interactive = stdin is None or stdin.isatty()
    except ValueError as msg:
        print('Stdin is closed: %s' % msg)
        return None

    if interactive:
        return None
    if outval == 'file':
        return stdin
    if outval == 'string':
        return stdin.read().strip()
    return None


 def _fetch_pmcids(pmids):
    """Return a dict mapping each PMID reported by the PMC service to its
    PMC id, or to None when the service lists no PMC id for it."""
    url = ('http://www.pubmedcentral.nih.gov/utils/entrezpmc.cgi?' +
           ','.join(pmids))
    pmcids = {}
    response = urllib2.urlopen(url)
    try:
        for line in response:
            fields = line.split()
            if not fields:
                # Blank lines carry no PMID.
                continue
            # A line with exactly three whitespace-separated fields holds
            # the PMC id in the third; any other shape means "no PMC id".
            pmcids[fields[0]] = fields[2] if len(fields) == 3 else None
    finally:
        response.close()
    return pmcids


 def main():
    """Command-line entry point: fetch PubMed records and print citations.

    PMIDs come from the command line and, when present, from standard
    input.  Each record is rendered with every requested format and
    written to stdout or to the file given with -o.  With -e an example
    of every format is printed instead, and -x saves the downloaded XML.
    """
    usage = """%prog [options] PMID's
 Format a set of references retrieved from pubmed by pubmed id (PMID)
 Additional PMIDs (whitespace delimited) will be read from stdin if provided.""".strip()

    parser = OptionParser(
        usage=usage, version="$Id: gref.py,v 1.1 2005/12/25 23:48:10 nghoffma Exp $")

    parser.add_option("-o", "--output", dest="filename",
                      help="write output to FILE", metavar="FILE", default=None)
    parser.add_option("-f", "--format", dest="format",
                      help="choose reference format number", default='1')
    parser.add_option("-e", "--show-examples",
                      action="store_true", dest="show_examples", default=False,
                      help="show example of each reference format")
    parser.add_option("-k", "--show-keys",
                      action="store_true", dest="show_keys", default=False,
                      help="show keys into raw citation data (must be accompanied by -e)")
    parser.add_option("-x", "--xml-output", dest="xmlfile",
                      help="write downloaded xml output to FILE", metavar="FILE", default=False)
    parser.add_option("-t", "--sort-terms", dest="sort_terms",
                      help="sort multiple references according to SORT_TERMS (comma delimited list, eg 'first_author_name,year' [default]). Use -ek for list of terms.", default=None)

    (options, args) = parser.parse_args()
    html_formats = {'11'}

    if options.show_examples:
        # Prefer a gref.xml next to this script; fall back to downloading
        # a fixed example record when it cannot be read.
        example_path = os.path.join(os.path.split(__file__)[0], 'gref.xml')
        try:
            with open(example_path) as example:
                xmlstr = example.read()
        except IOError:
            xmlstr = fetch_refs(['16284180'])
        show_all_formats(xmlstr, showkeys=options.show_keys)
        sys.exit()

    stdin = get_stdin('string')
    if stdin:
        args += stdin.split()

    if not args:
        parser.print_help()
        sys.exit()

    # Validate the formats before opening the output file, so a bad -f
    # value does not create or truncate it.
    formats = options.format.split(',')
    for name in formats:
        check_format(name)

    xmlstr = fetch_refs(args)
    doc = get_document_object(xmlstr)
    articles = doc.getElementsByTagName('PubmedArticle')

    datalist = [get_citation_data(a) for a in articles]

    # add pmcids; a PMID missing from the service's answer is treated as
    # having no PMC id instead of raising KeyError.
    pmcids = _fetch_pmcids([d['pmid'] for d in datalist]) if datalist else {}
    for d in datalist:
        pmcid = pmcids.get(d['pmid'])
        d['pmcid'] = pmcid
        d['pmcid+'] = 'PMCID: PMC{}'.format(pmcid) if pmcid else ''

    if options.sort_terms:
        datalist = sorted(
            datalist, key=itemgetter(*options.sort_terms.split(',')))

    wrap_html = options.format in html_formats
    outfile = open(options.filename, 'w') if options.filename else sys.stdout
    try:
        if wrap_html:
            outfile.write(r'<html><!--###--><body><ol>')
            outfile.write('\n')

        for d in datalist:
            for name in formats:
                output = formatter(globals()['format_%s' % name], d)
                outfile.write(output + '\n')

        if wrap_html:
            outfile.write(r'</ol></body><!--###--></html>')
            outfile.write('\n')
    finally:
        # Close only a file opened here; stdout is left open.
        if outfile is not sys.stdout:
            outfile.close()

    if options.xmlfile:
        with open(options.xmlfile, 'w') as xmlout:
            xmlout.write(xmlstr)

 # Run the command-line interface only when executed as a script.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	# see http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html

	import sys
	import urllib
	import os
	import xml.dom.minidom
	import re
	import urllib2
	from operator import itemgetter
	from optparse import OptionParser

	# see http://www.xml.com/pub/a/2005/05/18/unicode.html
	import codecs
	# codecs.lookup() returns a CodecInfo tuple whose first two items are the
	# codec's encode and decode functions; each returns (output, length).
	enc, dec = codecs.lookup('UTF-8')[:2]


	def params(**args):
	    """Return the supplied keyword arguments as a plain dictionary."""
	    return dict(args)


	def send_web_query(url):
	    """Fetch ``url`` and return the complete response body.

	    The response object is closed as soon as the body has been read, so
	    the connection is released even when ``read()`` raises.
	    """
	    from contextlib import closing

	    with closing(urllib.urlopen(url)) as response:
	        return response.read()


	def to_text(node):
	    """Concatenate the values of the text nodes directly under ``node``.

	    Only immediate children are inspected; text nested inside child
	    elements is not included.
	    """
	    # nodeType 3 is the DOM constant for a text node (Node.TEXT_NODE).
	    return ''.join(
	        child.nodeValue for child in node.childNodes if child.nodeType == 3)


	def get_node_vals(doc, nodename):
	    """Return the converted text of every ``nodename`` element under ``doc``.

	    Each element's text goes through cast().  The result is None when no
	    element matches, the bare value when exactly one matches, and a list
	    of values otherwise.
	    """
	    elements = doc.getElementsByTagName(nodename)
	    vals = [cast(to_text(element)) for element in elements]

	    if not vals:
	        return None
	    if len(vals) == 1:
	        return vals[0]
	    return vals


	def get_document_object(target):
	    """Parse ``target`` into an xml.dom.minidom document.

	    ``target`` may be the name of an existing file, an open file-like
	    object, or a string holding an XML document.
	    """
	    try:
	        use_stream_parser = os.access(target, os.F_OK)
	    except TypeError:
	        # os.access() rejects non-path arguments; assume an open file object.
	        use_stream_parser = True

	    if use_stream_parser:
	        return xml.dom.minidom.parse(target)
	    # A string that does not name an existing file is treated as XML text.
	    return xml.dom.minidom.parseString(target)


	def cast(instr):
	    """Convert ``instr`` to an int, else to a float, else return it stripped.

	    The conversions are tried in that order; text that is not numeric is
	    returned with surrounding whitespace removed.
	    """
	    for convert in (int, float):
	        try:
	            return convert(instr)
	        except ValueError:
	            continue
	    return instr.strip()


	def get_author(authornode):
	    """Format one author element as "LastName Initials".

	    A missing LastName or Initials element shows up as the text "None",
	    because get_node_vals() returns None for it.
	    """
	    name_parts = tuple(
	        get_node_vals(authornode, tag) for tag in ('LastName', 'Initials'))
	    return '%s %s' % name_parts


	def get_all_authors(authors):
	    """Return every author as "A, B, and C" (a lone author is returned as is).

	    Raises IndexError when ``authors`` is empty.
	    """
	    names = [get_author(node) for node in authors]
	    if len(names) == 1:
	        return names[0]
	    leading = ', '.join(names[:-1])
	    return leading + ', and ' + names[-1]


	def get_short_authors(authors):
	    """Return an abbreviated author list.

	    One author is returned alone, two are joined with " and ", and three
	    or more become "First, et al".  An empty list yields None.
	    """
	    count = len(authors)
	    if count > 2:
	        return '%s, et al' % get_author(authors[0])
	    if count == 2:
	        return ' and '.join(get_author(node) for node in authors)
	    if count == 1:
	        return get_author(authors[0])
	    return None


	def get_first_author(authors):
	    """Return the formatted name of the first entry in ``authors``."""
	    first = authors[0]
	    return get_author(first)


	def get_citation_data(doc):
	    """Extract the citation fields of one PubmedArticle element.

	    Returns a dict with the keys ``authors_long``, ``first_author``,
	    ``first_author_name``, ``authors_short``, ``title``, ``pages``,
	    ``firstpage``, ``journal``, ``year``, ``revyear``, ``volume``,
	    ``pmid`` and ``abstract``.  Unicode values are encoded to UTF-8.

	    Raises IndexError when ``doc`` has no Author or Journal element, or
	    when no four-digit year can be found.
	    """
	    d = {}

	    # authors
	    authors = doc.getElementsByTagName('Author')
	    d['authors_long'] = get_all_authors(authors)
	    d['first_author'] = get_first_author(authors)
	    d['first_author_name'] = get_node_vals(authors[0], 'LastName')
	    d['authors_short'] = get_short_authors(authors)

	    d['title'] = get_node_vals(doc, 'ArticleTitle')
	    pages = get_node_vals(doc, 'MedlinePgn')
	    d['pages'] = pages

	    # get_node_vals() may return a number, a list or None here, none of
	    # which has split(); in that case the value is used unchanged.
	    try:
	        d['firstpage'] = pages.split('-')[0]
	    except AttributeError:
	        d['firstpage'] = pages

	    journal_info = doc.getElementsByTagName('Journal')[0]

	    d['journal'] = (get_node_vals(journal_info, 'ISOAbbreviation') or
	                    get_node_vals(journal_info, 'Title'))
	    d['year'] = (get_node_vals(journal_info, 'Year') or
	                 get_node_vals(journal_info, 'MedlineDate'))
	    # repr() replaces the deprecated backtick syntax.  It gives a string
	    # for ints, unicode text and lists alike, so the first run of four
	    # digits can be pulled out of any of them.  The year is negated.
	    d['revyear'] = -int(re.findall(r'\d{4}', repr(d['year']))[0])
	    d['volume'] = get_node_vals(journal_info, 'Volume')

	    # When several PMID elements are present the first one is used.
	    pmid = get_node_vals(doc, 'PMID')
	    d['pmid'] = str(pmid if isinstance(pmid, int) else pmid[0])
	    d['abstract'] = get_node_vals(doc, 'AbstractText')

	    # handle unicode: iterate over a snapshot so that assigning into the
	    # dict inside the loop is always safe.
	    for key, value in list(d.items()):
	        if isinstance(value, unicode):
	            d[key] = enc(value)[0]

	    return d


	def check_format(format_value):
	    """Exit with an error unless a ``format_<format_value>`` function exists.

	    The valid values are discovered from the module globals whose names
	    start with ``format_``.  On an unknown value the list of valid values
	    is written to stderr and the process exits with a non-zero status
	    (previously the message went to stdout and the status was 0, so a
	    calling script could not detect the failure).
	    """
	    prefix = 'format_'
	    # Slice the prefix off rather than replace(), which would also remove
	    # any later occurrence of the prefix inside the name.
	    ok_formats = [name[len(prefix):] for name in list(globals())
	                  if name.startswith(prefix)]

	    if format_value in ok_formats:
	        return

	    # Numeric names first, in numeric order, so the message reads 1 2 3 ...
	    ok_formats.sort(
	        key=lambda s: (not s.isdigit(), int(s) if s.isdigit() else 0, s))
	    # sys.exit() with a string prints it to stderr and exits with status 1.
	    sys.exit('format must have one of the following values: %s' %
	             ' '.join(ok_formats))


	def show_all_formats(xmlstr, showkeys=True):
	    """Print an example of every ``format_N`` style for the first article.

	    ``xmlstr`` is parsed with get_document_object() and the first
	    PubmedArticle element supplies the citation data.  Formats are tried
	    as format_1, format_2, ... until a number has no matching function.
	    When ``showkeys`` is true the raw citation dictionary is printed too.
	    """
	    doc = get_document_object(xmlstr)
	    articles = doc.getElementsByTagName('PubmedArticle')
	    d = get_citation_data(articles[0])

	    # get_citation_data() does not provide the PMC keys that main() adds;
	    # supply empty values so formats that reference them still render.
	    d.setdefault('pmcid', None)
	    d.setdefault('pmcid+', '')

	    number = 1
	    while True:
	        # Only a failed lookup ends the loop.  A KeyError raised inside a
	        # format function must not be mistaken for "no more formats".
	        fun = globals().get('format_%s' % number)
	        if fun is None:
	            break
	        print('%s) %s\n%s\n' % (number, fun.__doc__, formatter(fun, d)))
	        number += 1

	    if showkeys:
	        print('Raw citation data')
	        for k in sorted(d):
	            print('( %s ) %s' % (k, d[k]))


	def fetch_refs(pmid_list):
	    """Download the PubMed XML records for the PMIDs in ``pmid_list``.

	    ``pmid_list`` is a sequence of PMID strings; they are sent as one
	    comma-separated ``id`` parameter.  Returns the raw response body.
	    """
	    query = params(
	        db='pubmed',
	        id=','.join(pmid_list),
	        retmode='xml',
	        rettype='medline')
	    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
	    return send_web_query(endpoint + urllib.urlencode(query))


	def format_1(citation_data):
	    """Long format"""
	    # The docstring is printed by show_all_formats(), so it stays short.
	    # Renders: all authors, title, journal, year, volume:pages, PMID.
	    template = ('%(authors_long)s. %(title)s %(journal)s. '
	                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
	    return template % citation_data


	def format_2(citation_data):
	    """Abbreviated authors"""
	    # The docstring is printed by show_all_formats() as the format's
	    # label, which is why its spelling matters.
	    # Renders: short author list, title, journal, year, volume:pages, PMID.
	    template = ('%(authors_short)s. %(title)s %(journal)s. '
	                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s')
	    return template % citation_data


	def format_3(citation_data):
	    """Brief"""
	    # Renders: journal, year, volume:first page, PMID (no authors or title).
	    template = ('%(journal)s %(year)s, '
	                '%(volume)s:%(firstpage)s. PMID %(pmid)s')
	    return template % citation_data


	def format_4(citation_data):
	    """Format tag"""
	    # Builds "<first author>_<year>_<pmid>" and turns every run of
	    # whitespace inside it into a single underscore.
	    tag = '%(first_author_name)s_%(year)s_%(pmid)s' % citation_data
	    words = tag.split()
	    return '_'.join(words)


	def format_5(citation_data):
	    """Brief with first author"""
	    # Renders: first author's last name with "et al", then the brief form.
	    template = ('%(first_author_name)s, et al. %(journal)s %(year)s, '
	                '%(volume)s:%(firstpage)s. PMID %(pmid)s')
	    return template % citation_data


	def format_6(citation_data):
	    """thebibliography entry for latex (for single reference)"""
	    # A \cite command followed by a complete one-item thebibliography
	    # environment, both keyed by the PMID.
	    template = r"""\cite{%(pmid)s}
	\begin{thebibliography}{99}
	\bibitem{%(pmid)s} %(authors_short)s. %(title)s %(journal)s. %(year)s, %(volume)s:%(pages)s. PMID %(pmid)s
	\end{thebibliography}"""
	    return template % citation_data


	def format_7(citation_data):
	    """thebibliography entry for latex (for multiple references)"""
	    # A bare \bibitem line keyed by the PMID, without the surrounding
	    # environment.
	    template = (r'\bibitem{%(pmid)s} %(authors_short)s. %(title)s '
	                r'%(journal)s. %(year)s, %(volume)s:%(pages)s. '
	                r'PMID %(pmid)s')
	    return template % citation_data


	def format_8(citation_data):
	    """Format for refitem custom latex environment"""
	    # Five brace-delimited arguments: authors, title, journal,
	    # year/volume/pages, and PMID.
	    template = (r'\refitem{%(authors_long)s.}{%(title)s}{%(journal)s.}'
	                r'{%(year)s, %(volume)s:%(pages)s.}{PMID %(pmid)s}')
	    return template % citation_data


	def format_9(citation_data):
	    """Abbreviated authors with abstract"""
	    # The docstring is printed by show_all_formats() as the format's
	    # label, which is why its spelling matters.
	    # Renders the short-author citation, a blank line, then the abstract.
	    template = ('%(authors_short)s. %(title)s %(journal)s. '
	                '%(year)s, %(volume)s:%(pages)s. PMID %(pmid)s '
	                '\n\n%(abstract)s')
	    return template % citation_data


	def format_10(citation_data):
	    """MoinMoin numbered list"""
	    # A numbered-list item whose PMID is a link to the PubMed record; the
	    # finished text is passed through escape_camelcase().
	    # The link target and its label are separated by a plain "|"; an
	    # escaped "\|" would put a literal backslash into the output.
	    template = (r' 1. %(authors_long)s. %(title)s %(journal)s. '
	                r'%(year)s, %(volume)s:%(pages)s. '
	                r'[[http://www.ncbi.nlm.nih.gov/pubmed/%(pmid)s|%(pmid)s]]')
	    return escape_camelcase(template % citation_data)


	def format_11(citation_data):
	    """Long format with me in bold html tags"""
	    # Renders an <li> item in which "Hoffman N" / "Hoffman NG" is wrapped
	    # in <b> tags.  Requires the 'pmcid+' key in addition to the usual
	    # citation keys.
	    #
	    # Work on a copy: writing the bold markup back into the caller's dict
	    # would nest the tags on a second call and leak them into any other
	    # format rendered from the same data.
	    data = dict(citation_data)
	    data['authors_long'] = re.sub(
	        r'Hoffman NG?', r'<b>\g<0></b>', data['authors_long'])

	    template = (r'<li>%(authors_long)s. %(title)s %(journal)s. '
	                r'%(year)s, %(volume)s:%(pages)s. %(pmcid+)s</li>')
	    return template % data


	def escape_camelcase(instr):
	    """Return ``instr`` with '!' inserted before every CamelCase word.

	    A word qualifies when it starts with a capital letter and, after that
	    first letter, contains both another capital and a lowercase letter.
	    """
	    # The "*" quantifiers and the bare "|" alternation are essential:
	    # without them the pattern only matches fixed-length words that
	    # contain a literal pipe character.
	    camelcase = r'\b[A-Z][a-zA-Z]*(?:[a-z][a-zA-Z]*[A-Z]|[A-Z][a-zA-Z]*[a-z])[a-zA-Z]*\b'
	    # \g<0> stands for the whole matched word.
	    return re.sub(camelcase, r'!\g<0>', instr)


	def formatter(format_fun, d):
	    """Apply ``format_fun`` to ``d`` and tidy up repeated periods.

	    Every run of two to ten consecutive periods in the formatted text is
	    replaced by a single period.
	    """
	    rendered = format_fun(d)
	    return re.compile(r'\.{2,10}').sub('.', rendered)


	def get_stdin(outval='file'):
	    """Return piped or redirected standard input, or None if there is none.

	    With ``outval='file'`` the stdin file object itself is returned; with
	    ``outval='string'`` its whole content is read and returned stripped.
	    None is returned when stdin is an interactive terminal, is closed or
	    missing, or when ``outval`` is any other value.

	    Input is detected with isatty() instead of probing tell().  The
	    seekability probe ignored input redirected from a regular file and,
	    on platforms where a terminal is not seekable, blocked waiting for
	    the user to type.
	    """
	    stdin = sys.__stdin__
	    try:
	        # isatty() raises ValueError on a closed file.
	        interactive = stdin is None or stdin.isatty()
	    except ValueError as msg:
	        print('Stdin is closed: %s' % msg)
	        return None

	    if interactive:
	        return None
	    if outval == 'file':
	        return stdin
	    if outval == 'string':
	        return stdin.read().strip()
	    return None


	def _fetch_pmcids(pmids):
	    """Return a dict mapping each PMID reported by the PMC service to its
	    PMC id, or to None when the service lists no PMC id for it."""
	    url = ('http://www.pubmedcentral.nih.gov/utils/entrezpmc.cgi?' +
	           ','.join(pmids))
	    pmcids = {}
	    response = urllib2.urlopen(url)
	    try:
	        for line in response:
	            fields = line.split()
	            if not fields:
	                # Blank lines carry no PMID.
	                continue
	            # A line with exactly three whitespace-separated fields holds
	            # the PMC id in the third; any other shape means "no PMC id".
	            pmcids[fields[0]] = fields[2] if len(fields) == 3 else None
	    finally:
	        response.close()
	    return pmcids


	def main():
	    """Command-line entry point: fetch PubMed records and print citations.

	    PMIDs come from the command line and, when present, from standard
	    input.  Each record is rendered with every requested format and
	    written to stdout or to the file given with -o.  With -e an example
	    of every format is printed instead, and -x saves the downloaded XML.
	    """
	    usage = """%prog [options] PMID's
	Format a set of references retrieved from pubmed by pubmed id (PMID)
	Additional PMIDs (whitespace delimited) will be read from stdin if provided.""".strip()

	    parser = OptionParser(
	        usage=usage, version="$Id: gref.py,v 1.1 2005/12/25 23:48:10 nghoffma Exp $")

	    parser.add_option("-o", "--output", dest="filename",
	                      help="write output to FILE", metavar="FILE", default=None)
	    parser.add_option("-f", "--format", dest="format",
	                      help="choose reference format number", default='1')
	    parser.add_option("-e", "--show-examples",
	                      action="store_true", dest="show_examples", default=False,
	                      help="show example of each reference format")
	    parser.add_option("-k", "--show-keys",
	                      action="store_true", dest="show_keys", default=False,
	                      help="show keys into raw citation data (must be accompanied by -e)")
	    parser.add_option("-x", "--xml-output", dest="xmlfile",
	                      help="write downloaded xml output to FILE", metavar="FILE", default=False)
	    parser.add_option("-t", "--sort-terms", dest="sort_terms",
	                      help="sort multiple references according to SORT_TERMS (comma delimited list, eg 'first_author_name,year' [default]). Use -ek for list of terms.", default=None)

	    (options, args) = parser.parse_args()
	    html_formats = {'11'}

	    if options.show_examples:
	        # Prefer a gref.xml next to this script; fall back to downloading
	        # a fixed example record when it cannot be read.
	        example_path = os.path.join(os.path.split(__file__)[0], 'gref.xml')
	        try:
	            with open(example_path) as example:
	                xmlstr = example.read()
	        except IOError:
	            xmlstr = fetch_refs(['16284180'])
	        show_all_formats(xmlstr, showkeys=options.show_keys)
	        sys.exit()

	    stdin = get_stdin('string')
	    if stdin:
	        args += stdin.split()

	    if not args:
	        parser.print_help()
	        sys.exit()

	    # Validate the formats before opening the output file, so a bad -f
	    # value does not create or truncate it.
	    formats = options.format.split(',')
	    for name in formats:
	        check_format(name)

	    xmlstr = fetch_refs(args)
	    doc = get_document_object(xmlstr)
	    articles = doc.getElementsByTagName('PubmedArticle')

	    datalist = [get_citation_data(a) for a in articles]

	    # add pmcids; a PMID missing from the service's answer is treated as
	    # having no PMC id instead of raising KeyError.
	    pmcids = _fetch_pmcids([d['pmid'] for d in datalist]) if datalist else {}
	    for d in datalist:
	        pmcid = pmcids.get(d['pmid'])
	        d['pmcid'] = pmcid
	        d['pmcid+'] = 'PMCID: PMC{}'.format(pmcid) if pmcid else ''

	    if options.sort_terms:
	        datalist = sorted(
	            datalist, key=itemgetter(*options.sort_terms.split(',')))

	    wrap_html = options.format in html_formats
	    outfile = open(options.filename, 'w') if options.filename else sys.stdout
	    try:
	        if wrap_html:
	            outfile.write(r'<html><!--###--><body><ol>')
	            outfile.write('\n')

	        for d in datalist:
	            for name in formats:
	                output = formatter(globals()['format_%s' % name], d)
	                outfile.write(output + '\n')

	        if wrap_html:
	            outfile.write(r'</ol></body><!--###--></html>')
	            outfile.write('\n')
	    finally:
	        # Close only a file opened here; stdout is left open.
	        if outfile is not sys.stdout:
	            outfile.close()

	    if options.xmlfile:
	        with open(options.xmlfile, 'w') as xmlout:
	            xmlout.write(xmlstr)

	# Run the command-line interface only when executed as a script.
	if __name__ == '__main__':
	    main()