Skip to content

Instantly share code, notes, and snippets.

@lawlesst
Created May 18, 2012 01:01
Show Gist options
  • Save lawlesst/2722536 to your computer and use it in GitHub Desktop.
Save lawlesst/2722536 to your computer and use it in GitHub Desktop.
Parser for retrieving PubMed metadata
"""
Lookup PMIDS and parse the returned metadata into the BibJSON format.
See: http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
"""
from lxml import etree
import urllib
from pprint import pprint
class BibJSON(object):
def __init__(self, pmid):
self.pmid = pmid
handle = urllib.urlopen('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=%s' % pmid)
self.doc = etree.parse(handle)
self.dsum = self.doc.xpath('/eSummaryResult/DocSum')[0]
def _x(self, path, text=True):
if text:
return self.dsum.xpath('%s/text()' % path)[0]
else:
return self.dsum.xpath(path)
def _name(self, name):
try:
return self._x('Item[@Name="%s"][1]' % name)
except IndexError:
return ''
def identifiers(self):
out = [
{'type': 'pmid',
'value': 'pmid:%s' % self.pmid}
]
d = {'type': 'doi',
'value': "doi:%s" % self._name('DOI')}
out.append(d)
d = {}
for k in ['ISSN', 'ESSN']:
v = self._name(k)
if v != '':
if k == 'ESSN':
k = 'eissn'
out.append({'type': k.lower(), 'value': v})
#Look at article ids section
aids = self.doc.xpath('/eSummaryResult/DocSum/Item[@Name="ArticleIds"]')[0]
for aid in aids:
if aid.get('Name') == 'pmc':
out.append({'type': 'pmcid', 'value': aid.text})
#Try pubmed central id
#pmc = self.xpath
return out
def authors(self):
#author => [{'lastname': 'Frogner', 'name': 'BK Frogner', 'firstname': 'BK'}]
out = []
author_list = self.doc.xpath('/eSummaryResult/DocSum/Item[@Name="AuthorList"]')[0]
for auth in author_list:
out.append({'name': auth.text})
return out
def type(self):
pub = self.doc.xpath('/eSummaryResult/DocSum/Item/Item[@Name="PubType"]')[0].text
if pub == 'Journal Article':
return 'article'
return pub
def parse(self):
def _name(name):
try:
return self._x('Item[@Name="%s"][1]' % name)
except IndexError:
return ''
b = {}
b['type'] = self.type()
b['title'] = _name('Title')
b['year'] = _name('PubDate')[:4]
b['volume'] = _name('Volume')
b['issue'] = _name('Issue')
b['pages'] = _name('Pages')
if b['pages'] != '':
chunks = b['pages'].split('-')
b['start_page'] = chunks[0]
b['end_page'] = chunks[1]
b['journal'] = {'name': _name('FullJournalName'),
'shortcode': _name('Source')}
b['_pubstatus'] = _name('PubStatus')
b['identifiers'] = self.identifiers()
b['link'] = []
for bid in b['identifiers']:
if bid['type'] == 'pmcid':
b['link'].append({'anchor': 'Full text from PubMed Central',
'url': 'http://www.ncbi.nlm.nih.gov/pmc/articles/%s' % bid['value']})
if b['link'] == []:
del b['link']
b['author'] = self.authors()
return b
def to_bibjson(pmid):
d = BibJSON(pmid).parse()
return d
to_check = [
'6869634',
'9285281',
'8996046'
]
from bibjsontools import to_openurl
for pmid in to_check:
print pmid
bib = to_bibjson(pmid)
pprint(bib)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment