@markmacgillivray
Created May 10, 2012 10:46
Convert records from Medline, MalariaWorld, and Wikipedia Open Access Media Importer from their native formats into BibJSON
# query to retrieve articles relevant to malaria from indices
# and write the result object to search.json
# (change ES URL target and size param for different datasets)
# size can be discovered by looking at the value in data['hits']['total']
# 10 are returned at a time by default, so just increase to a number larger than total
curl -X GET 'http://localhost:9200/medline2012/record/_search?size=10000&q=gametocyte%20OR%20merozoite%20OR%20sporozoite%20OR%20trophozoite%20OR%20schizont%20OR%20artemisia%20OR%20ITN%20OR%20LLIN%20OR%20malaria%20OR%20mosquito%20OR%20anopheles%20OR%20plasmodium%20OR%20falciparum%20OR%20vivax%20OR%20ovale%20OR%20malariae%20OR%20knowlesi%20OR%20DDT%20OR%20pyrethroid%20OR%20carbamate%20OR%20organophosphate%20OR%20organochlorine%20OR%20bednet%20OR%20repellent%20OR%20artemisinin%20OR%20chloroquine%20OR%20quinine%20OR%20artesunate%20OR%20lumefantrine%20OR%20mefloquine%20OR%20atovaquone%20OR%20paludrine%20OR%20"insecticide%20treated%20bednet"%20OR%20"indoor%20residual%20spraying"' -o search.json
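# to check the total before choosing the size param, a quick sketch in Python
# (assuming search.json has already been written by the query above):
import json
print json.load(open('search.json'))['hits']['total']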
# query to retrieve articles that identify as belonging to 2010 or 2011,
# either by article year or journal year, from a suitably prepared ES index
# and write the result object to 2010-2011.json
# (again, change the ES target URL and size as required)
curl -X GET 'http://localhost:9200/malaria/record/_search?size=50000&q=year:2011%20OR%20journal.year:2011%20OR%20year:2010%20OR%20journal.year:2010' -o 2010-2011.json
# these queries could be combined into one and sent as a JSON data object via POST
# instead of GET (a sketch of that follows the term list below), but I was
# interested in the intermediate results anyway, so did it this way.
# here are all the terms from the above query, shown neatly
gametocyte
merozoite
sporozoite
trophozoite
schizont
artemisia
ITN
LLIN
malaria
mosquito
anopheles
plasmodium
falciparum
vivax
ovale
malariae
knowlesi
DDT
pyrethroid
carbamate
organophosphate
organochlorine
bednet
repellent
artemisinin
chloroquine
quinine
artesunate
lumefantrine
mefloquine
atovaquone
paludrine
"insecticide treated bednet"
"indoor residual spraying"
# reads through all the gzipped XML files in the Medline package, converts
# each record to bibJSON, and writes them to an ElasticSearch index
# (split the files into multiple directories and run multiple copies of this
# script to speed the process - or parallelise properly; see the sketch after
# the script)
from xml.etree import ElementTree as ET
import httplib
import json
import os
import gzip

es_url = "localhost:9200"
es_path = "/medline2012/record"
tardir = '/home/ichi/openbiblio/medline/a/'
dirList = os.listdir(tardir)
filecount = 1
for tarname in dirList:
    # raise this threshold to skip files already done when re-running
    if filecount > 0:
        print filecount, tardir, tarname
        # unpack the gzipped XML file alongside the original
        tarobj = gzip.open(tardir + tarname)
        outfile = open(tardir + tarname.replace(".gz",""), 'w')
        outfile.write(tarobj.read())
        outfile.close()
        # parse the XML file
        # this causes delay and requires enough free memory to fit the file
        tree = ET.parse(tardir + tarname.replace('.gz',''))
        elem = tree.getroot()
        # for every item in the XML file, parse it and create a JSON object of it
        recordcount = 0
        for sub in elem:
            #print filecount, recordcount
            # parse the item into a dict
            doc = {}
            doc["collection"] = ["medline"]
            doc["identifier"] = [{"type":"PMID","id":sub.find("PMID").text}]
            try:
                doc["affiliation"] = sub.find("Affiliation").text
            except:
                pass
            # abstracts are copyright
            '''
            try:
                otherabstract = sub.find("OtherAbstract")
                doc["abstract"] = otherabstract.find("AbstractText").text
            except:
                pass
            '''
            try:
                keywordlist = sub.find("KeywordList")
                doc["keyword"] = []
                for keyword in keywordlist:
                    doc["keyword"].append(keyword.text)
            except:
                pass
            try:
                grantlist = sub.find("GrantList")
                doc["grants"] = []
                for grant in grantlist:
                    doc["grants"].append(grant.find("Agency").text)
            except:
                pass
            try:
                comments = sub.find("CommentsCorrectionsList")
                doc["cites"] = []
                for comment in comments:
                    doc["cites"].append({
                        "id":comment.find("RefSource").text,
                        "description":comment.find("PMID").text
                    })
            except:
                pass
            try:
                article = sub.find("Article")
                doc["title"] = article.find("ArticleTitle").text
                doc["language"] = article.find("Language").text
            except:
                pass
            # abstracts are copyright, so only the copyright statement is kept
            try:
                abstract = article.find("Abstract")
                if abstract is not None:
                    #doc["abstract"] = abstract.find("AbstractText").text
                    doc["license"] = [{"description":abstract.find("CopyrightInformation").text}]
            except:
                pass
            try:
                doi = article.find("ELocationID")
                if doi.attrib["EIdType"] == "doi":
                    doc["identifier"] = [{"type":"DOI","id":doi.text}]
            except:
                pass
            try:
                authorlist = article.find("AuthorList")
                doc["author"] = []
                for author in authorlist:
                    lastname = author.find("LastName").text
                    firstname = author.find("ForeName").text
                    doc["author"].append({
                        "name": lastname + " " + firstname,
                        "lastname":lastname,
                        "firstname":firstname
                    })
            except:
                pass
            try:
                journal = article.find("Journal")
                doc["journal"] = {
                    "name":journal.find("Title").text,
                    "identifier":[
                        {
                            "type": "issn",
                            "id": journal.find("ISSN").text
                        },
                        {
                            "type": "iso",
                            "id": journal.find("ISOAbbreviation").text
                        }
                    ]
                }
                try:
                    journalissue = journal.find("JournalIssue")
                    doc["journal"]["volume"] = journalissue.find("Volume").text
                except:
                    pass
                try:
                    journalpubdate = journalissue.find("PubDate")
                    doc["journal"]["year"] = journalpubdate.find("Year").text
                    doc["journal"]["month"] = journalpubdate.find("Month").text
                except:
                    pass
            except:
                pass
            try:
                articledate = article.find("ArticleDate")
                doc["year"] = articledate.find("Year").text
                doc["month"] = articledate.find("Month").text
                doc["day"] = articledate.find("Day").text
            except:
                pass
            # dump the dict to JSON
            data = json.dumps(doc)
            # send to the ES index
            c = httplib.HTTPConnection(es_url)
            c.request('POST', es_path, data)
            result = c.getresponse()
            #print result.status, data, result.reason
            # increment the record count then loop
            recordcount += 1
        # tidy up
        print recordcount
        del tarobj, tree, elem
        os.remove(tardir + tarname.replace(".gz",""))
    # increment the file count then loop
    filecount += 1
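# a minimal sketch of the parallel alternative mentioned above, assuming the
# body of the file loop is wrapped in a (hypothetical) process_file(tarname)
# function that unpacks, parses and indexes one file:
from multiprocessing import Pool

if __name__ == '__main__':
    pool = Pool(4)  # one worker per core; adjust to taste
    pool.map(process_file, os.listdir(tardir))
    pool.close()
    pool.join()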
# converts the MalariaWorld data in their CSV dump format into bibJSON
# and writes the result to file
import csv, json, re

infile = open('MalariaWorld-articles.csv')
files = csv.DictReader(infile)
outfile = open('bib.json','w')
outfile.write('[\n')
count = 0
for record in files:
    #print json.dumps(record,indent=4)
    if count != 0:
        outfile.write(',\n')
    count += 1
    brecord = {}
    if 'Author(s)' in record:
        # author strings are comma-separated, sometimes with a trailing "X and Y"
        auths = record['Author(s)'].split(',')
        authss = []
        for auth in auths:
            if " and " in auth:
                sauths = auth.split(" and ")
                for s in sauths:
                    authss.append(s.strip())
            else:
                authss.append(auth.strip())
        brecord["author"] = [{"name":i} for i in authss]
    if 'Title' in record:
        # titles prefixed "Open Access" mark openly licensed articles
        m = re.match('Open Access',record["Title"])
        if m:
            title = record["Title"].replace('Open Access','')
            if ' | ' in title:
                title = title.replace(' | ','')
            brecord["license"] = [{"description":"open access"}]
        else:
            title = record["Title"]
        brecord["title"] = title
    if 'Body' in record:
        brecord["abstract"] = record["Body"]
    elif 'Teaser' in record:
        brecord["abstract"] = record["Teaser"]
    if 'Tags' in record:
        brecord["keyword"] = [i.strip() for i in record["Tags"].split(',')]
    if 'Reference' in record:
        # the journal name is the first comma-separated part of the reference;
        # a plausible four-digit year anywhere in it is kept as the year
        journal = record["Reference"].split(',')[0]
        m = re.search('\d{4}',record["Reference"])
        if m:
            year = m.group(0)
            if 1980 < int(year) < 2013:
                print year
                journal = journal.replace(year,'').strip()
            else:
                year = None
        else:
            year = None
        brecord["journal"] = {"name":journal}
        if year:
            brecord["journal"]["year"] = year
    if "URL" in record:
        brecord["link"] = [{"url":record["URL"]}]
    #if "Calais Entity Name" in record:
    #    brecord["centity"] = record["Calais Entity Name"]
    #if "Calais Linked Data URI" in record:
    #    brecord["cld"] = record["Calais Linked Data URI"]
    #if "Calais Relevance Score" in record:
    #    brecord["crs"] = record["Calais Relevance Score"]
    outfile.write(json.dumps(brecord,indent=4))
    #print count
outfile.write('\n]')
outfile.close()
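# sanity check - the hand-assembled JSON array in bib.json should load cleanly
# (a quick sketch to run after the script above has finished):
import json
records = json.load(open('bib.json'))
print len(records), "records converted"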
# add this to oa-cache in the open-access-media-importer
# (also requires adding 'bibjson-articles' to the top list of accepted actions,
# and some fixing of the code, but this gives the core idea of how the bibJSON
# is made)
if action == 'bibjson-articles':
    outfile = open('bib.json','w')
    outfile.write('[\n')
    count = 0
    # categories based on:
    # “Citation Rules with Examples for Journal Articles on the Internet”
    # <http://www.ncbi.nlm.nih.gov/books/NBK7281/#A55596>
    source_path = config.get_metadata_raw_source_path(target)
    for result in source_module.list_articles(source_path):
        dataset = {
            "author": [{"name":i.strip()} for i in result['article-contrib-authors'].split(',')],
            "title": result['article-title'],
            "abstract": result['article-abstract']
        }
        if result['journal-title']:
            dataset["journal"] = {"name":result['journal-title']}
        if result['article-license-url']:
            dataset["license"] = [{"url":result['article-license-url']}]
        if result['article-url']:
            dataset["link"] = [{"url":result['article-url']}]
        if result['article-copyright-holder']:
            dataset["copyright"] = result['article-copyright-holder']
        if result['article-id']:
            dataset["identifier"] = [{"type":"PMCID","id":result['article-id']}]
        if result['article-date']:
            # dates arrive as YYYY-MM-DD, with month and day optional
            yearparts = result['article-date'].split('-')
            dataset["year"] = yearparts[0]
            if len(yearparts) > 1:
                dataset["month"] = yearparts[1]
            if len(yearparts) > 2:
                dataset["day"] = yearparts[2]
        if count != 0:
            outfile.write(',\n')
        outfile.write(json.dumps(dataset,indent=4))
        #stdout.write(json.dumps(dataset,indent=4))
        count += 1
    print count
    outfile.write('\n]')
    outfile.close()
# given a JSON file containing an ES search result object,
# this script sends all the results to a specified ES index
import httplib
import json

es_url = "localhost:9200"
es_path = "/malaria20102011/record"
res = json.load(open('2010-2011.json'))
for record in [i['_source'] for i in res['hits']['hits']]:
    c = httplib.HTTPConnection(es_url)
    c.request('POST', es_path, json.dumps(record))
    response = c.getresponse()
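# the same load could also be done in a single request via the ES bulk API -
# a sketch, assuming the same input file and target index as above:
lines = []
for hit in res['hits']['hits']:
    lines.append(json.dumps({"index": {"_index": "malaria20102011", "_type": "record"}}))
    lines.append(json.dumps(hit['_source']))
c = httplib.HTTPConnection(es_url)
c.request('POST', '/_bulk', '\n'.join(lines) + '\n')
print c.getresponse().status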