Created May 10, 2012 10:46
Convert records from Medline, MalariaWorld, and Wikipedia Open Access Media Importer from their native formats into BibJSON
# query to retrieve articles relevant to malaria from the index
# and write the result object to search.json
# (change the ES URL target and size param for different datasets)
# size can be discovered by looking at the value in data['hits']['total'] -
# only 10 hits are returned by default, so just increase size to a number
# larger than that total (see the check below)
curl -X GET 'http://localhost:9200/medline2012/record/_search?size=10000&q=gametocyte%20OR%20merozoite%20OR%20sporozoite%20OR%20trophozoite%20OR%20schizont%20OR%20artemisia%20OR%20ITN%20OR%20LLIN%20OR%20malaria%20OR%20mosquito%20OR%20anopheles%20OR%20plasmodium%20OR%20falciparum%20OR%20vivax%20OR%20ovale%20OR%20malariae%20OR%20knowlesi%20OR%20DDT%20OR%20pyrethroid%20OR%20carbamate%20OR%20organophosphate%20OR%20organochlorine%20OR%20bednet%20OR%20repellent%20OR%20artemisinin%20OR%20chloroquine%20OR%20quinine%20OR%20artesunate%20OR%20lumefantrin%20OR%20mefloquine%20OR%20atovaquone%20OR%20paludrine%20OR%20"insecticide%20treated%20bednet"%20OR%20"indoor%20residual%20spraying"' -o search.json
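# to discover that total before picking a size, run the query once with size=0 and
# read hits.total from the response - a minimal sketch, shown here with a shortened
# query for readability:
curl -s -X GET 'http://localhost:9200/medline2012/record/_search?size=0&q=malaria' \
  | python -c "import json,sys; print json.load(sys.stdin)['hits']['total']"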
# query to retrieve articles that identify as belonging to 2010 or 2011,
# either by article year or journal year, from a suitably prepared ES index,
# and write the result object to 2010-2011.json
# (again, change the ES target URL and size as required)
curl -X GET 'http://localhost:9200/malaria/record/_search?size=50000&q=year:2011%20OR%20journal.year:2011%20OR%20year:2010%20OR%20journal.year:2010' -o 2010-2011.json

# these queries could be combined into one, and sent as a JSON data object via POST
# instead of GET (see the sketch below), but I was interested in the intermediate
# results anyway, so did it this way.
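# a sketch of that combined form - POST the query as a JSON body using a
# query_string query; the term list and year filter are abbreviated here for
# readability and would need the full set of terms from above
curl -X POST 'http://localhost:9200/medline2012/record/_search' -d '{
  "size": 10000,
  "query": {
    "query_string": {
      "query": "(malaria OR plasmodium OR anopheles) AND (year:2010 OR year:2011 OR journal.year:2010 OR journal.year:2011)"
    }
  }
}' -o search.json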
# here are all the terms from the above query, shown neatly
gametocyte
merozoite
sporozoite
trophozoite
schizont
artemisia
ITN
LLIN
malaria
mosquito
anopheles
plasmodium
falciparum
vivax
ovale
malariae
knowlesi
DDT
pyrethroid
carbamate
organophosphate
organochlorine
bednet
repellent
artemisinin
chloroquine
quinine
artesunate
lumefantrin
mefloquine
atovaquone
paludrine
"insecticide treated bednet"
"indoor residual spraying"
# reads through all the gzipped XML files in the Medline package, formats each
# record to bibJSON, and writes them to an ElasticSearch index
# (split the files into multiple directories and run multiple copies of this
# script to speed the process up - or add proper threading; a parallel,
# lower-memory variant is sketched after this script)
from xml.etree import ElementTree as ET
import httplib
import json
import os
import gzip

es_url = "localhost:9200"
es_path = "/medline2012/record"

tardir = '/home/ichi/openbiblio/medline/a/'
dirList = os.listdir(tardir)

filecount = 1
for tarname in dirList:
    # raise this threshold to skip files already done when resuming a run
    if filecount > 0:
        print filecount, tardir, tarname
        # read the gzipped file and unpack it alongside the original
        tarobj = gzip.open(tardir + tarname)
        outfile = open(tardir + tarname.replace(".gz", ""), 'w')
        outfile.write(tarobj.read())
        outfile.close()

        # parse the xml file
        # this causes delay and requires enough free memory to hold the whole file
        # (the iterparse sketch after this script avoids that)
        tree = ET.parse(tardir + tarname.replace('.gz', ''))
        elem = tree.getroot()

        # for every record in the xml file, parse it and create a JSON object from it
        recordcount = 0
        for sub in elem:
            # parse the record into a dict
            doc = {}
            doc["collection"] = ["medline"]
            doc["identifier"] = [{"type": "PMID", "id": sub.find("PMID").text}]
            try:
                doc["affiliation"] = sub.find("Affiliation").text
            except:
                pass
            # abstracts are copyright, so they are not kept
            '''
            try:
                otherabstract = sub.find("OtherAbstract")
                doc["abstract"] = otherabstract.find("AbstractText").text
            except:
                pass
            '''
            try:
                keywordlist = sub.find("KeywordList")
                doc["keyword"] = []
                for keyword in keywordlist:
                    doc["keyword"].append(keyword.text)
            except:
                pass
            try:
                grantlist = sub.find("GrantList")
                doc["grants"] = []
                for grant in grantlist:
                    doc["grants"].append(grant.find("Agency").text)
            except:
                pass
            try:
                comments = sub.find("CommentsCorrectionsList")
                doc["cites"] = []
                for comment in comments:
                    doc["cites"].append({
                        "id": comment.find("RefSource").text,
                        "description": comment.find("PMID").text
                    })
            except:
                pass
            try:
                article = sub.find("Article")
                doc["title"] = article.find("ArticleTitle").text
                doc["language"] = article.find("Language").text
            except:
                pass
            # abstracts are copyright - only keep the copyright/licence statement
            try:
                abstract = article.find("Abstract")
                if abstract is not None:
                    #doc["abstract"] = abstract.find("AbstractText").text
                    doc["license"] = [{"description": abstract.find("CopyrightInformation").text}]
            except:
                pass
            try:
                doi = article.find("ELocationID")
                if doi.attrib["EIdType"] == "doi":
                    # append rather than overwrite, so the PMID identifier is kept too
                    doc["identifier"].append({"type": "DOI", "id": doi.text})
            except:
                pass
            try:
                authorlist = article.find("AuthorList")
                doc["author"] = []
                for author in authorlist:
                    lastname = author.find("LastName").text
                    firstname = author.find("ForeName").text
                    initials = author.find("Initials").text
                    doc["author"].append({
                        "name": lastname + " " + firstname,
                        "lastname": lastname,
                        "firstname": firstname
                    })
            except:
                pass
            try:
                journal = article.find("Journal")
                doc["journal"] = {
                    "name": journal.find("Title").text,
                    "identifier": [
                        {
                            "type": "issn",
                            "id": journal.find("ISSN").text
                        },
                        {
                            "type": "iso",
                            "id": journal.find("ISOAbbreviation").text
                        }
                    ]
                }
                try:
                    journalissue = journal.find("JournalIssue")
                    doc["journal"]["volume"] = journalissue.find("Volume").text
                except:
                    pass
                try:
                    journalpubdate = journalissue.find("PubDate")
                    doc["journal"]["year"] = journalpubdate.find("Year").text
                    doc["journal"]["month"] = journalpubdate.find("Month").text
                except:
                    pass
            except:
                pass
            try:
                articledate = article.find("ArticleDate")
                doc["year"] = articledate.find("Year").text
                doc["month"] = articledate.find("Month").text
                doc["day"] = articledate.find("Day").text
            except:
                pass

            # dump the dict to JSON and send it to the ES index
            data = json.dumps(doc)
            c = httplib.HTTPConnection(es_url)
            c.request('POST', es_path, data)
            result = c.getresponse()
            #print result.status, data, result.reason

            # increment the record count then it is time to loop
            recordcount += 1

        # tidy up the decompressed file before moving on
        print recordcount
        del tarobj, tree, elem
        os.remove(tardir + tarname.replace(".gz", ""))
    # increment the file count then it is time to loop
    filecount += 1
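# a minimal sketch (not part of the original gist) of the parallel, lower-memory
# variant mentioned above: iterparse streams each MedlineCitation record instead of
# loading the whole file, and a multiprocessing pool spreads the .gz files across
# workers; es_url, es_path and the directory are assumed to be the same as in the
# script above, and build_doc() is a hypothetical stand-in for its per-record parsing
from xml.etree import cElementTree as ET
from multiprocessing import Pool
import httplib
import json
import os
import gzip

es_url = "localhost:9200"
es_path = "/medline2012/record"
tardir = '/home/ichi/openbiblio/medline/a/'

def build_doc(citation):
    # hypothetical stand-in for the full field mapping shown in the script above
    return {"collection": ["medline"],
            "identifier": [{"type": "PMID", "id": citation.find("PMID").text}]}

def process_file(tarname):
    count = 0
    # iterparse reads the gzip stream record by record, so the decompressed file
    # never has to sit in memory or be written to disk
    for event, elem in ET.iterparse(gzip.open(tardir + tarname)):
        if elem.tag == "MedlineCitation":
            c = httplib.HTTPConnection(es_url)
            c.request('POST', es_path, json.dumps(build_doc(elem)))
            c.getresponse()
            elem.clear()  # free the bulk of the parsed record as we go
            count += 1
    return tarname, count

if __name__ == '__main__':
    gzfiles = [f for f in os.listdir(tardir) if f.endswith('.gz')]
    pool = Pool(4)  # roughly one worker per core, adjust to taste
    for name, count in pool.map(process_file, gzfiles):
        print name, count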
# converts the MalariaWorld data in their CSV dump format into bibJSON and writes the result to file
import csv, json, re

infile = open('MalariaWorld-articles.csv')
files = csv.DictReader(infile)
outfile = open('bib.json', 'w')
outfile.write('[\n')

count = 0
for record in files:
    #print json.dumps(record, indent=4)
    if count != 0:
        outfile.write(',\n')
    count += 1
    brecord = {}

    # author strings arrive comma-separated, with the last pair joined by " and "
    # (a worked example of this splitting follows the script)
    if record.get('Author(s)'):
        auths = record['Author(s)'].split(',')
        authss = []
        for auth in auths:
            if " and " in auth:
                sauths = auth.split(" and ")
                for s in sauths:
                    authss.append(s.strip())
            else:
                authss.append(auth.strip())
        brecord["author"] = [{"name": i} for i in authss]

    # titles prefixed "Open Access | ..." mark open-access articles
    if record.get('Title'):
        m = re.match('Open Access', record["Title"])
        if m:
            title = record["Title"].replace('Open Access', '')
            if ' | ' in title:
                title = title.replace(' | ', '')
            brecord["license"] = [{"description": "open access"}]
        else:
            title = record["Title"]
        brecord["title"] = title

    # prefer the full body text, fall back to the teaser
    if record.get('Body'):
        brecord["abstract"] = record["Body"]
    elif record.get('Teaser'):
        brecord["abstract"] = record["Teaser"]

    if record.get('Tags'):
        brecord["keyword"] = [i.strip() for i in record["Tags"].split(',')]

    # the Reference column starts with the journal name and usually contains a year
    if record.get('Reference'):
        journal = record["Reference"].split(',')[0]
        m = re.search(r'\d{4}', record["Reference"])
        if m:
            year = m.group(0)
            if 1980 < int(year) < 2013:
                print year
                journal = journal.replace(year, '').strip()
            else:
                year = None
        else:
            year = None
        brecord["journal"] = {"name": journal}
        if year:
            brecord["journal"]["year"] = year

    if record.get("URL"):
        brecord["link"] = [{"url": record["URL"]}]

    #if record.get("Calais Entity Name"):
    #    brecord["centity"] = record["Calais Entity Name"]
    #if record.get("Calais Linked Data URI"):
    #    brecord["cld"] = record["Calais Linked Data URI"]
    #if record.get("Calais Relevance Score"):
    #    brecord["crs"] = record["Calais Relevance Score"]

    outfile.write(json.dumps(brecord, indent=4))
    #print count

outfile.write('\n]')
outfile.close()
infile.close()
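# a quick illustration (with a made-up author string, not real MalariaWorld data)
# of how the splitting above behaves - the commas and the final " and " both
# become breaks between names
authors = "Smith J, Jones A and Brown K"
names = []
for part in authors.split(','):
    if " and " in part:
        names.extend(p.strip() for p in part.split(" and "))
    else:
        names.append(part.strip())
print names   # ['Smith J', 'Jones A', 'Brown K']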
# add this to oa-cache in the open-access-media-importer
# it also requires adding 'bibjson-articles' to the top list of accepted actions,
# and some fixing of the code, but it gives the core idea of how the bibJSON is made
if action == 'bibjson-articles':
    outfile = open('bib.json', 'w')
    outfile.write('[\n')
    count = 0
    # categories based on:
    # “Citation Rules with Examples for Journal Articles on the Internet”
    # <http://www.ncbi.nlm.nih.gov/books/NBK7281/#A55596>
    source_path = config.get_metadata_raw_source_path(target)
    for result in source_module.list_articles(source_path):
        dataset = {
            "author": [{"name": i.strip()} for i in result['article-contrib-authors'].split(',')],
            "title": result['article-title'],
            "abstract": result['article-abstract']
        }
        if result['journal-title']:
            dataset["journal"] = {"name": result['journal-title']}
        if result['article-license-url']:
            dataset["license"] = [{"url": result['article-license-url']}]
        if result['article-url']:
            dataset["link"] = [{"url": result['article-url']}]
        if result['article-copyright-holder']:
            dataset["copyright"] = result['article-copyright-holder']
        if result['article-id']:
            dataset["identifier"] = [{"type": "PMCID", "id": result['article-id']}]
        if result['article-date']:
            yearparts = result['article-date'].split('-')
            dataset["year"] = yearparts[0]
            if len(yearparts) > 1:
                dataset["month"] = yearparts[1]
            if len(yearparts) > 2:
                dataset["day"] = yearparts[2]
        if count != 0:
            outfile.write(',\n')
        outfile.write(json.dumps(dataset, indent=4))
        #stdout.write(json.dumps(dataset, indent=4))
        count += 1
    print count
    outfile.write('\n]')
    outfile.close()
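# for illustration only, one entry in the resulting bib.json would take roughly this
# shape; the values below are invented, not taken from a real PMC article
# {
#     "author": [{"name": "Smith J"}, {"name": "Jones A"}],
#     "title": "A hypothetical article title",
#     "abstract": "A hypothetical abstract...",
#     "journal": {"name": "A Hypothetical Journal"},
#     "license": [{"url": "http://creativecommons.org/licenses/by/2.0/"}],
#     "link": [{"url": "http://example.org/article"}],
#     "copyright": "A Hypothetical Copyright Holder",
#     "identifier": [{"type": "PMCID", "id": "1234567"}],
#     "year": "2011",
#     "month": "05",
#     "day": "10"
# }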
# given a JSON file containing an ES search result object,
# this script will send all the results to a specified ES index
# (a bulk-indexing variant is sketched below)
import httplib
import json

es_url = "localhost:9200"
es_path = "/malaria20102011/record"

res = json.load(open('2010-2011.json'))
for result in [i['_source'] for i in res['hits']['hits']]:
    c = httplib.HTTPConnection(es_url)
    c.request('POST', es_path, json.dumps(result))
    response = c.getresponse()
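# a sketch of the same job using ElasticSearch's bulk API, which cuts the
# per-document HTTP round trips down to a single request; the index and type
# names are assumed to match the script above, and very large dumps would need
# to be sent in chunks rather than as one body
import httplib
import json

res = json.load(open('2010-2011.json'))
lines = []
for hit in res['hits']['hits']:
    # each document is preceded by an action line telling ES where to index it
    lines.append(json.dumps({"index": {"_index": "malaria20102011", "_type": "record"}}))
    lines.append(json.dumps(hit['_source']))
body = '\n'.join(lines) + '\n'   # the bulk body must end with a newline

c = httplib.HTTPConnection("localhost:9200")
c.request('POST', '/_bulk', body)
print c.getresponse().status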