@markmacgillivray
Created May 10, 2012 10:46
Convert records from Medline, MalariaWorld, and Wikipedia Open Access Media Importer from their native formats into BibJSON
# query to retrieve articles relevant to malaria from indices
# and write the result object to search.json
# (change ES URL target and size param for different datasets)
# size can be discovered by looking at the value in data['hits']['total']
# 10 are returned at a time by default, so just increase to a number larger than total
curl -X GET 'http://localhost:9200/medline2012/record/_search?size=10000&q=gametocyte%20OR%20merozoite%20OR%20sporozoite%20OR%20trophozoite%20OR%20schizont%20OR%20artemisia%20OR%20ITN%20OR%20LLIN%20OR%20malaria%20OR%20mosquito%20OR%20anopheles%20OR%20plasmodium%20OR%20falciparum%20OR%20vivax%20OR%20ovale%20OR%20malariae%20OR%20knowlesi%20OR%20DDT%20OR%20pyrethroid%20OR%20carbamate%20OR%20organophosphate%20OR%20organochlorine%20OR%20bednet%20OR%20repellent%20OR%20artemisinin%20OR%20chloroquine%20OR%20quinine%20OR%20artesunate%20OR%20lumefantrine%20OR%20mefloquine%20OR%20atovaquone%20OR%20paludrine%20OR%20"insecticide%20treated%20bednet"%20OR%20"indoor%20residual%20spraying"' -o search.json
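# to check the total before choosing the size param, a quick sketch in Python
# (assuming search.json has already been written by the query above):
import json
print json.load(open('search.json'))['hits']['total']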
# query to retrieve articles that identify as belonging to 2010 or 2011,
# either by article year or journal year, from a suitably prepared ES index
# and write the result object to 2010-2011.json
# (again, change the ES target URL and size as required)
curl -X GET 'http://localhost:9200/malaria/record/_search?size=50000&q=year:2011%20OR%20journal.year:2011%20OR%20year:2010%20OR%20journal.year:2010' -o 2010-2011.json
# these queries could be combined into one and sent as a JSON data object via POST
# instead of GET (a sketch of that follows the term list below), but I was
# interested in the intermediate results anyway, so did it this way.
# here are all the terms from the above query, shown neatly
gametocyte
merozoite
sporozoite
trophozoite
schizont
artemisia
ITN
LLIN
malaria
mosquito
anopheles
plasmodium
falciparum
vivax
ovale
malariae
knowlesi
DDT
pyrethroid
carbamate
organophosphate
organochlorine
bednet
repellent
artemisinin
chloroquine
quinine
artesunate
lumefantrine
mefloquine
atovaquone
paludrine
"insecticide treated bednet"
"indoor residual spraying"
# reads through all the gzipped XML files in the Medline package, converts
# each record to bibJSON, and writes them to an ElasticSearch index
# (split the files into multiple directories and run multiple copies of this
# script to speed the process - or parallelise properly; see the sketch after
# the script)
from xml.etree import ElementTree as ET
import httplib
import json
import os
import gzip

es_url = "localhost:9200"
es_path = "/medline2012/record"
tardir = '/home/ichi/openbiblio/medline/a/'
dirList = os.listdir(tardir)
filecount = 1
for tarname in dirList:
    # raise this threshold to skip files already done when re-running
    if filecount > 0:
        print filecount, tardir, tarname
        # unpack the gzipped XML file alongside the original
        tarobj = gzip.open(tardir + tarname)
        outfile = open(tardir + tarname.replace(".gz",""), 'w')
        outfile.write(tarobj.read())
        outfile.close()
        # parse the XML file
        # this causes delay and requires enough free memory to fit the file
        tree = ET.parse(tardir + tarname.replace('.gz',''))
        elem = tree.getroot()
        # for every item in the XML file, parse it and create a JSON object of it
        recordcount = 0
        for sub in elem:
            #print filecount, recordcount
            # parse the item into a dict
            doc = {}
            doc["collection"] = ["medline"]
            doc["identifier"] = [{"type":"PMID","id":sub.find("PMID").text}]
            try:
                doc["affiliation"] = sub.find("Affiliation").text
            except:
                pass
            # abstracts are copyright
            '''
            try:
                otherabstract = sub.find("OtherAbstract")
                doc["abstract"] = otherabstract.find("AbstractText").text
            except:
                pass
            '''
            try:
                keywordlist = sub.find("KeywordList")
                doc["keyword"] = []
                for keyword in keywordlist:
                    doc["keyword"].append(keyword.text)
            except:
                pass
            try:
                grantlist = sub.find("GrantList")
                doc["grants"] = []
                for grant in grantlist:
                    doc["grants"].append(grant.find("Agency").text)
            except:
                pass
            try:
                comments = sub.find("CommentsCorrectionsList")
                doc["cites"] = []
                for comment in comments:
                    doc["cites"].append({
                        "id":comment.find("RefSource").text,
                        "description":comment.find("PMID").text
                    })
            except:
                pass
            try:
                article = sub.find("Article")
                doc["title"] = article.find("ArticleTitle").text
                doc["language"] = article.find("Language").text
            except:
                pass
            # abstracts are copyright, so only the copyright statement is kept
            try:
                abstract = article.find("Abstract")
                if abstract is not None:
                    #doc["abstract"] = abstract.find("AbstractText").text
                    doc["license"] = [{"description":abstract.find("CopyrightInformation").text}]
            except:
                pass
            try:
                doi = article.find("ELocationID")
                if doi.attrib["EIdType"] == "doi":
                    doc["identifier"] = [{"type":"DOI","id":doi.text}]
            except:
                pass
            try:
                authorlist = article.find("AuthorList")
                doc["author"] = []
                for author in authorlist:
                    lastname = author.find("LastName").text
                    firstname = author.find("ForeName").text
                    doc["author"].append({
                        "name": lastname + " " + firstname,
                        "lastname":lastname,
                        "firstname":firstname
                    })
            except:
                pass
            try:
                journal = article.find("Journal")
                doc["journal"] = {
                    "name":journal.find("Title").text,
                    "identifier":[
                        {
                            "type": "issn",
                            "id": journal.find("ISSN").text
                        },
                        {
                            "type": "iso",
                            "id": journal.find("ISOAbbreviation").text
                        }
                    ]
                }
                try:
                    journalissue = journal.find("JournalIssue")
                    doc["journal"]["volume"] = journalissue.find("Volume").text
                except:
                    pass
                try:
                    journalpubdate = journalissue.find("PubDate")
                    doc["journal"]["year"] = journalpubdate.find("Year").text
                    doc["journal"]["month"] = journalpubdate.find("Month").text
                except:
                    pass
            except:
                pass
            try:
                articledate = article.find("ArticleDate")
                doc["year"] = articledate.find("Year").text
                doc["month"] = articledate.find("Month").text
                doc["day"] = articledate.find("Day").text
            except:
                pass
            # dump the dict to JSON
            data = json.dumps(doc)
            # send to the ES index
            c = httplib.HTTPConnection(es_url)
            c.request('POST', es_path, data)
            result = c.getresponse()
            #print result.status, data, result.reason
            # increment the record count then loop
            recordcount += 1
        # tidy up
        print recordcount
        del tarobj, tree, elem
        os.remove(tardir + tarname.replace(".gz",""))
    # increment the file count then loop
    filecount += 1
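# a minimal sketch of the parallel alternative mentioned above, assuming the
# body of the file loop is wrapped in a (hypothetical) process_file(tarname)
# function that unpacks, parses and indexes one file:
from multiprocessing import Pool

if __name__ == '__main__':
    pool = Pool(4)  # one worker per core; adjust to taste
    pool.map(process_file, os.listdir(tardir))
    pool.close()
    pool.join()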
# converts the MalariaWorld data in their CSV dump format into bibJSON
# and writes the result to file
import csv, json, re

infile = open('MalariaWorld-articles.csv')
files = csv.DictReader(infile)
outfile = open('bib.json','w')
outfile.write('[\n')
count = 0
for record in files:
    #print json.dumps(record,indent=4)
    if count != 0:
        outfile.write(',\n')
    count += 1
    brecord = {}
    if 'Author(s)' in record:
        # author strings are comma-separated, sometimes with a trailing "X and Y"
        auths = record['Author(s)'].split(',')
        authss = []
        for auth in auths:
            if " and " in auth:
                sauths = auth.split(" and ")
                for s in sauths:
                    authss.append(s.strip())
            else:
                authss.append(auth.strip())
        brecord["author"] = [{"name":i} for i in authss]
    if 'Title' in record:
        # titles prefixed "Open Access" mark openly licensed articles
        m = re.match('Open Access',record["Title"])
        if m:
            title = record["Title"].replace('Open Access','')
            if ' | ' in title:
                title = title.replace(' | ','')
            brecord["license"] = [{"description":"open access"}]
        else:
            title = record["Title"]
        brecord["title"] = title
    if 'Body' in record:
        brecord["abstract"] = record["Body"]
    elif 'Teaser' in record:
        brecord["abstract"] = record["Teaser"]
    if 'Tags' in record:
        brecord["keyword"] = [i.strip() for i in record["Tags"].split(',')]
    if 'Reference' in record:
        # the journal name is the first comma-separated part of the reference;
        # a plausible four-digit year anywhere in it is kept as the year
        journal = record["Reference"].split(',')[0]
        m = re.search('\d{4}',record["Reference"])
        if m:
            year = m.group(0)
            if 1980 < int(year) < 2013:
                print year
                journal = journal.replace(year,'').strip()
            else:
                year = None
        else:
            year = None
        brecord["journal"] = {"name":journal}
        if year:
            brecord["journal"]["year"] = year
    if "URL" in record:
        brecord["link"] = [{"url":record["URL"]}]
    #if "Calais Entity Name" in record:
    #    brecord["centity"] = record["Calais Entity Name"]
    #if "Calais Linked Data URI" in record:
    #    brecord["cld"] = record["Calais Linked Data URI"]
    #if "Calais Relevance Score" in record:
    #    brecord["crs"] = record["Calais Relevance Score"]
    outfile.write(json.dumps(brecord,indent=4))
    #print count
outfile.write('\n]')
outfile.close()
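# sanity check - the hand-assembled JSON array in bib.json should load cleanly
# (a quick sketch to run after the script above has finished):
import json
records = json.load(open('bib.json'))
print len(records), "records converted"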
# add this to oa-cache in the open-access-media-importer
# (also requires adding 'bibjson-articles' to the top list of accepted actions,
# and some fixing of the code, but this gives the core idea of how the bibJSON
# is made)
if action == 'bibjson-articles':
    outfile = open('bib.json','w')
    outfile.write('[\n')
    count = 0
    # categories based on:
    # “Citation Rules with Examples for Journal Articles on the Internet”
    # <http://www.ncbi.nlm.nih.gov/books/NBK7281/#A55596>
    source_path = config.get_metadata_raw_source_path(target)
    for result in source_module.list_articles(source_path):
        dataset = {
            "author": [{"name":i.strip()} for i in result['article-contrib-authors'].split(',')],
            "title": result['article-title'],
            "abstract": result['article-abstract']
        }
        if result['journal-title']:
            dataset["journal"] = {"name":result['journal-title']}
        if result['article-license-url']:
            dataset["license"] = [{"url":result['article-license-url']}]
        if result['article-url']:
            dataset["link"] = [{"url":result['article-url']}]
        if result['article-copyright-holder']:
            dataset["copyright"] = result['article-copyright-holder']
        if result['article-id']:
            dataset["identifier"] = [{"type":"PMCID","id":result['article-id']}]
        if result['article-date']:
            # dates arrive as YYYY-MM-DD, with month and day optional
            yearparts = result['article-date'].split('-')
            dataset["year"] = yearparts[0]
            if len(yearparts) > 1:
                dataset["month"] = yearparts[1]
            if len(yearparts) > 2:
                dataset["day"] = yearparts[2]
        if count != 0:
            outfile.write(',\n')
        outfile.write(json.dumps(dataset,indent=4))
        #stdout.write(json.dumps(dataset,indent=4))
        count += 1
    print count
    outfile.write('\n]')
    outfile.close()
# given a JSON file containing an ES search result object,
# this script sends all the results to a specified ES index
import httplib
import json

es_url = "localhost:9200"
es_path = "/malaria20102011/record"
res = json.load(open('2010-2011.json'))
for record in [i['_source'] for i in res['hits']['hits']]:
    c = httplib.HTTPConnection(es_url)
    c.request('POST', es_path, json.dumps(record))
    response = c.getresponse()
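# the same load could also be done in a single request via the ES bulk API -
# a sketch, assuming the same input file and target index as above:
lines = []
for hit in res['hits']['hits']:
    lines.append(json.dumps({"index": {"_index": "malaria20102011", "_type": "record"}}))
    lines.append(json.dumps(hit['_source']))
c = httplib.HTTPConnection(es_url)
c.request('POST', '/_bulk', '\n'.join(lines) + '\n')
print c.getresponse().status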