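"""Generate QuickStatements batch commands for PubMed articles missing from Wikidata.

The script seeds itself from a PubMed esearch query, filters out PMIDs that
already have a Wikidata item (via the Wikidata Query Service), then fetches
bibliographic metadata from the NCBI esummary API in batches, printing one
QuickStatements CREATE block per article.
"""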
import html
import requests
import threading

class AskPubMed(threading.Thread):
    """One worker thread; each thread handles a group of PMID packages."""

    def __init__(self, threadID, name, packages):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.packages = packages

    def run(self):
        esummary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&tool=wikidata_worker&[email protected]&id="
        idconv_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?format=json&tool=wikidata_worker&[email protected]&ids="  # defined but never used below
        months = {
            "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
            "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
            "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
        }
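        # Output is QuickStatements v1 batch syntax: "CREATE" starts a new
        # item, and each following "LAST" row adds a statement to it as
        # tab-separated property/value pairs. "S248" appends a "stated in"
        # reference to the preceding claim; "Len"/"Den" set the English
        # label and description.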
        for package in self.packages:
            bunch_of_numbers = ",".join(package)  # comma-separated PMIDs for one batched esummary request
            summary_retriever = requests.get(esummary_url + bunch_of_numbers)
            if summary_retriever.status_code != 200:
                continue
            # Now process the bibliographic metadata from the esummary response.
            summary_retriever_json = summary_retriever.json()
if "result" in summary_retriever_json: | |
for _, pmid_blob in summary_retriever_json["result"].items(): | |
if _ == "uids": | |
continue | |
pmid = pmid_blob["uid"] | |
# First: The basics | |
output_string = "CREATE\n" | |
output_string += "LAST\tP698\t\"" + pmid + "\"\tS248\tQ180686\n" | |
output_string += "LAST\tP31\tQ13442814\tS248\tQ180686\n" | |
output_string += "LAST\tDen\t\"" + "scientific article" + "\"\n" | |
                    # Are there other identifiers we can add?
                    doi = None  # if there is a DOI, this value will be overridden
                    if "articleids" in pmid_blob:
                        for identifier in pmid_blob["articleids"]:
                            if identifier["idtype"] == "doi":
                                doi = identifier["value"]  # keep the DOI for the Crossref lookup later
                                output_string += "LAST\tP356\t\"" + identifier["value"] + "\"\tS248\tQ180686\n"  # P356: DOI
                            elif identifier["idtype"] == "pmc":
                                pmcid = identifier["value"].replace("PMC", "")
                                output_string += "LAST\tP932\t\"" + pmcid + "\"\tS248\tQ180686\n"  # P932: PMCID
                    # Title
                    if "title" in pmid_blob:
                        t = html.unescape(pmid_blob["title"])
                        if t != "":
                            output_string += "LAST\tLen\t\"" + t + "\"\n"
                            output_string += "LAST\tP1476\ten:\"" + t + "\"\tS248\tQ180686\n"  # P1476: title
                    # Publication date
                    if "pubdate" in pmid_blob:
                        pubdate = None
                        pubdate_raw = pmid_blob["pubdate"].split(" ")  # e.g. "2016 Aug 1"
                        if len(pubdate_raw) > 1:
                            if pubdate_raw[1] in months:
                                m = months[pubdate_raw[1]]
                            else:
                                m = "00"  # unrecognized month token, e.g. a season or a range
                        # Wikidata timestamps carry a precision suffix: /11 = day, /10 = month, /9 = year
                        if len(pubdate_raw) == 3:  # precision to the day
                            pubdate = "+{0}-{1}-{2}T00:00:00Z/11".format(pubdate_raw[0], m, pubdate_raw[2].zfill(2))
                        elif len(pubdate_raw) == 2:  # precision to the month
                            pubdate = "+{0}-{1}-00T00:00:00Z/10".format(pubdate_raw[0], m)
                        elif len(pubdate_raw) == 1:  # precision to the year
                            pubdate = "+{0}-00-00T00:00:00Z/9".format(pubdate_raw[0])
                        if pubdate is not None:
                            output_string += "LAST\tP577\t" + pubdate + "\tS248\tQ180686\n"  # P577: publication date
                    # Published in
                    if "issn" in pmid_blob:
                        # Look up the journal item on Wikidata by its ISSN (P236)
                        issn_query_url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fissn%20where%20%7B%20%3Fi%20wdt%3AP236%20%22{0}%22%20%7D"
                        issn_query = requests.get(issn_query_url.format(pmid_blob["issn"])).json()
                        issn_results = issn_query["results"]["bindings"]
                        if len(issn_results) == 1:  # we want no ambiguity here
                            journal = issn_results[0]["i"]["value"].replace("http://www.wikidata.org/entity/", "")
                            output_string += "LAST\tP1433\t" + journal + "\tS248\tQ180686\n"  # P1433: published in
                    # Volume, issue, pages
                    if pmid_blob.get("volume", "") != "":
                        output_string += "LAST\tP478\t\"" + pmid_blob["volume"] + "\"\tS248\tQ180686\n"  # P478: volume
                    if pmid_blob.get("issue", "") != "":
                        output_string += "LAST\tP433\t\"" + pmid_blob["issue"] + "\"\tS248\tQ180686\n"  # P433: issue
                    if pmid_blob.get("pages", "") != "":
                        output_string += "LAST\tP304\t\"" + pmid_blob["pages"] + "\"\tS248\tQ180686\n"  # P304: page(s)
                    # Original language
                    if "lang" in pmid_blob:
                        for langcode in pmid_blob["lang"]:
                            if langcode == "eng":
                                output_string += "LAST\tP364\tQ1860\tS248\tQ180686\n"  # P364: original language; Q1860: English
                                break
                    # Authors: prefer Crossref (via DOI content negotiation), which returns
                    # structured given/family names; fall back to the PubMed author list.
                    authors_not_done = True  # set to False if authors are successfully extracted via Crossref
                    if doi is not None:
                        crossref = requests.get("https://dx.doi.org/" + doi, headers={"Accept": "application/json"})
                        if crossref.status_code == 200:
                            try:
                                crossref_json = crossref.json()
                                if "author" in crossref_json:
                                    authors_not_done = False
                                    author_counter = 0
                                    for author in crossref_json["author"]:
                                        author_counter += 1
                                        a = ""
                                        if "family" in author:
                                            a = author["family"]
                                        if "given" in author:
                                            a = author["given"] + " " + a
                                        # P2093: author name string; P1545: series ordinal (position in the author list)
                                        output_string += "LAST\tP2093\t\"" + a + "\"\tP1545\t\"" + str(author_counter) + "\"\tS248\tQ5188229\n"
                            except ValueError:  # response body was not valid JSON
                                pass
if "authors" in pmid_blob and authors_not_done == True: | |
author_counter = 0 | |
for author in pmid_blob["authors"]: | |
if author["authtype"] == "Author": | |
author_counter += 1 | |
output_string += "LAST\tP2093\t\"" + author["name"] + "\"\tP1545\t\"" + str(author_counter) + "\"\tS248\tQ180686\n" | |
output_string = output_string[:-1] | |
print(output_string) | |

def main(seed_url):
    seed = requests.get(seed_url).json()
    full_pmid_list = list(seed["esearchresult"]["idlist"])
    # Fetch all PMIDs already on Wikidata (P698) so we don't create duplicates
    wikidata = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fp%20where%20%7B%20%3Fi%20wdt%3AP698%20%3Fp%20%7D").json()
    wikidata_pmid_list = [x["p"]["value"] for x in wikidata["results"]["bindings"]]
    pmid_list = list(set(full_pmid_list) - set(wikidata_pmid_list))
    # A list of 200 IDs makes a package. These collectively are the "packages".
    # The package of all these packages is the "package of packages".
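    # For example, the seed query below caps out at 180,000 PMIDs, which would
    # yield 900 packages of 200 IDs each, split across ceil(900 / 225) = 4 threads.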
    packages = [pmid_list[x:x + 200] for x in range(0, len(pmid_list), 200)]
    package_of_packages = [packages[x:x + 225] for x in range(0, len(packages), 225)]
    thread_counter = 0
    for package_group in package_of_packages:  # renamed so it doesn't shadow `packages`
        thread = AskPubMed(thread_counter, "thread-" + str(thread_counter), package_group)
        thread_counter += 1
        thread.start()
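
# The seed query asks PubMed for reviews with free full text whose Entrez date
# falls within the last 1850 days, returning at most 180,000 IDs. The output is
# printed to stdout and can be redirected into a QuickStatements batch.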
if __name__ == "__main__":
    main("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmode=json&[email protected]&tool=wikidata_worker&db=pubmed&term=review[filter]%20free%20full%20text[filter]&reldate=1850&datetype=edat&retmax=180000")