Skip to content

Instantly share code, notes, and snippets.

@cydal
Created March 17, 2021 21:36
Show Gist options
  • Save cydal/c047cc79c0f28abd883ebf3e5a498696 to your computer and use it in GitHub Desktop.
Save cydal/c047cc79c0f28abd883ebf3e5a498696 to your computer and use it in GitHub Desktop.
# Import Libraries
import requests
import pandas as pd
import json
## Accumulators for article information.
## append_to_list adds one entry to each of these lists per article.
titles, authors, publisher, doi = [], [], [], []
publishedDate, description, fullText = [], [], []
urll, document_type, types = [], [], []

# Number of result pages to request
pageCount = 100
# Function to append article info to lists
def append_to_list(dic):
    """Append the fields of every hit in ``dic`` to the module-level lists.

    Args:
        dic: Iterable of hit dictionaries. Article fields are read from the
            mapping stored under ``"_source"`` and the hit kind from
            ``"_type"``. ``None`` (or an empty iterable) appends nothing.

    Returns:
        None. The module-level lists are extended in place.

    A field that is absent from a hit is recorded as ``None`` instead of
    raising ``KeyError``. This guarantees that each hit adds exactly one
    entry to every list, so the lists always stay the same length even when
    a record is incomplete.
    """
    # (target list, key inside "_source") pairs, in output-column order.
    source_fields = (
        (titles, "title"),
        (authors, "authors"),
        (publisher, "publisher"),
        (doi, "doi"),
        (publishedDate, "datePublished"),
        (description, "description"),
        (fullText, "fullText"),
        (urll, "downloadUrl"),
        (document_type, "documentType"),
    )
    ## For each article returned
    for hit in dic or ():
        # A missing or null "_source" is treated as a record with no fields.
        source = hit.get("_source") or {}
        for target, key in source_fields:
            target.append(source.get(key))
        types.append(hit.get("_type"))
# Send request to CORE & call append_to_list
# NOTE(review): `url`, `query` and `apikey` are not defined in this part of
# the file; they must be assigned before this loop runs.
# pageCount is the number of pages to fetch, so the range has to end at
# pageCount + 1 for the last page to be requested as well.
for page in range(1, pageCount + 1):
    params = {"page": page, "pageSize": 100, "apiKey": apikey}
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url + query[0], params=params, timeout=60)
    # Surface HTTP errors directly instead of as a later KeyError on "data".
    response.raise_for_status()
    response_json = response.json()
    # A missing or null "data" field is handled as an empty page.
    articles = response_json.get("data") or []
    print("Page - ", page)
    print("Length - ", len(articles))
    append_to_list(articles)
# Build the dictionary that maps each output field name to its list.
# NOTE(review): the "abstract" key holds `description` and the "description"
# key holds `fullText`; kept exactly as written - confirm this is intended.
dicto = dict(
    zip(
        (
            "Title",
            "authors",
            "publisher",
            "doi",
            "publishedDate",
            "abstract",
            "description",
            "url",
            "document_type",
            "type",
        ),
        (
            titles,
            authors,
            publisher,
            doi,
            publishedDate,
            description,
            fullText,
            urll,
            document_type,
            types,
        ),
    )
)

# Save the dictionary as JSON to disk
with open("dicto.json", "w") as outfile:
    json.dump(dicto, outfile)
# Create Pandas Dataframe from the collected lists.
# `dicto` (built earlier in this file) already maps every column name to its
# list, so it is reused here instead of repeating the same mapping by hand;
# a second copy could silently drift out of sync with the JSON output.
ExtractDf = pd.DataFrame(dicto)

# Save pandas dataframe to disk (index=False leaves the row index out)
ExtractDf.to_csv("core.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment