Created
May 27, 2021 12:36
-
-
Save jaklinger/5f2ed4fc8d3f752d3de50e63b0445bcd to your computer and use it in GitHub Desktop.
Read CORD19 data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import shutil
import tarfile
from io import StringIO
from tempfile import TemporaryFile

import requests
# Template for the download URL of a historical CORD-19 release tarball;
# `{date}` is filled in by cord_csv via str.format.
URL = 'https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_{date}.tar.gz'
# Template for the name of the metadata CSV member looked up inside that
# tarball; `{date}` is filled in by cord_csv via str.format.
CSV_PATH = '{date}/all_sources_metadata_{date}.csv'
def stream_to_file(url, fileobj, *, timeout=None):
    """Stream contents from url to fileobj.

    The response body is copied chunk by chunk from the raw response stream
    into ``fileobj``, so the payload is never held in memory as a whole.
    Afterwards ``fileobj`` is rewound to position 0 so the caller can read
    what was just written.

    Args:
        url: Address to download with an HTTP GET.
        fileobj: Writable, seekable binary file object receiving the body.
        timeout: Optional timeout passed straight to ``requests.get``.
            The default ``None`` keeps the previous behaviour (wait
            indefinitely); pass a number or a ``(connect, read)`` tuple to
            avoid hanging on a stalled server.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before writing anything if the server returned an error page.
        r.raise_for_status()
        shutil.copyfileobj(r.raw, fileobj)
    fileobj.seek(0)  # reset
def cord_csv(date):
    """
    Return only the CSV metadata file from the CORD19 dataset.

    The release tarball is streamed into a temporary file to keep the
    in-memory processing to a minimum (the largest CORD tarfile is 9GB);
    only the metadata CSV member is then read out of it and decoded.

    Args:
        date: Release date string substituted into the ``URL`` and
            ``CSV_PATH`` templates (e.g. ``'2020-03-13'``).

    Returns:
        io.StringIO: In-memory text buffer holding the decoded CSV,
        positioned at the start.

    Raises:
        KeyError: If the tarball has no member at the expected CSV path
            (raised by ``TarFile.extractfile``).
        FileNotFoundError: If that member exists but is not a regular file
            or link, so there is nothing to read.
    """
    url = URL.format(date=date)
    filename = CSV_PATH.format(date=date)
    with TemporaryFile(suffix='.tar.gz') as fileobj:
        stream_to_file(url, fileobj)
        with tarfile.open(fileobj=fileobj) as tf:
            # Named `member`, not `csv`, so the csv module is not shadowed.
            member = tf.extractfile(filename)
            if member is None:
                # extractfile returns None for directories and other
                # non-file members; fail with a clear message instead of an
                # obscure error from the `with` statement below.
                raise FileNotFoundError(
                    f"{filename!r} in {url} is not a regular file"
                )
            with member:
                raw = member.read()
    # 'latin' (latin-1) maps every byte to a character, so decoding cannot
    # fail. NOTE(review): confirm the release CSVs are not UTF-8, otherwise
    # non-ASCII text will come out garbled.
    return StringIO(raw.decode('latin'))
def cord_data(date):
    """Generate the rows of the CORD19 metadata CSV for ``date``.

    Each yielded item is one CSV record as produced by ``csv.DictReader``,
    i.e. a mapping from column header to cell value. The underlying text
    buffer from ``cord_csv`` is closed once iteration finishes.
    """
    with cord_csv(date) as metadata:
        yield from csv.DictReader(metadata)
# Example usage: download the 2020-03-13 release and materialise every
# metadata row as a list of dicts. This runs when the module is executed
# or imported.
data = [*cord_data('2020-03-13')]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment