Created
April 23, 2020 21:26
-
-
Save epoz/389bcc8a0fa61e6995f3556681d34b4b to your computer and use it in GitHub Desktop.
Scan Crossref data dump of 2020-04-14
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip
import json
import os

from progress.bar import Bar

# Scan the Crossref data dump as mentioned in:
# https://twitter.com/CrossrefOrg/status/1250146935861886976
# and parse out the publisher names, so you know where in the giant dump your
# own data can be found.
# Note: this script uses the progress library, so before running do a
# "pip install progress".

# Every file in the current working directory whose name ends in ".json.gz"
# is scanned.
filenames = [filename for filename in os.listdir('.') if filename.endswith('.json.gz')]
b = Bar("Scanning .json.gz files", max=len(filenames))

# Maps each publisher name to the list of files in which it appears.
publishers = {}
for filename in filenames:
    b.next()
    # Close each archive as soon as it has been parsed instead of leaking one
    # open handle per dump file.
    with gzip.open(filename) as dump_file:
        data = json.load(dump_file)
    # De-duplicate per file rather than globally: a publisher that was already
    # found in an earlier file must still be recorded for this one, but a file
    # name should be listed at most once per publisher.
    seen_in_file = set()
    for item in data.get("items", []):
        pub = item.get("publisher")
        if pub is None or pub in seen_in_file:
            continue
        seen_in_file.add(pub)
        publishers.setdefault(pub, []).append(filename)
b.finish()

# Use a context manager so the mapping is flushed and the file is closed.
with open('publisher_mapping.json', 'w') as mapping_file:
    json.dump(publishers, mapping_file, indent=2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment