epoz · April 23, 2020 21:26
diff --git a/scan_publishers.py b/scan_publishers.py
 import json
 import gzip
 import os
 from progress.bar import Bar

 # Scan the Crossref data dump mentioned in https://twitter.com/CrossrefOrg/status/1250146935861886976
 # and parse out the publisher names, so you know where in the giant dump your own data can be found.
 # Note: this script uses the progress library, so run "pip install progress" before running it.

 # Every gzipped JSON chunk in the current directory. Sorted so the resulting
 # mapping does not depend on the arbitrary order returned by os.listdir().
 filenames = sorted(filename for filename in os.listdir('.') if filename.endswith('.json.gz'))
 b = Bar("Scanning .json.gz files", max=len(filenames))
 # Maps publisher name -> list of every file containing at least one of its items.
 publishers = {}
 for filename in filenames:
     b.next()
     # Close each archive as soon as it has been parsed instead of leaking the handle.
     with gzip.open(filename) as archive:
         data = json.load(archive)
     # Publishers already recorded for *this* file. The previous check
     # (`if pub in publishers: continue`) skipped a publisher once it had been
     # seen in any earlier file, so only the first file was ever listed.
     seen_in_file = set()
     for item in data.get("items", []):
         pub = item.get("publisher")
         if pub is None or pub in seen_in_file:
             continue
         seen_in_file.add(pub)
         publishers.setdefault(pub, []).append(filename)
 b.finish()
 # Use a context manager so the mapping is flushed and the file closed deterministically.
 with open('publisher_mapping.json', 'w') as out:
     json.dump(publishers, out, indent=2)
	import json
	import gzip
	import os
	from progress.bar import Bar

	# Scan the Crossref data dump mentioned in https://twitter.com/CrossrefOrg/status/1250146935861886976
	# and parse out the publisher names, so you know where in the giant dump your own data can be found.
	# Note: this script uses the progress library, so run "pip install progress" before running it.

	# Every gzipped JSON chunk in the current directory. Sorted so the resulting
	# mapping does not depend on the arbitrary order returned by os.listdir().
	filenames = sorted(filename for filename in os.listdir('.') if filename.endswith('.json.gz'))
	b = Bar("Scanning .json.gz files", max=len(filenames))
	# Maps publisher name -> list of every file containing at least one of its items.
	publishers = {}
	for filename in filenames:
	    b.next()
	    # Close each archive as soon as it has been parsed instead of leaking the handle.
	    with gzip.open(filename) as archive:
	        data = json.load(archive)
	    # Publishers already recorded for *this* file. The previous check
	    # (`if pub in publishers: continue`) skipped a publisher once it had been
	    # seen in any earlier file, so only the first file was ever listed.
	    seen_in_file = set()
	    for item in data.get("items", []):
	        pub = item.get("publisher")
	        if pub is None or pub in seen_in_file:
	            continue
	        seen_in_file.add(pub)
	        publishers.setdefault(pub, []).append(filename)
	b.finish()
	# Use a context manager so the mapping is flushed and the file closed deterministically.
	with open('publisher_mapping.json', 'w') as out:
	    json.dump(publishers, out, indent=2)
No results found