Created
July 7, 2017 06:07
-
-
Save nad2000/54be888ab893ec06bd55fc0622e485a9 to your computer and use it in GitHub Desktop.
NZ items in ORCID's public data files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 05 11:53:20 2017

Can we get json to do the same tricks as xml, and then pull into mongodb for reporting?

ORCID public data files are tar.gz archives with all json stored serially before the xml,
so we can speed the process up a wee bit by using the json.
The real question is whether the json library is quicker with json than ETree is with xml.

NB: version 2.0 of ORCID schema

@author: Jason
"""
import json
import os
import tarfile

# Locations used by this run: the ORCID public-data archive to scan, the log
# of problem files, and the directories that receive matched / failed records.
TAR_PATH = 'D:/DATA/json_play.tar.gz'
ERROR_LOG_PATH = 'D:/DATA/json_play.error.log'
MATCH_DIR = 'D:/DATA/json_write'
ERROR_DIR = 'D:/DATA/json_error'

# Country code a record must carry to be kept.
COUNTRY_CODE = "NZ"


def _dig(node, *path):
    """Follow *path* (dict keys / list indexes) into *node*.

    Returns the value found, or None as soon as any step is missing or the
    intermediate value cannot be indexed (e.g. it is None).
    """
    for step in path:
        try:
            node = node[step]
        except (KeyError, IndexError, TypeError):
            return None
    return node


def _any_org_in_country(summaries, country):
    """Return True if any affiliation summary's organization is in *country*.

    Each entry is checked independently, so one malformed entry does not
    hide a matching entry that follows it.
    """
    if not isinstance(summaries, list):
        # Section absent or not a list: nothing to check.
        return False
    return any(
        _dig(summary, 'organization', 'address', 'country') == country
        for summary in summaries
    )


def is_nz_record(data, country=COUNTRY_CODE):
    """Return True if the parsed ORCID record *data* is tied to *country*.

    Checks, in order: the person's first listed address, then every
    education affiliation, then every employment affiliation.
    """
    # NOTE: only the first person address is examined, as in the original
    # script; later addresses are deliberately not considered here.
    first_country = _dig(
        data, 'person', 'addresses', 'address', 0, 'country', 'value')
    if first_country == country:
        return True

    educations = _dig(
        data, 'activities-summary', 'educations', 'education-summary')
    if _any_org_in_country(educations, country):
        return True

    employments = _dig(
        data, 'activities-summary', 'employments', 'employment-summary')
    return _any_org_in_country(employments, country)


def main():
    """Scan the archive and copy every NZ-related JSON record to MATCH_DIR.

    Files that cannot be parsed or written are named in the error log and
    their raw content is copied to ERROR_DIR. Scanning stops at the first
    .xml member, since this script expects all JSON to precede the XML.
    """
    with tarfile.open(TAR_PATH, 'r:gz') as tar, \
            open(ERROR_LOG_PATH, 'w') as errorlog:
        for i, tar_info in enumerate(tar, 1):
            # tarfile remembers every member it has read; resetting the list
            # keeps memory from growing while streaming a very large archive.
            tar.members = []
            filename = tar_info.name
            # can stop when we hit the xml
            if filename.endswith('.xml'):
                break
            if not filename.endswith('.json'):
                continue
            # counter is just to monitor progress
            if i % 1000 == 0:
                print('%d %s' % (i, filename))
            filename = filename.split("/")[-1]

            member = tar.extractfile(tar_info)
            if member is None:
                # Not a regular file (e.g. a directory entry): no content.
                continue
            tarjsonfile = member.read()

            try:
                # Parsing lives inside the try so that a malformed file is
                # logged and set aside instead of aborting the whole run.
                matched = is_nz_record(json.loads(tarjsonfile))
                print(filename + ', ' + str(int(matched)))
                if matched:
                    # Somewhere an NZ affiliation has been found.
                    # Binary mode: the payload is raw bytes from the archive.
                    out_path = os.path.join(MATCH_DIR, filename)
                    with open(out_path, 'wb') as outfile:
                        outfile.write(tarjsonfile)
            except Exception:
                errorlog.write('Problem with: ' + filename + '\n')
                with open(os.path.join(ERROR_DIR, filename), 'wb') as outfile:
                    outfile.write(tarjsonfile)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment